]> git.proxmox.com Git - mirror_qemu.git/blame - block.c
bsd-user: Fix syscall format, add strace support for more syscalls
[mirror_qemu.git] / block.c
CommitLineData
fc01f7e7
FB
1/*
2 * QEMU System Emulator block driver
5fafdf24 3 *
fc01f7e7 4 * Copyright (c) 2003 Fabrice Bellard
5fafdf24 5 *
fc01f7e7
FB
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
3990d09a 24#include "config-host.h"
faf07963 25#include "qemu-common.h"
6d519a5f 26#include "trace.h"
83c9089e 27#include "monitor/monitor.h"
737e150e
PB
28#include "block/block_int.h"
29#include "block/blockjob.h"
1de7afc9 30#include "qemu/module.h"
7b1b5d19 31#include "qapi/qmp/qjson.h"
9c17d615 32#include "sysemu/sysemu.h"
1de7afc9 33#include "qemu/notify.h"
737e150e 34#include "block/coroutine.h"
c13163fb 35#include "block/qapi.h"
b2023818 36#include "qmp-commands.h"
1de7afc9 37#include "qemu/timer.h"
fc01f7e7 38
71e72a19 39#ifdef CONFIG_BSD
7674e7bf
FB
40#include <sys/types.h>
41#include <sys/stat.h>
42#include <sys/ioctl.h>
72cf2d4f 43#include <sys/queue.h>
c5e97233 44#ifndef __DragonFly__
7674e7bf
FB
45#include <sys/disk.h>
46#endif
c5e97233 47#endif
7674e7bf 48
49dc768d
AL
49#ifdef _WIN32
50#include <windows.h>
51#endif
52
e4654d2d
FZ
53struct BdrvDirtyBitmap {
54 HBitmap *bitmap;
55 QLIST_ENTRY(BdrvDirtyBitmap) list;
56};
57
1c9805a3
SH
/* Sentinel stored in a result field until the emulated synchronous operation
 * has produced its real return value (see the wait loop in bdrv_create()). */
58#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
59
/* Forward declarations of file-local (static) helpers. */
7d4b4ba5 60static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
f141eafe
AL
61static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
62 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
c87c0672 63 BlockDriverCompletionFunc *cb, void *opaque);
f141eafe
AL
64static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
65 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
ce1a14dc 66 BlockDriverCompletionFunc *cb, void *opaque);
f9f05dc5
KW
67static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
68 int64_t sector_num, int nb_sectors,
69 QEMUIOVector *iov);
70static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
71 int64_t sector_num, int nb_sectors,
72 QEMUIOVector *iov);
775aa8b6
KW
73static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
74 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
470c0504 75 BdrvRequestFlags flags);
775aa8b6
KW
76static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
77 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
f08f2dda 78 BdrvRequestFlags flags);
b2a61371
SH
79static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
80 int64_t sector_num,
81 QEMUIOVector *qiov,
82 int nb_sectors,
d20d9b7c 83 BdrvRequestFlags flags,
b2a61371
SH
84 BlockDriverCompletionFunc *cb,
85 void *opaque,
8c5873d6 86 bool is_write);
b2a61371 87static void coroutine_fn bdrv_co_do_rw(void *opaque);
621f0589 88static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
aa7bfbff 89 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
ec530c81 90
1b7bdbc1
SH
/* BlockDriverStates created with a non-empty device name; bdrv_new() appends
 * to this list through the device_list linkage. */
91static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
92 QTAILQ_HEAD_INITIALIZER(bdrv_states);
7ee930d0 93
dc364f4c
BC
/* BlockDriverStates that were given a node name; bdrv_assign_node_name()
 * appends to this list through the node_list linkage. */
94static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
95 QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);
96
8a22f02a
SH
/* Every registered BlockDriver; bdrv_register() inserts at the head. */
97static QLIST_HEAD(, BlockDriver) bdrv_drivers =
98 QLIST_HEAD_INITIALIZER(bdrv_drivers);
ea2384d3 99
eb852011
MA
100/* If non-zero, use only whitelisted block drivers */
101static int use_bdrv_whitelist;
102
9e0b22f4
SH
103#ifdef _WIN32
/*
 * Return 1 when filename starts with an ASCII letter immediately followed
 * by ':' (a drive prefix such as "c:"), 0 otherwise.
 */
static int is_windows_drive_prefix(const char *filename)
{
    const char letter = filename[0];
    const int is_lower = letter >= 'a' && letter <= 'z';
    const int is_upper = letter >= 'A' && letter <= 'Z';

    /* filename[1] is only inspected once filename[0] is known to be a letter */
    return (is_lower || is_upper) && filename[1] == ':';
}
110
/*
 * Return 1 when filename names a whole drive: either exactly "<letter>:"
 * or a path beginning with "\\.\" or "//./".  Return 0 otherwise.
 */
int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) && filename[2] == '\0') {
        return 1;
    }

    return (strstart(filename, "\\\\.\\", NULL) ||
            strstart(filename, "//./", NULL)) ? 1 : 0;
}
121#endif
122
0563e191 123/* throttling disk I/O limits */
cc0681c4
BC
124void bdrv_set_io_limits(BlockDriverState *bs,
125 ThrottleConfig *cfg)
98f90dba 126{
cc0681c4 127 int i;
98f90dba 128
cc0681c4 129 throttle_config(&bs->throttle_state, cfg);
98f90dba 130
cc0681c4
BC
131 for (i = 0; i < 2; i++) {
132 qemu_co_enter_next(&bs->throttled_reqs[i]);
98f90dba 133 }
cc0681c4
BC
134}
135
136/* this function drain all the throttled IOs */
137static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
138{
139 bool drained = false;
140 bool enabled = bs->io_limits_enabled;
141 int i;
142
143 bs->io_limits_enabled = false;
144
145 for (i = 0; i < 2; i++) {
146 while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
147 drained = true;
148 }
149 }
150
151 bs->io_limits_enabled = enabled;
98f90dba 152
cc0681c4 153 return drained;
98f90dba
ZYW
154}
155
cc0681c4 156void bdrv_io_limits_disable(BlockDriverState *bs)
0563e191 157{
cc0681c4 158 bs->io_limits_enabled = false;
0563e191 159
cc0681c4
BC
160 bdrv_start_throttled_reqs(bs);
161
162 throttle_destroy(&bs->throttle_state);
0563e191
ZYW
163}
164
cc0681c4 165static void bdrv_throttle_read_timer_cb(void *opaque)
0563e191 166{
cc0681c4
BC
167 BlockDriverState *bs = opaque;
168 qemu_co_enter_next(&bs->throttled_reqs[0]);
0563e191
ZYW
169}
170
cc0681c4 171static void bdrv_throttle_write_timer_cb(void *opaque)
0563e191 172{
cc0681c4
BC
173 BlockDriverState *bs = opaque;
174 qemu_co_enter_next(&bs->throttled_reqs[1]);
0563e191
ZYW
175}
176
cc0681c4
BC
177/* should be called before bdrv_set_io_limits if a limit is set */
178void bdrv_io_limits_enable(BlockDriverState *bs)
179{
180 assert(!bs->io_limits_enabled);
181 throttle_init(&bs->throttle_state,
13af91eb 182 bdrv_get_aio_context(bs),
cc0681c4
BC
183 QEMU_CLOCK_VIRTUAL,
184 bdrv_throttle_read_timer_cb,
185 bdrv_throttle_write_timer_cb,
186 bs);
187 bs->io_limits_enabled = true;
188}
189
190/* This function makes an IO wait if needed
191 *
192 * @nb_sectors: the number of sectors of the IO
193 * @is_write: is the IO a write
194 */
98f90dba 195static void bdrv_io_limits_intercept(BlockDriverState *bs,
d5103588 196 unsigned int bytes,
cc0681c4 197 bool is_write)
98f90dba 198{
cc0681c4
BC
199 /* does this io must wait */
200 bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
98f90dba 201
cc0681c4
BC
202 /* if must wait or any request of this type throttled queue the IO */
203 if (must_wait ||
204 !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
205 qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
98f90dba
ZYW
206 }
207
cc0681c4 208 /* the IO will be executed, do the accounting */
d5103588
KW
209 throttle_account(&bs->throttle_state, is_write, bytes);
210
98f90dba 211
cc0681c4
BC
212 /* if the next request must wait -> do nothing */
213 if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
214 return;
98f90dba
ZYW
215 }
216
cc0681c4
BC
217 /* else queue next request for execution */
218 qemu_co_queue_next(&bs->throttled_reqs[is_write]);
98f90dba
ZYW
219}
220
339064d5
KW
221size_t bdrv_opt_mem_align(BlockDriverState *bs)
222{
223 if (!bs || !bs->drv) {
224 /* 4k should be on the safe side */
225 return 4096;
226 }
227
228 return bs->bl.opt_mem_alignment;
229}
230
9e0b22f4
SH
/*
 * Check whether path starts with "<protocol>:", i.e. whether a ':' occurs
 * before any path separator.  On Windows, drive names and drive prefixes
 * are never treated as protocols, and '\' counts as a separator too.
 *
 * Returns 1 if a protocol prefix is present, 0 otherwise.
 */
static int path_has_protocol(const char *path)
{
#ifdef _WIN32
    static const char stop_chars[] = ":/\\";

    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
#else
    static const char stop_chars[] = ":/";
#endif

    /* first ':' or separator, or the terminating NUL if there is none */
    return path[strcspn(path, stop_chars)] == ':';
}
248
/*
 * Return 1 if path is absolute, 0 otherwise.  A path is absolute when it
 * starts with '/'; on Windows a leading '\', a drive name or a drive prefix
 * also qualifies.
 */
int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    if (path[0] == '\\') {
        return 1;
    }
#endif
    return path[0] == '/';
}
261
83f64091
FB
/*
 * Build in dest (at most dest_size bytes, always NUL-terminated) the path of
 * filename interpreted relative to base_path.
 *
 * An absolute filename is copied unchanged.  Otherwise the part of base_path
 * up to and including its last '/' (or '\' on Windows) or, if that lies
 * further right, its first ':' is kept as prefix and filename is appended
 * to it; this is what makes URL-like base paths work.
 *
 * Nothing is written when dest_size <= 0.
 */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *after_colon, *after_sep, *prefix_end;
    int prefix_len;

    if (dest_size <= 0) {
        return;
    }

    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
        return;
    }

    /* position just past the first ':' (start of string if none) */
    after_colon = strchr(base_path, ':');
    after_colon = after_colon ? after_colon + 1 : base_path;

    /* position just past the last separator (start of string if none) */
    after_sep = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *backslash = strrchr(base_path, '\\');

        if (!after_sep || backslash > after_sep) {
            after_sep = backslash;
        }
    }
#endif
    after_sep = after_sep ? after_sep + 1 : base_path;

    /* the prefix ends at whichever of the two lies further right */
    prefix_end = after_sep > after_colon ? after_sep : after_colon;

    prefix_len = prefix_end - base_path;
    if (prefix_len > dest_size - 1) {
        prefix_len = dest_size - 1;
    }
    memcpy(dest, base_path, prefix_len);
    dest[prefix_len] = '\0';
    pstrcat(dest, dest_size, filename);
}
305
dc5a1371
PB
306void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
307{
308 if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
309 pstrcpy(dest, sz, bs->backing_file);
310 } else {
311 path_combine(dest, sz, bs->filename, bs->backing_file);
312 }
313}
314
5efa9d5a 315void bdrv_register(BlockDriver *bdrv)
ea2384d3 316{
8c5873d6
SH
317 /* Block drivers without coroutine functions need emulation */
318 if (!bdrv->bdrv_co_readv) {
f9f05dc5
KW
319 bdrv->bdrv_co_readv = bdrv_co_readv_em;
320 bdrv->bdrv_co_writev = bdrv_co_writev_em;
321
f8c35c1d
SH
322 /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
323 * the block driver lacks aio we need to emulate that too.
324 */
f9f05dc5
KW
325 if (!bdrv->bdrv_aio_readv) {
326 /* add AIO emulation layer */
327 bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
328 bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
f9f05dc5 329 }
83f64091 330 }
b2e12bc6 331
8a22f02a 332 QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
ea2384d3 333}
b338082b
FB
334
335/* create a new block device (by default it is empty) */
98522f63 336BlockDriverState *bdrv_new(const char *device_name, Error **errp)
b338082b 337{
1b7bdbc1 338 BlockDriverState *bs;
fbe40ff7 339 int i;
b338082b 340
f2d953ec
KW
341 if (bdrv_find(device_name)) {
342 error_setg(errp, "Device with id '%s' already exists",
343 device_name);
344 return NULL;
345 }
346 if (bdrv_find_node(device_name)) {
347 error_setg(errp, "Device with node-name '%s' already exists",
348 device_name);
349 return NULL;
350 }
351
7267c094 352 bs = g_malloc0(sizeof(BlockDriverState));
e4654d2d 353 QLIST_INIT(&bs->dirty_bitmaps);
b338082b 354 pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
ea2384d3 355 if (device_name[0] != '\0') {
dc364f4c 356 QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
ea2384d3 357 }
fbe40ff7
FZ
358 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
359 QLIST_INIT(&bs->op_blockers[i]);
360 }
28a7282a 361 bdrv_iostatus_disable(bs);
d7d512f6 362 notifier_list_init(&bs->close_notifiers);
d616b224 363 notifier_with_return_list_init(&bs->before_write_notifiers);
cc0681c4
BC
364 qemu_co_queue_init(&bs->throttled_reqs[0]);
365 qemu_co_queue_init(&bs->throttled_reqs[1]);
9fcb0251 366 bs->refcnt = 1;
dcd04228 367 bs->aio_context = qemu_get_aio_context();
d7d512f6 368
b338082b
FB
369 return bs;
370}
371
d7d512f6
PB
372void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
373{
374 notifier_list_add(&bs->close_notifiers, notify);
375}
376
ea2384d3
FB
377BlockDriver *bdrv_find_format(const char *format_name)
378{
379 BlockDriver *drv1;
8a22f02a
SH
380 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
381 if (!strcmp(drv1->format_name, format_name)) {
ea2384d3 382 return drv1;
8a22f02a 383 }
ea2384d3
FB
384 }
385 return NULL;
386}
387
b64ec4e4 388static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
eb852011 389{
b64ec4e4
FZ
390 static const char *whitelist_rw[] = {
391 CONFIG_BDRV_RW_WHITELIST
392 };
393 static const char *whitelist_ro[] = {
394 CONFIG_BDRV_RO_WHITELIST
eb852011
MA
395 };
396 const char **p;
397
b64ec4e4 398 if (!whitelist_rw[0] && !whitelist_ro[0]) {
eb852011 399 return 1; /* no whitelist, anything goes */
b64ec4e4 400 }
eb852011 401
b64ec4e4 402 for (p = whitelist_rw; *p; p++) {
eb852011
MA
403 if (!strcmp(drv->format_name, *p)) {
404 return 1;
405 }
406 }
b64ec4e4
FZ
407 if (read_only) {
408 for (p = whitelist_ro; *p; p++) {
409 if (!strcmp(drv->format_name, *p)) {
410 return 1;
411 }
412 }
413 }
eb852011
MA
414 return 0;
415}
416
b64ec4e4
FZ
417BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
418 bool read_only)
eb852011
MA
419{
420 BlockDriver *drv = bdrv_find_format(format_name);
b64ec4e4 421 return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
eb852011
MA
422}
423
5b7e1542
ZYW
424typedef struct CreateCo {
425 BlockDriver *drv;
426 char *filename;
427 QEMUOptionParameter *options;
428 int ret;
cc84d90f 429 Error *err;
5b7e1542
ZYW
430} CreateCo;
431
432static void coroutine_fn bdrv_create_co_entry(void *opaque)
433{
cc84d90f
HR
434 Error *local_err = NULL;
435 int ret;
436
5b7e1542
ZYW
437 CreateCo *cco = opaque;
438 assert(cco->drv);
439
cc84d90f 440 ret = cco->drv->bdrv_create(cco->filename, cco->options, &local_err);
84d18f06 441 if (local_err) {
cc84d90f
HR
442 error_propagate(&cco->err, local_err);
443 }
444 cco->ret = ret;
5b7e1542
ZYW
445}
446
0e7e1989 447int bdrv_create(BlockDriver *drv, const char* filename,
cc84d90f 448 QEMUOptionParameter *options, Error **errp)
ea2384d3 449{
5b7e1542
ZYW
450 int ret;
451
452 Coroutine *co;
453 CreateCo cco = {
454 .drv = drv,
455 .filename = g_strdup(filename),
456 .options = options,
457 .ret = NOT_DONE,
cc84d90f 458 .err = NULL,
5b7e1542
ZYW
459 };
460
461 if (!drv->bdrv_create) {
cc84d90f 462 error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
80168bff
LC
463 ret = -ENOTSUP;
464 goto out;
5b7e1542
ZYW
465 }
466
467 if (qemu_in_coroutine()) {
468 /* Fast-path if already in coroutine context */
469 bdrv_create_co_entry(&cco);
470 } else {
471 co = qemu_coroutine_create(bdrv_create_co_entry);
472 qemu_coroutine_enter(co, &cco);
473 while (cco.ret == NOT_DONE) {
474 qemu_aio_wait();
475 }
476 }
477
478 ret = cco.ret;
cc84d90f 479 if (ret < 0) {
84d18f06 480 if (cco.err) {
cc84d90f
HR
481 error_propagate(errp, cco.err);
482 } else {
483 error_setg_errno(errp, -ret, "Could not create image");
484 }
485 }
0e7e1989 486
80168bff
LC
487out:
488 g_free(cco.filename);
5b7e1542 489 return ret;
ea2384d3
FB
490}
491
cc84d90f
HR
492int bdrv_create_file(const char* filename, QEMUOptionParameter *options,
493 Error **errp)
84a12e66
CH
494{
495 BlockDriver *drv;
cc84d90f
HR
496 Error *local_err = NULL;
497 int ret;
84a12e66 498
98289620 499 drv = bdrv_find_protocol(filename, true);
84a12e66 500 if (drv == NULL) {
cc84d90f 501 error_setg(errp, "Could not find protocol for file '%s'", filename);
16905d71 502 return -ENOENT;
84a12e66
CH
503 }
504
cc84d90f 505 ret = bdrv_create(drv, filename, options, &local_err);
84d18f06 506 if (local_err) {
cc84d90f
HR
507 error_propagate(errp, local_err);
508 }
509 return ret;
84a12e66
CH
510}
511
355ef4ac 512int bdrv_refresh_limits(BlockDriverState *bs)
d34682cd
KW
513{
514 BlockDriver *drv = bs->drv;
515
516 memset(&bs->bl, 0, sizeof(bs->bl));
517
466ad822
KW
518 if (!drv) {
519 return 0;
520 }
521
522 /* Take some limits from the children as a default */
523 if (bs->file) {
524 bdrv_refresh_limits(bs->file);
525 bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
339064d5
KW
526 bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
527 } else {
528 bs->bl.opt_mem_alignment = 512;
466ad822
KW
529 }
530
531 if (bs->backing_hd) {
532 bdrv_refresh_limits(bs->backing_hd);
533 bs->bl.opt_transfer_length =
534 MAX(bs->bl.opt_transfer_length,
535 bs->backing_hd->bl.opt_transfer_length);
339064d5
KW
536 bs->bl.opt_mem_alignment =
537 MAX(bs->bl.opt_mem_alignment,
538 bs->backing_hd->bl.opt_mem_alignment);
466ad822
KW
539 }
540
541 /* Then let the driver override it */
542 if (drv->bdrv_refresh_limits) {
d34682cd
KW
543 return drv->bdrv_refresh_limits(bs);
544 }
545
546 return 0;
547}
548
eba25057
JM
/*
 * Create a uniquely-named empty temporary file.
 *
 * @filename: buffer that receives the name of the created file
 * @size:     size of that buffer in bytes (on Windows at least MAX_PATH)
 *
 * On non-Windows hosts the file is created in $TMPDIR, or in /var/tmp when
 * TMPDIR is not set, and is named "vl.XXXXXX" with the X's replaced by
 * mkstemp().
 *
 * Return 0 upon success, otherwise a negative errno value (-EOVERFLOW when
 * the buffer is too small for the name).
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    int saved_errno;
    const char *tmpdir;

    /* a non-positive size must never reach snprintf(): converted to size_t
     * a negative value would look like an enormous buffer */
    if (size <= 0) {
        return -EOVERFLOW;
    }

    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        /* unlink() may change errno, so remember why close() failed first */
        saved_errno = errno;
        unlink(filename);
        return -saved_errno;
    }
    return 0;
#endif
}
fc01f7e7 584
84a12e66
CH
585/*
586 * Detect host devices. By convention, /dev/cdrom[N] is always
587 * recognized as a host CDROM.
588 */
589static BlockDriver *find_hdev_driver(const char *filename)
590{
591 int score_max = 0, score;
592 BlockDriver *drv = NULL, *d;
593
594 QLIST_FOREACH(d, &bdrv_drivers, list) {
595 if (d->bdrv_probe_device) {
596 score = d->bdrv_probe_device(filename);
597 if (score > score_max) {
598 score_max = score;
599 drv = d;
600 }
601 }
602 }
603
604 return drv;
605}
606
98289620
KW
607BlockDriver *bdrv_find_protocol(const char *filename,
608 bool allow_protocol_prefix)
83f64091
FB
609{
610 BlockDriver *drv1;
611 char protocol[128];
1cec71e3 612 int len;
83f64091 613 const char *p;
19cb3738 614
66f82cee
KW
615 /* TODO Drivers without bdrv_file_open must be specified explicitly */
616
39508e7a
CH
617 /*
618 * XXX(hch): we really should not let host device detection
619 * override an explicit protocol specification, but moving this
620 * later breaks access to device names with colons in them.
621 * Thanks to the brain-dead persistent naming schemes on udev-
622 * based Linux systems those actually are quite common.
623 */
624 drv1 = find_hdev_driver(filename);
625 if (drv1) {
626 return drv1;
627 }
628
98289620 629 if (!path_has_protocol(filename) || !allow_protocol_prefix) {
39508e7a 630 return bdrv_find_format("file");
84a12e66 631 }
98289620 632
9e0b22f4
SH
633 p = strchr(filename, ':');
634 assert(p != NULL);
1cec71e3
AL
635 len = p - filename;
636 if (len > sizeof(protocol) - 1)
637 len = sizeof(protocol) - 1;
638 memcpy(protocol, filename, len);
639 protocol[len] = '\0';
8a22f02a 640 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
5fafdf24 641 if (drv1->protocol_name &&
8a22f02a 642 !strcmp(drv1->protocol_name, protocol)) {
83f64091 643 return drv1;
8a22f02a 644 }
83f64091
FB
645 }
646 return NULL;
647}
648
f500a6d3 649static int find_image_format(BlockDriverState *bs, const char *filename,
34b5d2c6 650 BlockDriver **pdrv, Error **errp)
f3a5d3f8 651{
f500a6d3 652 int score, score_max;
f3a5d3f8
CH
653 BlockDriver *drv1, *drv;
654 uint8_t buf[2048];
f500a6d3 655 int ret = 0;
f8ea0b00 656
08a00559 657 /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
8e895599 658 if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
c98ac35d
SW
659 drv = bdrv_find_format("raw");
660 if (!drv) {
34b5d2c6 661 error_setg(errp, "Could not find raw image format");
c98ac35d
SW
662 ret = -ENOENT;
663 }
664 *pdrv = drv;
665 return ret;
1a396859 666 }
f8ea0b00 667
83f64091 668 ret = bdrv_pread(bs, 0, buf, sizeof(buf));
83f64091 669 if (ret < 0) {
34b5d2c6
HR
670 error_setg_errno(errp, -ret, "Could not read image for determining its "
671 "format");
c98ac35d
SW
672 *pdrv = NULL;
673 return ret;
83f64091
FB
674 }
675
ea2384d3 676 score_max = 0;
84a12e66 677 drv = NULL;
8a22f02a 678 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
83f64091
FB
679 if (drv1->bdrv_probe) {
680 score = drv1->bdrv_probe(buf, ret, filename);
681 if (score > score_max) {
682 score_max = score;
683 drv = drv1;
684 }
0849bf08 685 }
fc01f7e7 686 }
c98ac35d 687 if (!drv) {
34b5d2c6
HR
688 error_setg(errp, "Could not determine image format: No compatible "
689 "driver found");
c98ac35d
SW
690 ret = -ENOENT;
691 }
692 *pdrv = drv;
693 return ret;
ea2384d3
FB
694}
695
51762288
SH
696/**
697 * Set the current 'total_sectors' value
698 */
699static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
700{
701 BlockDriver *drv = bs->drv;
702
396759ad
NB
703 /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
704 if (bs->sg)
705 return 0;
706
51762288
SH
707 /* query actual device if possible, otherwise just trust the hint */
708 if (drv->bdrv_getlength) {
709 int64_t length = drv->bdrv_getlength(bs);
710 if (length < 0) {
711 return length;
712 }
7e382003 713 hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
51762288
SH
714 }
715
716 bs->total_sectors = hint;
717 return 0;
718}
719
9e8f1835
PB
720/**
721 * Set open flags for a given discard mode
722 *
723 * Return 0 on success, -1 if the discard mode was invalid.
724 */
725int bdrv_parse_discard_flags(const char *mode, int *flags)
726{
727 *flags &= ~BDRV_O_UNMAP;
728
729 if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
730 /* do nothing */
731 } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
732 *flags |= BDRV_O_UNMAP;
733 } else {
734 return -1;
735 }
736
737 return 0;
738}
739
c3993cdc
SH
740/**
741 * Set open flags for a given cache mode
742 *
743 * Return 0 on success, -1 if the cache mode was invalid.
744 */
745int bdrv_parse_cache_flags(const char *mode, int *flags)
746{
747 *flags &= ~BDRV_O_CACHE_MASK;
748
749 if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
750 *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
92196b2f
SH
751 } else if (!strcmp(mode, "directsync")) {
752 *flags |= BDRV_O_NOCACHE;
c3993cdc
SH
753 } else if (!strcmp(mode, "writeback")) {
754 *flags |= BDRV_O_CACHE_WB;
755 } else if (!strcmp(mode, "unsafe")) {
756 *flags |= BDRV_O_CACHE_WB;
757 *flags |= BDRV_O_NO_FLUSH;
758 } else if (!strcmp(mode, "writethrough")) {
759 /* this is the default */
760 } else {
761 return -1;
762 }
763
764 return 0;
765}
766
53fec9d3
SH
767/**
768 * The copy-on-read flag is actually a reference count so multiple users may
769 * use the feature without worrying about clobbering its previous state.
770 * Copy-on-read stays enabled until all users have called to disable it.
771 */
772void bdrv_enable_copy_on_read(BlockDriverState *bs)
773{
774 bs->copy_on_read++;
775}
776
777void bdrv_disable_copy_on_read(BlockDriverState *bs)
778{
779 assert(bs->copy_on_read > 0);
780 bs->copy_on_read--;
781}
782
b1e6fc08
KW
783/*
784 * Returns the flags that a temporary snapshot should get, based on the
785 * originally requested flags (the originally requested image will have flags
786 * like a backing file)
787 */
788static int bdrv_temp_snapshot_flags(int flags)
789{
790 return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
791}
792
0b50cc88
KW
793/*
794 * Returns the flags that bs->file should get, based on the given flags for
795 * the parent BDS
796 */
797static int bdrv_inherited_flags(int flags)
798{
799 /* Enable protocol handling, disable format probing for bs->file */
800 flags |= BDRV_O_PROTOCOL;
801
802 /* Our block drivers take care to send flushes and respect unmap policy,
803 * so we can enable both unconditionally on lower layers. */
804 flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;
805
0b50cc88 806 /* Clear flags that only apply to the top layer */
5669b44d 807 flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);
0b50cc88
KW
808
809 return flags;
810}
811
317fc44e
KW
812/*
813 * Returns the flags that bs->backing_hd should get, based on the given flags
814 * for the parent BDS
815 */
816static int bdrv_backing_flags(int flags)
817{
818 /* backing files always opened read-only */
819 flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);
820
821 /* snapshot=on is handled on the top layer */
8bfea15d 822 flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);
317fc44e
KW
823
824 return flags;
825}
826
7b272452
KW
827static int bdrv_open_flags(BlockDriverState *bs, int flags)
828{
829 int open_flags = flags | BDRV_O_CACHE_WB;
830
831 /*
832 * Clear flags that are internal to the block layer before opening the
833 * image.
834 */
835 open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
836
837 /*
838 * Snapshots should be writable.
839 */
8bfea15d 840 if (flags & BDRV_O_TEMPORARY) {
7b272452
KW
841 open_flags |= BDRV_O_RDWR;
842 }
843
844 return open_flags;
845}
846
636ea370
KW
847static void bdrv_assign_node_name(BlockDriverState *bs,
848 const char *node_name,
849 Error **errp)
6913c0c2
BC
850{
851 if (!node_name) {
636ea370 852 return;
6913c0c2
BC
853 }
854
855 /* empty string node name is invalid */
856 if (node_name[0] == '\0') {
857 error_setg(errp, "Empty node name");
636ea370 858 return;
6913c0c2
BC
859 }
860
0c5e94ee
BC
861 /* takes care of avoiding namespaces collisions */
862 if (bdrv_find(node_name)) {
863 error_setg(errp, "node-name=%s is conflicting with a device id",
864 node_name);
636ea370 865 return;
0c5e94ee
BC
866 }
867
6913c0c2
BC
868 /* takes care of avoiding duplicates node names */
869 if (bdrv_find_node(node_name)) {
870 error_setg(errp, "Duplicate node name");
636ea370 871 return;
6913c0c2
BC
872 }
873
874 /* copy node name into the bs and insert it into the graph list */
875 pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
876 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
6913c0c2
BC
877}
878
57915332
KW
879/*
880 * Common part for opening disk images and files
b6ad491a
KW
881 *
882 * Removes all processed options from *options.
57915332 883 */
f500a6d3 884static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
34b5d2c6 885 QDict *options, int flags, BlockDriver *drv, Error **errp)
57915332
KW
886{
887 int ret, open_flags;
035fccdf 888 const char *filename;
6913c0c2 889 const char *node_name = NULL;
34b5d2c6 890 Error *local_err = NULL;
57915332
KW
891
892 assert(drv != NULL);
6405875c 893 assert(bs->file == NULL);
707ff828 894 assert(options != NULL && bs->options != options);
57915332 895
45673671
KW
896 if (file != NULL) {
897 filename = file->filename;
898 } else {
899 filename = qdict_get_try_str(options, "filename");
900 }
901
765003db
KW
902 if (drv->bdrv_needs_filename && !filename) {
903 error_setg(errp, "The '%s' block driver requires a file name",
904 drv->format_name);
905 return -EINVAL;
906 }
907
45673671 908 trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);
28dcee10 909
6913c0c2 910 node_name = qdict_get_try_str(options, "node-name");
636ea370 911 bdrv_assign_node_name(bs, node_name, &local_err);
0fb6395c 912 if (local_err) {
636ea370
KW
913 error_propagate(errp, local_err);
914 return -EINVAL;
6913c0c2
BC
915 }
916 qdict_del(options, "node-name");
917
5d186eb0
KW
918 /* bdrv_open() with directly using a protocol as drv. This layer is already
919 * opened, so assign it to bs (while file becomes a closed BlockDriverState)
920 * and return immediately. */
921 if (file != NULL && drv->bdrv_file_open) {
922 bdrv_swap(file, bs);
923 return 0;
924 }
925
57915332 926 bs->open_flags = flags;
1b7fd729 927 bs->guest_block_size = 512;
c25f53b0 928 bs->request_alignment = 512;
0d51b4de 929 bs->zero_beyond_eof = true;
b64ec4e4
FZ
930 open_flags = bdrv_open_flags(bs, flags);
931 bs->read_only = !(open_flags & BDRV_O_RDWR);
932
933 if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
8f94a6e4
KW
934 error_setg(errp,
935 !bs->read_only && bdrv_is_whitelisted(drv, true)
936 ? "Driver '%s' can only be used for read-only devices"
937 : "Driver '%s' is not whitelisted",
938 drv->format_name);
b64ec4e4
FZ
939 return -ENOTSUP;
940 }
57915332 941
53fec9d3 942 assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
0ebd24e0
KW
943 if (flags & BDRV_O_COPY_ON_READ) {
944 if (!bs->read_only) {
945 bdrv_enable_copy_on_read(bs);
946 } else {
947 error_setg(errp, "Can't use copy-on-read on read-only device");
948 return -EINVAL;
949 }
53fec9d3
SH
950 }
951
c2ad1b0c
KW
952 if (filename != NULL) {
953 pstrcpy(bs->filename, sizeof(bs->filename), filename);
954 } else {
955 bs->filename[0] = '\0';
956 }
57915332 957
57915332 958 bs->drv = drv;
7267c094 959 bs->opaque = g_malloc0(drv->instance_size);
57915332 960
03f541bd 961 bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
e7c63796 962
66f82cee
KW
963 /* Open the image, either directly or using a protocol */
964 if (drv->bdrv_file_open) {
5d186eb0 965 assert(file == NULL);
030be321 966 assert(!drv->bdrv_needs_filename || filename != NULL);
34b5d2c6 967 ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
f500a6d3 968 } else {
2af5ef70 969 if (file == NULL) {
34b5d2c6
HR
970 error_setg(errp, "Can't use '%s' as a block driver for the "
971 "protocol level", drv->format_name);
2af5ef70
KW
972 ret = -EINVAL;
973 goto free_and_fail;
974 }
f500a6d3 975 bs->file = file;
34b5d2c6 976 ret = drv->bdrv_open(bs, options, open_flags, &local_err);
66f82cee
KW
977 }
978
57915332 979 if (ret < 0) {
84d18f06 980 if (local_err) {
34b5d2c6 981 error_propagate(errp, local_err);
2fa9aa59
DH
982 } else if (bs->filename[0]) {
983 error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
34b5d2c6
HR
984 } else {
985 error_setg_errno(errp, -ret, "Could not open image");
986 }
57915332
KW
987 goto free_and_fail;
988 }
989
51762288
SH
990 ret = refresh_total_sectors(bs, bs->total_sectors);
991 if (ret < 0) {
34b5d2c6 992 error_setg_errno(errp, -ret, "Could not refresh total sector count");
51762288 993 goto free_and_fail;
57915332 994 }
51762288 995
d34682cd 996 bdrv_refresh_limits(bs);
c25f53b0 997 assert(bdrv_opt_mem_align(bs) != 0);
47ea2de2 998 assert((bs->request_alignment != 0) || bs->sg);
57915332
KW
999 return 0;
1000
1001free_and_fail:
f500a6d3 1002 bs->file = NULL;
7267c094 1003 g_free(bs->opaque);
57915332
KW
1004 bs->opaque = NULL;
1005 bs->drv = NULL;
1006 return ret;
1007}
1008
b6ce07aa
KW
1009/*
1010 * Opens a file using a protocol (file, host_device, nbd, ...)
787e4a85 1011 *
5acd9d81
HR
1012 * options is an indirect pointer to a QDict of options to pass to the block
1013 * drivers, or pointer to NULL for an empty set of options. If this function
1014 * takes ownership of the QDict reference, it will set *options to NULL;
1015 * otherwise, it will contain unused/unrecognized options after this function
1016 * returns. Then, the caller is responsible for freeing it. If it intends to
1017 * reuse the QDict, QINCREF() should be called beforehand.
b6ce07aa 1018 */
d4446eae 1019static int bdrv_file_open(BlockDriverState *bs, const char *filename,
5acd9d81 1020 QDict **options, int flags, Error **errp)
ea2384d3 1021{
6db95603 1022 BlockDriver *drv;
c2ad1b0c 1023 const char *drvname;
e3fa4bfa 1024 bool parse_filename = false;
34b5d2c6 1025 Error *local_err = NULL;
83f64091
FB
1026 int ret;
1027
035fccdf
KW
1028 /* Fetch the file name from the options QDict if necessary */
1029 if (!filename) {
5acd9d81
HR
1030 filename = qdict_get_try_str(*options, "filename");
1031 } else if (filename && !qdict_haskey(*options, "filename")) {
1032 qdict_put(*options, "filename", qstring_from_str(filename));
e3fa4bfa 1033 parse_filename = true;
035fccdf 1034 } else {
34b5d2c6
HR
1035 error_setg(errp, "Can't specify 'file' and 'filename' options at the "
1036 "same time");
035fccdf
KW
1037 ret = -EINVAL;
1038 goto fail;
1039 }
1040
c2ad1b0c 1041 /* Find the right block driver */
5acd9d81 1042 drvname = qdict_get_try_str(*options, "driver");
c2ad1b0c 1043 if (drvname) {
8f94a6e4 1044 drv = bdrv_find_format(drvname);
34b5d2c6
HR
1045 if (!drv) {
1046 error_setg(errp, "Unknown driver '%s'", drvname);
1047 }
5acd9d81 1048 qdict_del(*options, "driver");
c2ad1b0c 1049 } else if (filename) {
e3fa4bfa 1050 drv = bdrv_find_protocol(filename, parse_filename);
98289620 1051 if (!drv) {
34b5d2c6 1052 error_setg(errp, "Unknown protocol");
98289620 1053 }
c2ad1b0c 1054 } else {
34b5d2c6 1055 error_setg(errp, "Must specify either driver or file");
c2ad1b0c
KW
1056 drv = NULL;
1057 }
1058
1059 if (!drv) {
34b5d2c6 1060 /* errp has been set already */
c2ad1b0c
KW
1061 ret = -ENOENT;
1062 goto fail;
1063 }
1064
1065 /* Parse the filename and open it */
e3fa4bfa 1066 if (drv->bdrv_parse_filename && parse_filename) {
5acd9d81 1067 drv->bdrv_parse_filename(filename, *options, &local_err);
84d18f06 1068 if (local_err) {
34b5d2c6 1069 error_propagate(errp, local_err);
6963a30d
KW
1070 ret = -EINVAL;
1071 goto fail;
1072 }
cd5d031e
HR
1073
1074 if (!drv->bdrv_needs_filename) {
1075 qdict_del(*options, "filename");
1076 } else {
1077 filename = qdict_get_str(*options, "filename");
1078 }
6963a30d
KW
1079 }
1080
505d7583 1081 if (!drv->bdrv_file_open) {
5acd9d81
HR
1082 ret = bdrv_open(&bs, filename, NULL, *options, flags, drv, &local_err);
1083 *options = NULL;
505d7583 1084 } else {
5acd9d81 1085 ret = bdrv_open_common(bs, NULL, *options, flags, drv, &local_err);
505d7583 1086 }
83f64091 1087 if (ret < 0) {
34b5d2c6 1088 error_propagate(errp, local_err);
707ff828
KW
1089 goto fail;
1090 }
1091
71d0770c 1092 bs->growable = 1;
83f64091 1093 return 0;
707ff828
KW
1094
1095fail:
707ff828 1096 return ret;
83f64091
FB
1097}
1098
8d24cce1
FZ
1099void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
1100{
1101
826b6ca0
FZ
1102 if (bs->backing_hd) {
1103 assert(bs->backing_blocker);
1104 bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
1105 } else if (backing_hd) {
1106 error_setg(&bs->backing_blocker,
1107 "device is used as backing hd of '%s'",
1108 bs->device_name);
1109 }
1110
8d24cce1
FZ
1111 bs->backing_hd = backing_hd;
1112 if (!backing_hd) {
826b6ca0
FZ
1113 error_free(bs->backing_blocker);
1114 bs->backing_blocker = NULL;
8d24cce1
FZ
1115 goto out;
1116 }
1117 bs->open_flags &= ~BDRV_O_NO_BACKING;
1118 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
1119 pstrcpy(bs->backing_format, sizeof(bs->backing_format),
1120 backing_hd->drv ? backing_hd->drv->format_name : "");
826b6ca0
FZ
1121
1122 bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
1123 /* Otherwise we won't be able to commit due to check in bdrv_commit */
1124 bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT,
1125 bs->backing_blocker);
8d24cce1
FZ
1126out:
1127 bdrv_refresh_limits(bs);
1128}
1129
31ca6d07
KW
1130/*
1131 * Opens the backing file for a BlockDriverState if not yet open
1132 *
1133 * options is a QDict of options to pass to the block drivers, or NULL for an
1134 * empty set of options. The reference to the QDict is transferred to this
1135 * function (even on failure), so if the caller intends to reuse the dictionary,
1136 * it needs to use QINCREF() before calling bdrv_file_open.
1137 */
34b5d2c6 1138int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
9156df12 1139{
1ba4b6a5 1140 char *backing_filename = g_malloc0(PATH_MAX);
317fc44e 1141 int ret = 0;
9156df12 1142 BlockDriver *back_drv = NULL;
8d24cce1 1143 BlockDriverState *backing_hd;
34b5d2c6 1144 Error *local_err = NULL;
9156df12
PB
1145
1146 if (bs->backing_hd != NULL) {
31ca6d07 1147 QDECREF(options);
1ba4b6a5 1148 goto free_exit;
9156df12
PB
1149 }
1150
31ca6d07
KW
1151 /* NULL means an empty set of options */
1152 if (options == NULL) {
1153 options = qdict_new();
1154 }
1155
9156df12 1156 bs->open_flags &= ~BDRV_O_NO_BACKING;
1cb6f506
KW
1157 if (qdict_haskey(options, "file.filename")) {
1158 backing_filename[0] = '\0';
1159 } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
31ca6d07 1160 QDECREF(options);
1ba4b6a5 1161 goto free_exit;
dbecebdd 1162 } else {
1ba4b6a5 1163 bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX);
9156df12
PB
1164 }
1165
8d24cce1
FZ
1166 backing_hd = bdrv_new("", errp);
1167
9156df12
PB
1168 if (bs->backing_format[0] != '\0') {
1169 back_drv = bdrv_find_format(bs->backing_format);
1170 }
1171
f67503e5 1172 assert(bs->backing_hd == NULL);
8d24cce1 1173 ret = bdrv_open(&backing_hd,
ddf5636d 1174 *backing_filename ? backing_filename : NULL, NULL, options,
317fc44e 1175 bdrv_backing_flags(bs->open_flags), back_drv, &local_err);
9156df12 1176 if (ret < 0) {
8d24cce1
FZ
1177 bdrv_unref(backing_hd);
1178 backing_hd = NULL;
9156df12 1179 bs->open_flags |= BDRV_O_NO_BACKING;
b04b6b6e
FZ
1180 error_setg(errp, "Could not open backing file: %s",
1181 error_get_pretty(local_err));
1182 error_free(local_err);
1ba4b6a5 1183 goto free_exit;
9156df12 1184 }
8d24cce1 1185 bdrv_set_backing_hd(bs, backing_hd);
d80ac658 1186
1ba4b6a5
BC
1187free_exit:
1188 g_free(backing_filename);
1189 return ret;
9156df12
PB
1190}
1191
da557aac
HR
1192/*
1193 * Opens a disk image whose options are given as BlockdevRef in another block
1194 * device's options.
1195 *
da557aac
HR
1196 * If allow_none is true, no image will be opened if filename is false and no
1197 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
1198 *
1199 * bdrev_key specifies the key for the image's BlockdevRef in the options QDict.
1200 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
1201 * itself, all options starting with "${bdref_key}." are considered part of the
1202 * BlockdevRef.
1203 *
1204 * The BlockdevRef will be removed from the options QDict.
f67503e5
HR
1205 *
1206 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
da557aac
HR
1207 */
1208int bdrv_open_image(BlockDriverState **pbs, const char *filename,
1209 QDict *options, const char *bdref_key, int flags,
f7d9fd8c 1210 bool allow_none, Error **errp)
da557aac
HR
1211{
1212 QDict *image_options;
1213 int ret;
1214 char *bdref_key_dot;
1215 const char *reference;
1216
f67503e5
HR
1217 assert(pbs);
1218 assert(*pbs == NULL);
1219
da557aac
HR
1220 bdref_key_dot = g_strdup_printf("%s.", bdref_key);
1221 qdict_extract_subqdict(options, &image_options, bdref_key_dot);
1222 g_free(bdref_key_dot);
1223
1224 reference = qdict_get_try_str(options, bdref_key);
1225 if (!filename && !reference && !qdict_size(image_options)) {
1226 if (allow_none) {
1227 ret = 0;
1228 } else {
1229 error_setg(errp, "A block device must be specified for \"%s\"",
1230 bdref_key);
1231 ret = -EINVAL;
1232 }
b20e61e0 1233 QDECREF(image_options);
da557aac
HR
1234 goto done;
1235 }
1236
f7d9fd8c 1237 ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);
da557aac
HR
1238
1239done:
1240 qdict_del(options, bdref_key);
1241 return ret;
1242}
1243
b1e6fc08 1244void bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
b998875d
KW
1245{
1246 /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
1ba4b6a5 1247 char *tmp_filename = g_malloc0(PATH_MAX + 1);
b998875d
KW
1248 int64_t total_size;
1249 BlockDriver *bdrv_qcow2;
1250 QEMUOptionParameter *create_options;
1251 QDict *snapshot_options;
1252 BlockDriverState *bs_snapshot;
1253 Error *local_err;
1254 int ret;
1255
1256 /* if snapshot, we create a temporary backing file and open it
1257 instead of opening 'filename' directly */
1258
1259 /* Get the required size from the image */
f187743a
KW
1260 total_size = bdrv_getlength(bs);
1261 if (total_size < 0) {
1262 error_setg_errno(errp, -total_size, "Could not get image size");
1ba4b6a5 1263 goto out;
f187743a
KW
1264 }
1265 total_size &= BDRV_SECTOR_MASK;
b998875d
KW
1266
1267 /* Create the temporary image */
1ba4b6a5 1268 ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
b998875d
KW
1269 if (ret < 0) {
1270 error_setg_errno(errp, -ret, "Could not get temporary filename");
1ba4b6a5 1271 goto out;
b998875d
KW
1272 }
1273
1274 bdrv_qcow2 = bdrv_find_format("qcow2");
1275 create_options = parse_option_parameters("", bdrv_qcow2->create_options,
1276 NULL);
1277
1278 set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);
1279
1280 ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options, &local_err);
1281 free_option_parameters(create_options);
1282 if (ret < 0) {
1283 error_setg_errno(errp, -ret, "Could not create temporary overlay "
1284 "'%s': %s", tmp_filename,
1285 error_get_pretty(local_err));
1286 error_free(local_err);
1ba4b6a5 1287 goto out;
b998875d
KW
1288 }
1289
1290 /* Prepare a new options QDict for the temporary file */
1291 snapshot_options = qdict_new();
1292 qdict_put(snapshot_options, "file.driver",
1293 qstring_from_str("file"));
1294 qdict_put(snapshot_options, "file.filename",
1295 qstring_from_str(tmp_filename));
1296
98522f63 1297 bs_snapshot = bdrv_new("", &error_abort);
b998875d
KW
1298
1299 ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
b1e6fc08 1300 flags, bdrv_qcow2, &local_err);
b998875d
KW
1301 if (ret < 0) {
1302 error_propagate(errp, local_err);
1ba4b6a5 1303 goto out;
b998875d
KW
1304 }
1305
1306 bdrv_append(bs_snapshot, bs);
1ba4b6a5
BC
1307
1308out:
1309 g_free(tmp_filename);
b998875d
KW
1310}
1311
4993f7ea
HR
1312static QDict *parse_json_filename(const char *filename, Error **errp)
1313{
1314 QObject *options_obj;
1315 QDict *options;
1316 int ret;
1317
1318 ret = strstart(filename, "json:", &filename);
1319 assert(ret);
1320
1321 options_obj = qobject_from_json(filename);
1322 if (!options_obj) {
1323 error_setg(errp, "Could not parse the JSON options");
1324 return NULL;
1325 }
1326
1327 if (qobject_type(options_obj) != QTYPE_QDICT) {
1328 qobject_decref(options_obj);
1329 error_setg(errp, "Invalid JSON object given");
1330 return NULL;
1331 }
1332
1333 options = qobject_to_qdict(options_obj);
1334 qdict_flatten(options);
1335
1336 return options;
1337}
1338
b6ce07aa
KW
1339/*
1340 * Opens a disk image (raw, qcow2, vmdk, ...)
de9c0cec
KW
1341 *
1342 * options is a QDict of options to pass to the block drivers, or NULL for an
1343 * empty set of options. The reference to the QDict belongs to the block layer
1344 * after the call (even on failure), so if the caller intends to reuse the
1345 * dictionary, it needs to use QINCREF() before calling bdrv_open.
f67503e5
HR
1346 *
1347 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
1348 * If it is not NULL, the referenced BDS will be reused.
ddf5636d
HR
1349 *
1350 * The reference parameter may be used to specify an existing block device which
1351 * should be opened. If specified, neither options nor a filename may be given,
1352 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
b6ce07aa 1353 */
ddf5636d
HR
1354int bdrv_open(BlockDriverState **pbs, const char *filename,
1355 const char *reference, QDict *options, int flags,
1356 BlockDriver *drv, Error **errp)
ea2384d3 1357{
b6ce07aa 1358 int ret;
f67503e5 1359 BlockDriverState *file = NULL, *bs;
74fe54f2 1360 const char *drvname;
34b5d2c6 1361 Error *local_err = NULL;
b1e6fc08 1362 int snapshot_flags = 0;
712e7874 1363
f67503e5
HR
1364 assert(pbs);
1365
ddf5636d
HR
1366 if (reference) {
1367 bool options_non_empty = options ? qdict_size(options) : false;
1368 QDECREF(options);
1369
1370 if (*pbs) {
1371 error_setg(errp, "Cannot reuse an existing BDS when referencing "
1372 "another block device");
1373 return -EINVAL;
1374 }
1375
1376 if (filename || options_non_empty) {
1377 error_setg(errp, "Cannot reference an existing block device with "
1378 "additional options or a new filename");
1379 return -EINVAL;
1380 }
1381
1382 bs = bdrv_lookup_bs(reference, reference, errp);
1383 if (!bs) {
1384 return -ENODEV;
1385 }
1386 bdrv_ref(bs);
1387 *pbs = bs;
1388 return 0;
1389 }
1390
f67503e5
HR
1391 if (*pbs) {
1392 bs = *pbs;
1393 } else {
98522f63 1394 bs = bdrv_new("", &error_abort);
f67503e5
HR
1395 }
1396
de9c0cec
KW
1397 /* NULL means an empty set of options */
1398 if (options == NULL) {
1399 options = qdict_new();
1400 }
1401
4993f7ea
HR
1402 if (filename && g_str_has_prefix(filename, "json:")) {
1403 QDict *json_options = parse_json_filename(filename, &local_err);
1404 if (local_err) {
1405 ret = -EINVAL;
1406 goto fail;
1407 }
1408
1409 /* Options given in the filename have lower priority than options
1410 * specified directly */
1411 qdict_join(options, json_options, false);
1412 QDECREF(json_options);
1413 filename = NULL;
1414 }
1415
de9c0cec 1416 bs->options = options;
b6ad491a 1417 options = qdict_clone_shallow(options);
de9c0cec 1418
5469a2a6
HR
1419 if (flags & BDRV_O_PROTOCOL) {
1420 assert(!drv);
5acd9d81 1421 ret = bdrv_file_open(bs, filename, &options, flags & ~BDRV_O_PROTOCOL,
5469a2a6 1422 &local_err);
5469a2a6 1423 if (!ret) {
eb909c7f 1424 drv = bs->drv;
5acd9d81 1425 goto done;
5469a2a6
HR
1426 } else if (bs->drv) {
1427 goto close_and_fail;
1428 } else {
1429 goto fail;
1430 }
1431 }
1432
f500a6d3
KW
1433 /* Open image file without format layer */
1434 if (flags & BDRV_O_RDWR) {
1435 flags |= BDRV_O_ALLOW_RDWR;
1436 }
b1e6fc08
KW
1437 if (flags & BDRV_O_SNAPSHOT) {
1438 snapshot_flags = bdrv_temp_snapshot_flags(flags);
1439 flags = bdrv_backing_flags(flags);
1440 }
f500a6d3 1441
f67503e5 1442 assert(file == NULL);
054963f8 1443 ret = bdrv_open_image(&file, filename, options, "file",
0b50cc88
KW
1444 bdrv_inherited_flags(flags),
1445 true, &local_err);
054963f8 1446 if (ret < 0) {
8bfea15d 1447 goto fail;
f500a6d3
KW
1448 }
1449
b6ce07aa 1450 /* Find the right image format driver */
74fe54f2
KW
1451 drvname = qdict_get_try_str(options, "driver");
1452 if (drvname) {
8f94a6e4 1453 drv = bdrv_find_format(drvname);
74fe54f2 1454 qdict_del(options, "driver");
06d22aa3
KW
1455 if (!drv) {
1456 error_setg(errp, "Invalid driver: '%s'", drvname);
1457 ret = -EINVAL;
8bfea15d 1458 goto fail;
06d22aa3 1459 }
74fe54f2
KW
1460 }
1461
6db95603 1462 if (!drv) {
2a05cbe4
HR
1463 if (file) {
1464 ret = find_image_format(file, filename, &drv, &local_err);
1465 } else {
1466 error_setg(errp, "Must specify either driver or file");
1467 ret = -EINVAL;
8bfea15d 1468 goto fail;
2a05cbe4 1469 }
51d7c00c 1470 }
6987307c 1471
51d7c00c 1472 if (!drv) {
8bfea15d 1473 goto fail;
ea2384d3 1474 }
b6ce07aa
KW
1475
1476 /* Open the image */
34b5d2c6 1477 ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
b6ce07aa 1478 if (ret < 0) {
8bfea15d 1479 goto fail;
6987307c
CH
1480 }
1481
2a05cbe4 1482 if (file && (bs->file != file)) {
4f6fd349 1483 bdrv_unref(file);
f500a6d3
KW
1484 file = NULL;
1485 }
1486
b6ce07aa 1487 /* If there is a backing file, use it */
9156df12 1488 if ((flags & BDRV_O_NO_BACKING) == 0) {
31ca6d07
KW
1489 QDict *backing_options;
1490
5726d872 1491 qdict_extract_subqdict(options, &backing_options, "backing.");
34b5d2c6 1492 ret = bdrv_open_backing_file(bs, backing_options, &local_err);
b6ce07aa 1493 if (ret < 0) {
b6ad491a 1494 goto close_and_fail;
b6ce07aa 1495 }
b6ce07aa
KW
1496 }
1497
b998875d
KW
1498 /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
1499 * temporary snapshot afterwards. */
b1e6fc08
KW
1500 if (snapshot_flags) {
1501 bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
b998875d
KW
1502 if (local_err) {
1503 error_propagate(errp, local_err);
1504 goto close_and_fail;
1505 }
1506 }
1507
1508
5acd9d81 1509done:
b6ad491a 1510 /* Check if any unknown options were used */
5acd9d81 1511 if (options && (qdict_size(options) != 0)) {
b6ad491a 1512 const QDictEntry *entry = qdict_first(options);
5acd9d81
HR
1513 if (flags & BDRV_O_PROTOCOL) {
1514 error_setg(errp, "Block protocol '%s' doesn't support the option "
1515 "'%s'", drv->format_name, entry->key);
1516 } else {
1517 error_setg(errp, "Block format '%s' used by device '%s' doesn't "
1518 "support the option '%s'", drv->format_name,
1519 bs->device_name, entry->key);
1520 }
b6ad491a
KW
1521
1522 ret = -EINVAL;
1523 goto close_and_fail;
1524 }
b6ad491a 1525
b6ce07aa 1526 if (!bdrv_key_required(bs)) {
7d4b4ba5 1527 bdrv_dev_change_media_cb(bs, true);
c3adb58f
MA
1528 } else if (!runstate_check(RUN_STATE_PRELAUNCH)
1529 && !runstate_check(RUN_STATE_INMIGRATE)
1530 && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
1531 error_setg(errp,
1532 "Guest must be stopped for opening of encrypted image");
1533 ret = -EBUSY;
1534 goto close_and_fail;
b6ce07aa
KW
1535 }
1536
c3adb58f 1537 QDECREF(options);
f67503e5 1538 *pbs = bs;
b6ce07aa
KW
1539 return 0;
1540
8bfea15d 1541fail:
f500a6d3 1542 if (file != NULL) {
4f6fd349 1543 bdrv_unref(file);
f500a6d3 1544 }
de9c0cec 1545 QDECREF(bs->options);
b6ad491a 1546 QDECREF(options);
de9c0cec 1547 bs->options = NULL;
f67503e5
HR
1548 if (!*pbs) {
1549 /* If *pbs is NULL, a new BDS has been created in this function and
1550 needs to be freed now. Otherwise, it does not need to be closed,
1551 since it has not really been opened yet. */
1552 bdrv_unref(bs);
1553 }
84d18f06 1554 if (local_err) {
34b5d2c6
HR
1555 error_propagate(errp, local_err);
1556 }
b6ad491a 1557 return ret;
de9c0cec 1558
b6ad491a 1559close_and_fail:
f67503e5
HR
1560 /* See fail path, but now the BDS has to be always closed */
1561 if (*pbs) {
1562 bdrv_close(bs);
1563 } else {
1564 bdrv_unref(bs);
1565 }
b6ad491a 1566 QDECREF(options);
84d18f06 1567 if (local_err) {
34b5d2c6
HR
1568 error_propagate(errp, local_err);
1569 }
b6ce07aa
KW
1570 return ret;
1571}
1572
e971aa12
JC
1573typedef struct BlockReopenQueueEntry {
1574 bool prepared;
1575 BDRVReopenState state;
1576 QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
1577} BlockReopenQueueEntry;
1578
1579/*
1580 * Adds a BlockDriverState to a simple queue for an atomic, transactional
1581 * reopen of multiple devices.
1582 *
1583 * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLE_INIT
1584 * already performed, or alternatively may be NULL a new BlockReopenQueue will
1585 * be created and initialized. This newly created BlockReopenQueue should be
1586 * passed back in for subsequent calls that are intended to be of the same
1587 * atomic 'set'.
1588 *
1589 * bs is the BlockDriverState to add to the reopen queue.
1590 *
1591 * flags contains the open flags for the associated bs
1592 *
1593 * returns a pointer to bs_queue, which is either the newly allocated
1594 * bs_queue, or the existing bs_queue being used.
1595 *
1596 */
1597BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
1598 BlockDriverState *bs, int flags)
1599{
1600 assert(bs != NULL);
1601
1602 BlockReopenQueueEntry *bs_entry;
1603 if (bs_queue == NULL) {
1604 bs_queue = g_new0(BlockReopenQueue, 1);
1605 QSIMPLEQ_INIT(bs_queue);
1606 }
1607
f1f25a2e
KW
1608 /* bdrv_open() masks this flag out */
1609 flags &= ~BDRV_O_PROTOCOL;
1610
e971aa12 1611 if (bs->file) {
f1f25a2e 1612 bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
e971aa12
JC
1613 }
1614
1615 bs_entry = g_new0(BlockReopenQueueEntry, 1);
1616 QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
1617
1618 bs_entry->state.bs = bs;
1619 bs_entry->state.flags = flags;
1620
1621 return bs_queue;
1622}
1623
1624/*
1625 * Reopen multiple BlockDriverStates atomically & transactionally.
1626 *
1627 * The queue passed in (bs_queue) must have been built up previous
1628 * via bdrv_reopen_queue().
1629 *
1630 * Reopens all BDS specified in the queue, with the appropriate
1631 * flags. All devices are prepared for reopen, and failure of any
1632 * device will cause all device changes to be abandonded, and intermediate
1633 * data cleaned up.
1634 *
1635 * If all devices prepare successfully, then the changes are committed
1636 * to all devices.
1637 *
1638 */
1639int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
1640{
1641 int ret = -1;
1642 BlockReopenQueueEntry *bs_entry, *next;
1643 Error *local_err = NULL;
1644
1645 assert(bs_queue != NULL);
1646
1647 bdrv_drain_all();
1648
1649 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1650 if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
1651 error_propagate(errp, local_err);
1652 goto cleanup;
1653 }
1654 bs_entry->prepared = true;
1655 }
1656
1657 /* If we reach this point, we have success and just need to apply the
1658 * changes
1659 */
1660 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1661 bdrv_reopen_commit(&bs_entry->state);
1662 }
1663
1664 ret = 0;
1665
1666cleanup:
1667 QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
1668 if (ret && bs_entry->prepared) {
1669 bdrv_reopen_abort(&bs_entry->state);
1670 }
1671 g_free(bs_entry);
1672 }
1673 g_free(bs_queue);
1674 return ret;
1675}
1676
1677
1678/* Reopen a single BlockDriverState with the specified flags. */
1679int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1680{
1681 int ret = -1;
1682 Error *local_err = NULL;
1683 BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1684
1685 ret = bdrv_reopen_multiple(queue, &local_err);
1686 if (local_err != NULL) {
1687 error_propagate(errp, local_err);
1688 }
1689 return ret;
1690}
1691
1692
1693/*
1694 * Prepares a BlockDriverState for reopen. All changes are staged in the
1695 * 'opaque' field of the BDRVReopenState, which is used and allocated by
1696 * the block driver layer .bdrv_reopen_prepare()
1697 *
1698 * bs is the BlockDriverState to reopen
1699 * flags are the new open flags
1700 * queue is the reopen queue
1701 *
1702 * Returns 0 on success, non-zero on error. On error errp will be set
1703 * as well.
1704 *
1705 * On failure, bdrv_reopen_abort() will be called to clean up any data.
1706 * It is the responsibility of the caller to then call the abort() or
1707 * commit() for any other BDS that have been left in a prepare() state
1708 *
1709 */
1710int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1711 Error **errp)
1712{
1713 int ret = -1;
1714 Error *local_err = NULL;
1715 BlockDriver *drv;
1716
1717 assert(reopen_state != NULL);
1718 assert(reopen_state->bs->drv != NULL);
1719 drv = reopen_state->bs->drv;
1720
1721 /* if we are to stay read-only, do not allow permission change
1722 * to r/w */
1723 if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1724 reopen_state->flags & BDRV_O_RDWR) {
1725 error_set(errp, QERR_DEVICE_IS_READ_ONLY,
1726 reopen_state->bs->device_name);
1727 goto error;
1728 }
1729
1730
1731 ret = bdrv_flush(reopen_state->bs);
1732 if (ret) {
1733 error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1734 strerror(-ret));
1735 goto error;
1736 }
1737
1738 if (drv->bdrv_reopen_prepare) {
1739 ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1740 if (ret) {
1741 if (local_err != NULL) {
1742 error_propagate(errp, local_err);
1743 } else {
d8b6895f
LC
1744 error_setg(errp, "failed while preparing to reopen image '%s'",
1745 reopen_state->bs->filename);
e971aa12
JC
1746 }
1747 goto error;
1748 }
1749 } else {
1750 /* It is currently mandatory to have a bdrv_reopen_prepare()
1751 * handler for each supported drv. */
1752 error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
1753 drv->format_name, reopen_state->bs->device_name,
1754 "reopening of file");
1755 ret = -1;
1756 goto error;
1757 }
1758
1759 ret = 0;
1760
1761error:
1762 return ret;
1763}
1764
1765/*
1766 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1767 * makes them final by swapping the staging BlockDriverState contents into
1768 * the active BlockDriverState contents.
1769 */
1770void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1771{
1772 BlockDriver *drv;
1773
1774 assert(reopen_state != NULL);
1775 drv = reopen_state->bs->drv;
1776 assert(drv != NULL);
1777
1778 /* If there are any driver level actions to take */
1779 if (drv->bdrv_reopen_commit) {
1780 drv->bdrv_reopen_commit(reopen_state);
1781 }
1782
1783 /* set BDS specific flags now */
1784 reopen_state->bs->open_flags = reopen_state->flags;
1785 reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1786 BDRV_O_CACHE_WB);
1787 reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
355ef4ac
KW
1788
1789 bdrv_refresh_limits(reopen_state->bs);
e971aa12
JC
1790}
1791
1792/*
1793 * Abort the reopen, and delete and free the staged changes in
1794 * reopen_state
1795 */
1796void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1797{
1798 BlockDriver *drv;
1799
1800 assert(reopen_state != NULL);
1801 drv = reopen_state->bs->drv;
1802 assert(drv != NULL);
1803
1804 if (drv->bdrv_reopen_abort) {
1805 drv->bdrv_reopen_abort(reopen_state);
1806 }
1807}
1808
1809
fc01f7e7
FB
1810void bdrv_close(BlockDriverState *bs)
1811{
3cbc002c
PB
1812 if (bs->job) {
1813 block_job_cancel_sync(bs->job);
1814 }
58fda173
SH
1815 bdrv_drain_all(); /* complete I/O */
1816 bdrv_flush(bs);
1817 bdrv_drain_all(); /* in case flush left pending I/O */
d7d512f6 1818 notifier_list_notify(&bs->close_notifiers, bs);
7094f12f 1819
3cbc002c 1820 if (bs->drv) {
557df6ac 1821 if (bs->backing_hd) {
826b6ca0
FZ
1822 BlockDriverState *backing_hd = bs->backing_hd;
1823 bdrv_set_backing_hd(bs, NULL);
1824 bdrv_unref(backing_hd);
557df6ac 1825 }
ea2384d3 1826 bs->drv->bdrv_close(bs);
7267c094 1827 g_free(bs->opaque);
ea2384d3
FB
1828 bs->opaque = NULL;
1829 bs->drv = NULL;
53fec9d3 1830 bs->copy_on_read = 0;
a275fa42
PB
1831 bs->backing_file[0] = '\0';
1832 bs->backing_format[0] = '\0';
6405875c
PB
1833 bs->total_sectors = 0;
1834 bs->encrypted = 0;
1835 bs->valid_key = 0;
1836 bs->sg = 0;
1837 bs->growable = 0;
0d51b4de 1838 bs->zero_beyond_eof = false;
de9c0cec
KW
1839 QDECREF(bs->options);
1840 bs->options = NULL;
b338082b 1841
66f82cee 1842 if (bs->file != NULL) {
4f6fd349 1843 bdrv_unref(bs->file);
0ac9377d 1844 bs->file = NULL;
66f82cee 1845 }
b338082b 1846 }
98f90dba 1847
9ca11154
PH
1848 bdrv_dev_change_media_cb(bs, false);
1849
98f90dba
ZYW
1850 /*throttling disk I/O limits*/
1851 if (bs->io_limits_enabled) {
1852 bdrv_io_limits_disable(bs);
1853 }
b338082b
FB
1854}
1855
2bc93fed
MK
1856void bdrv_close_all(void)
1857{
1858 BlockDriverState *bs;
1859
dc364f4c 1860 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
ed78cda3
SH
1861 AioContext *aio_context = bdrv_get_aio_context(bs);
1862
1863 aio_context_acquire(aio_context);
2bc93fed 1864 bdrv_close(bs);
ed78cda3 1865 aio_context_release(aio_context);
2bc93fed
MK
1866 }
1867}
1868
88266f5a
SH
1869/* Check if any requests are in-flight (including throttled requests) */
1870static bool bdrv_requests_pending(BlockDriverState *bs)
1871{
1872 if (!QLIST_EMPTY(&bs->tracked_requests)) {
1873 return true;
1874 }
cc0681c4
BC
1875 if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1876 return true;
1877 }
1878 if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
88266f5a
SH
1879 return true;
1880 }
1881 if (bs->file && bdrv_requests_pending(bs->file)) {
1882 return true;
1883 }
1884 if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1885 return true;
1886 }
1887 return false;
1888}
1889
922453bc
SH
1890/*
1891 * Wait for pending requests to complete across all BlockDriverStates
1892 *
1893 * This function does not flush data to disk, use bdrv_flush_all() for that
1894 * after calling this function.
4c355d53
ZYW
1895 *
1896 * Note that completion of an asynchronous I/O operation can trigger any
1897 * number of other I/O operations on other devices---for example a coroutine
1898 * can be arbitrarily complex and a constant flow of I/O can come until the
1899 * coroutine is complete. Because of this, it is not possible to have a
1900 * function to drain a single device's I/O queue.
922453bc
SH
1901 */
1902void bdrv_drain_all(void)
1903{
88266f5a
SH
1904 /* Always run first iteration so any pending completion BHs run */
1905 bool busy = true;
922453bc
SH
1906 BlockDriverState *bs;
1907
88266f5a 1908 while (busy) {
9b536adc
SH
1909 busy = false;
1910
dc364f4c 1911 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
9b536adc
SH
1912 AioContext *aio_context = bdrv_get_aio_context(bs);
1913 bool bs_busy;
1914
1915 aio_context_acquire(aio_context);
0b06ef3b 1916 bdrv_start_throttled_reqs(bs);
9b536adc
SH
1917 bs_busy = bdrv_requests_pending(bs);
1918 bs_busy |= aio_poll(aio_context, bs_busy);
1919 aio_context_release(aio_context);
922453bc 1920
9b536adc
SH
1921 busy |= bs_busy;
1922 }
922453bc
SH
1923 }
1924}
1925
dc364f4c
BC
1926/* make a BlockDriverState anonymous by removing from bdrv_state and
1927 * graph_bdrv_state list.
d22b2f41
RH
1928 Also, NULL terminate the device_name to prevent double remove */
1929void bdrv_make_anon(BlockDriverState *bs)
1930{
1931 if (bs->device_name[0] != '\0') {
dc364f4c 1932 QTAILQ_REMOVE(&bdrv_states, bs, device_list);
d22b2f41
RH
1933 }
1934 bs->device_name[0] = '\0';
dc364f4c
BC
1935 if (bs->node_name[0] != '\0') {
1936 QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
1937 }
1938 bs->node_name[0] = '\0';
d22b2f41
RH
1939}
1940
e023b2e2
PB
1941static void bdrv_rebind(BlockDriverState *bs)
1942{
1943 if (bs->drv && bs->drv->bdrv_rebind) {
1944 bs->drv->bdrv_rebind(bs);
1945 }
1946}
1947
4ddc07ca
PB
1948static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
1949 BlockDriverState *bs_src)
8802d1fd 1950{
4ddc07ca 1951 /* move some fields that need to stay attached to the device */
8802d1fd
JC
1952
1953 /* dev info */
4ddc07ca
PB
1954 bs_dest->dev_ops = bs_src->dev_ops;
1955 bs_dest->dev_opaque = bs_src->dev_opaque;
1956 bs_dest->dev = bs_src->dev;
1b7fd729 1957 bs_dest->guest_block_size = bs_src->guest_block_size;
4ddc07ca 1958 bs_dest->copy_on_read = bs_src->copy_on_read;
8802d1fd 1959
4ddc07ca 1960 bs_dest->enable_write_cache = bs_src->enable_write_cache;
c4a248a1 1961
cc0681c4
BC
1962 /* i/o throttled req */
1963 memcpy(&bs_dest->throttle_state,
1964 &bs_src->throttle_state,
1965 sizeof(ThrottleState));
1966 bs_dest->throttled_reqs[0] = bs_src->throttled_reqs[0];
1967 bs_dest->throttled_reqs[1] = bs_src->throttled_reqs[1];
4ddc07ca 1968 bs_dest->io_limits_enabled = bs_src->io_limits_enabled;
8802d1fd 1969
8802d1fd 1970 /* r/w error */
4ddc07ca
PB
1971 bs_dest->on_read_error = bs_src->on_read_error;
1972 bs_dest->on_write_error = bs_src->on_write_error;
8802d1fd
JC
1973
1974 /* i/o status */
4ddc07ca
PB
1975 bs_dest->iostatus_enabled = bs_src->iostatus_enabled;
1976 bs_dest->iostatus = bs_src->iostatus;
8802d1fd 1977
a9fc4408 1978 /* dirty bitmap */
e4654d2d 1979 bs_dest->dirty_bitmaps = bs_src->dirty_bitmaps;
a9fc4408 1980
9fcb0251
FZ
1981 /* reference count */
1982 bs_dest->refcnt = bs_src->refcnt;
1983
a9fc4408 1984 /* job */
4ddc07ca 1985 bs_dest->job = bs_src->job;
a9fc4408 1986
8802d1fd 1987 /* keep the same entry in bdrv_states */
4ddc07ca
PB
1988 pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
1989 bs_src->device_name);
dc364f4c 1990 bs_dest->device_list = bs_src->device_list;
fbe40ff7
FZ
1991 memcpy(bs_dest->op_blockers, bs_src->op_blockers,
1992 sizeof(bs_dest->op_blockers));
4ddc07ca 1993}
8802d1fd 1994
4ddc07ca
PB
1995/*
1996 * Swap bs contents for two image chains while they are live,
1997 * while keeping required fields on the BlockDriverState that is
1998 * actually attached to a device.
1999 *
2000 * This will modify the BlockDriverState fields, and swap contents
2001 * between bs_new and bs_old. Both bs_new and bs_old are modified.
2002 *
2003 * bs_new is required to be anonymous.
2004 *
2005 * This function does not create any image files.
2006 */
2007void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
2008{
2009 BlockDriverState tmp;
f6801b83 2010
90ce8a06
BC
2011 /* The code needs to swap the node_name but simply swapping node_list won't
2012 * work so first remove the nodes from the graph list, do the swap then
2013 * insert them back if needed.
2014 */
2015 if (bs_new->node_name[0] != '\0') {
2016 QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
2017 }
2018 if (bs_old->node_name[0] != '\0') {
2019 QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
2020 }
2021
4ddc07ca
PB
2022 /* bs_new must be anonymous and shouldn't have anything fancy enabled */
2023 assert(bs_new->device_name[0] == '\0');
e4654d2d 2024 assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
4ddc07ca
PB
2025 assert(bs_new->job == NULL);
2026 assert(bs_new->dev == NULL);
4ddc07ca 2027 assert(bs_new->io_limits_enabled == false);
cc0681c4 2028 assert(!throttle_have_timer(&bs_new->throttle_state));
8802d1fd 2029
4ddc07ca
PB
2030 tmp = *bs_new;
2031 *bs_new = *bs_old;
2032 *bs_old = tmp;
a9fc4408 2033
4ddc07ca
PB
2034 /* there are some fields that should not be swapped, move them back */
2035 bdrv_move_feature_fields(&tmp, bs_old);
2036 bdrv_move_feature_fields(bs_old, bs_new);
2037 bdrv_move_feature_fields(bs_new, &tmp);
8802d1fd 2038
4ddc07ca
PB
2039 /* bs_new shouldn't be in bdrv_states even after the swap! */
2040 assert(bs_new->device_name[0] == '\0');
2041
2042 /* Check a few fields that should remain attached to the device */
2043 assert(bs_new->dev == NULL);
2044 assert(bs_new->job == NULL);
4ddc07ca 2045 assert(bs_new->io_limits_enabled == false);
cc0681c4 2046 assert(!throttle_have_timer(&bs_new->throttle_state));
e023b2e2 2047
90ce8a06
BC
2048 /* insert the nodes back into the graph node list if needed */
2049 if (bs_new->node_name[0] != '\0') {
2050 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
2051 }
2052 if (bs_old->node_name[0] != '\0') {
2053 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
2054 }
2055
e023b2e2 2056 bdrv_rebind(bs_new);
4ddc07ca
PB
2057 bdrv_rebind(bs_old);
2058}
2059
2060/*
2061 * Add new bs contents at the top of an image chain while the chain is
2062 * live, while keeping required fields on the top layer.
2063 *
2064 * This will modify the BlockDriverState fields, and swap contents
2065 * between bs_new and bs_top. Both bs_new and bs_top are modified.
2066 *
2067 * bs_new is required to be anonymous.
2068 *
2069 * This function does not create any image files.
2070 */
2071void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
2072{
2073 bdrv_swap(bs_new, bs_top);
2074
2075 /* The contents of 'tmp' will become bs_top, as we are
2076 * swapping bs_new and bs_top contents. */
8d24cce1 2077 bdrv_set_backing_hd(bs_top, bs_new);
8802d1fd
JC
2078}
2079
4f6fd349 2080static void bdrv_delete(BlockDriverState *bs)
b338082b 2081{
fa879d62 2082 assert(!bs->dev);
3e914655 2083 assert(!bs->job);
3718d8ab 2084 assert(bdrv_op_blocker_is_empty(bs));
4f6fd349 2085 assert(!bs->refcnt);
e4654d2d 2086 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
18846dee 2087
e1b5c52e
SH
2088 bdrv_close(bs);
2089
1b7bdbc1 2090 /* remove from list, if necessary */
d22b2f41 2091 bdrv_make_anon(bs);
34c6f050 2092
7267c094 2093 g_free(bs);
fc01f7e7
FB
2094}
2095
fa879d62
MA
2096int bdrv_attach_dev(BlockDriverState *bs, void *dev)
2097/* TODO change to DeviceState *dev when all users are qdevified */
18846dee 2098{
fa879d62 2099 if (bs->dev) {
18846dee
MA
2100 return -EBUSY;
2101 }
fa879d62 2102 bs->dev = dev;
28a7282a 2103 bdrv_iostatus_reset(bs);
18846dee
MA
2104 return 0;
2105}
2106
fa879d62
MA
2107/* TODO qdevified devices don't use this, remove when devices are qdevified */
2108void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
18846dee 2109{
fa879d62
MA
2110 if (bdrv_attach_dev(bs, dev) < 0) {
2111 abort();
2112 }
2113}
2114
2115void bdrv_detach_dev(BlockDriverState *bs, void *dev)
2116/* TODO change to DeviceState *dev when all users are qdevified */
2117{
2118 assert(bs->dev == dev);
2119 bs->dev = NULL;
0e49de52
MA
2120 bs->dev_ops = NULL;
2121 bs->dev_opaque = NULL;
1b7fd729 2122 bs->guest_block_size = 512;
18846dee
MA
2123}
2124
fa879d62
MA
2125/* TODO change to return DeviceState * when all users are qdevified */
2126void *bdrv_get_attached_dev(BlockDriverState *bs)
18846dee 2127{
fa879d62 2128 return bs->dev;
18846dee
MA
2129}
2130
0e49de52
MA
2131void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
2132 void *opaque)
2133{
2134 bs->dev_ops = ops;
2135 bs->dev_opaque = opaque;
2136}
2137
32c81a4a
PB
2138void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
2139 enum MonitorEvent ev,
2140 BlockErrorAction action, bool is_read)
329c0a48
LC
2141{
2142 QObject *data;
2143 const char *action_str;
2144
2145 switch (action) {
2146 case BDRV_ACTION_REPORT:
2147 action_str = "report";
2148 break;
2149 case BDRV_ACTION_IGNORE:
2150 action_str = "ignore";
2151 break;
2152 case BDRV_ACTION_STOP:
2153 action_str = "stop";
2154 break;
2155 default:
2156 abort();
2157 }
2158
2159 data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
2160 bdrv->device_name,
2161 action_str,
2162 is_read ? "read" : "write");
32c81a4a 2163 monitor_protocol_event(ev, data);
329c0a48
LC
2164
2165 qobject_decref(data);
2166}
2167
6f382ed2
LC
2168static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
2169{
2170 QObject *data;
2171
2172 data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
2173 bdrv_get_device_name(bs), ejected);
2174 monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
2175
2176 qobject_decref(data);
2177}
2178
7d4b4ba5 2179static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
0e49de52 2180{
145feb17 2181 if (bs->dev_ops && bs->dev_ops->change_media_cb) {
6f382ed2 2182 bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
7d4b4ba5 2183 bs->dev_ops->change_media_cb(bs->dev_opaque, load);
6f382ed2
LC
2184 if (tray_was_closed) {
2185 /* tray open */
2186 bdrv_emit_qmp_eject_event(bs, true);
2187 }
2188 if (load) {
2189 /* tray close */
2190 bdrv_emit_qmp_eject_event(bs, false);
2191 }
145feb17
MA
2192 }
2193}
2194
2c6942fa
MA
2195bool bdrv_dev_has_removable_media(BlockDriverState *bs)
2196{
2197 return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
2198}
2199
025ccaa7
PB
2200void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
2201{
2202 if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
2203 bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
2204 }
2205}
2206
e4def80b
MA
2207bool bdrv_dev_is_tray_open(BlockDriverState *bs)
2208{
2209 if (bs->dev_ops && bs->dev_ops->is_tray_open) {
2210 return bs->dev_ops->is_tray_open(bs->dev_opaque);
2211 }
2212 return false;
2213}
2214
145feb17
MA
2215static void bdrv_dev_resize_cb(BlockDriverState *bs)
2216{
2217 if (bs->dev_ops && bs->dev_ops->resize_cb) {
2218 bs->dev_ops->resize_cb(bs->dev_opaque);
0e49de52
MA
2219 }
2220}
2221
f107639a
MA
2222bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
2223{
2224 if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
2225 return bs->dev_ops->is_medium_locked(bs->dev_opaque);
2226 }
2227 return false;
2228}
2229
e97fc193
AL
2230/*
2231 * Run consistency checks on an image
2232 *
e076f338 2233 * Returns 0 if the check could be completed (it doesn't mean that the image is
a1c7273b 2234 * free of errors) or -errno when an internal error occurred. The results of the
e076f338 2235 * check are stored in res.
e97fc193 2236 */
4534ff54 2237int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
e97fc193
AL
2238{
2239 if (bs->drv->bdrv_check == NULL) {
2240 return -ENOTSUP;
2241 }
2242
e076f338 2243 memset(res, 0, sizeof(*res));
4534ff54 2244 return bs->drv->bdrv_check(bs, res, fix);
e97fc193
AL
2245}
2246
8a426614
KW
2247#define COMMIT_BUF_SECTORS 2048
2248
33e3963e
FB
2249/* commit COW file into the raw image */
2250int bdrv_commit(BlockDriverState *bs)
2251{
19cb3738 2252 BlockDriver *drv = bs->drv;
72706ea4 2253 int64_t sector, total_sectors, length, backing_length;
8a426614 2254 int n, ro, open_flags;
0bce597d 2255 int ret = 0;
72706ea4 2256 uint8_t *buf = NULL;
c2cba3d9 2257 char filename[PATH_MAX];
33e3963e 2258
19cb3738
FB
2259 if (!drv)
2260 return -ENOMEDIUM;
4dca4b63
NS
2261
2262 if (!bs->backing_hd) {
2263 return -ENOTSUP;
33e3963e
FB
2264 }
2265
3718d8ab
FZ
2266 if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT, NULL) ||
2267 bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT, NULL)) {
2d3735d3
SH
2268 return -EBUSY;
2269 }
2270
4dca4b63 2271 ro = bs->backing_hd->read_only;
c2cba3d9
JM
2272 /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2273 pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
4dca4b63
NS
2274 open_flags = bs->backing_hd->open_flags;
2275
2276 if (ro) {
0bce597d
JC
2277 if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2278 return -EACCES;
4dca4b63 2279 }
ea2384d3 2280 }
33e3963e 2281
72706ea4
JC
2282 length = bdrv_getlength(bs);
2283 if (length < 0) {
2284 ret = length;
2285 goto ro_cleanup;
2286 }
2287
2288 backing_length = bdrv_getlength(bs->backing_hd);
2289 if (backing_length < 0) {
2290 ret = backing_length;
2291 goto ro_cleanup;
2292 }
2293
2294 /* If our top snapshot is larger than the backing file image,
2295 * grow the backing file image if possible. If not possible,
2296 * we must return an error */
2297 if (length > backing_length) {
2298 ret = bdrv_truncate(bs->backing_hd, length);
2299 if (ret < 0) {
2300 goto ro_cleanup;
2301 }
2302 }
2303
2304 total_sectors = length >> BDRV_SECTOR_BITS;
7267c094 2305 buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
8a426614
KW
2306
2307 for (sector = 0; sector < total_sectors; sector += n) {
d663640c
PB
2308 ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2309 if (ret < 0) {
2310 goto ro_cleanup;
2311 }
2312 if (ret) {
dabfa6cc
KW
2313 ret = bdrv_read(bs, sector, buf, n);
2314 if (ret < 0) {
8a426614
KW
2315 goto ro_cleanup;
2316 }
2317
dabfa6cc
KW
2318 ret = bdrv_write(bs->backing_hd, sector, buf, n);
2319 if (ret < 0) {
8a426614
KW
2320 goto ro_cleanup;
2321 }
ea2384d3 2322 }
33e3963e 2323 }
95389c86 2324
1d44952f
CH
2325 if (drv->bdrv_make_empty) {
2326 ret = drv->bdrv_make_empty(bs);
dabfa6cc
KW
2327 if (ret < 0) {
2328 goto ro_cleanup;
2329 }
1d44952f
CH
2330 bdrv_flush(bs);
2331 }
95389c86 2332
3f5075ae
CH
2333 /*
2334 * Make sure all data we wrote to the backing device is actually
2335 * stable on disk.
2336 */
dabfa6cc 2337 if (bs->backing_hd) {
3f5075ae 2338 bdrv_flush(bs->backing_hd);
dabfa6cc 2339 }
4dca4b63 2340
dabfa6cc 2341 ret = 0;
4dca4b63 2342ro_cleanup:
7267c094 2343 g_free(buf);
4dca4b63
NS
2344
2345 if (ro) {
0bce597d
JC
2346 /* ignoring error return here */
2347 bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
4dca4b63
NS
2348 }
2349
1d44952f 2350 return ret;
33e3963e
FB
2351}
2352
e8877497 2353int bdrv_commit_all(void)
6ab4b5ab
MA
2354{
2355 BlockDriverState *bs;
2356
dc364f4c 2357 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
ed78cda3
SH
2358 AioContext *aio_context = bdrv_get_aio_context(bs);
2359
2360 aio_context_acquire(aio_context);
272d2d8e
JC
2361 if (bs->drv && bs->backing_hd) {
2362 int ret = bdrv_commit(bs);
2363 if (ret < 0) {
ed78cda3 2364 aio_context_release(aio_context);
272d2d8e
JC
2365 return ret;
2366 }
e8877497 2367 }
ed78cda3 2368 aio_context_release(aio_context);
6ab4b5ab 2369 }
e8877497 2370 return 0;
6ab4b5ab
MA
2371}
2372
dbffbdcf
SH
2373/**
2374 * Remove an active request from the tracked requests list
2375 *
2376 * This function should be called when a tracked request is completing.
2377 */
2378static void tracked_request_end(BdrvTrackedRequest *req)
2379{
2dbafdc0
KW
2380 if (req->serialising) {
2381 req->bs->serialising_in_flight--;
2382 }
2383
dbffbdcf 2384 QLIST_REMOVE(req, list);
f4658285 2385 qemu_co_queue_restart_all(&req->wait_queue);
dbffbdcf
SH
2386}
2387
2388/**
2389 * Add an active request to the tracked requests list
2390 */
2391static void tracked_request_begin(BdrvTrackedRequest *req,
2392 BlockDriverState *bs,
793ed47a
KW
2393 int64_t offset,
2394 unsigned int bytes, bool is_write)
dbffbdcf
SH
2395{
2396 *req = (BdrvTrackedRequest){
2397 .bs = bs,
2dbafdc0
KW
2398 .offset = offset,
2399 .bytes = bytes,
2400 .is_write = is_write,
2401 .co = qemu_coroutine_self(),
2402 .serialising = false,
7327145f
KW
2403 .overlap_offset = offset,
2404 .overlap_bytes = bytes,
dbffbdcf
SH
2405 };
2406
f4658285
SH
2407 qemu_co_queue_init(&req->wait_queue);
2408
dbffbdcf
SH
2409 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2410}
2411
e96126ff 2412static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2dbafdc0 2413{
7327145f 2414 int64_t overlap_offset = req->offset & ~(align - 1);
e96126ff
KW
2415 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2416 - overlap_offset;
7327145f 2417
2dbafdc0
KW
2418 if (!req->serialising) {
2419 req->bs->serialising_in_flight++;
2420 req->serialising = true;
2421 }
7327145f
KW
2422
2423 req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2424 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
2dbafdc0
KW
2425}
2426
d83947ac
SH
2427/**
2428 * Round a region to cluster boundaries
2429 */
343bded4
PB
2430void bdrv_round_to_clusters(BlockDriverState *bs,
2431 int64_t sector_num, int nb_sectors,
2432 int64_t *cluster_sector_num,
2433 int *cluster_nb_sectors)
d83947ac
SH
2434{
2435 BlockDriverInfo bdi;
2436
2437 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2438 *cluster_sector_num = sector_num;
2439 *cluster_nb_sectors = nb_sectors;
2440 } else {
2441 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2442 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2443 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2444 nb_sectors, c);
2445 }
2446}
2447
7327145f 2448static int bdrv_get_cluster_size(BlockDriverState *bs)
793ed47a
KW
2449{
2450 BlockDriverInfo bdi;
7327145f 2451 int ret;
793ed47a 2452
7327145f
KW
2453 ret = bdrv_get_info(bs, &bdi);
2454 if (ret < 0 || bdi.cluster_size == 0) {
2455 return bs->request_alignment;
793ed47a 2456 } else {
7327145f 2457 return bdi.cluster_size;
793ed47a
KW
2458 }
2459}
2460
f4658285 2461static bool tracked_request_overlaps(BdrvTrackedRequest *req,
793ed47a
KW
2462 int64_t offset, unsigned int bytes)
2463{
d83947ac 2464 /* aaaa bbbb */
7327145f 2465 if (offset >= req->overlap_offset + req->overlap_bytes) {
d83947ac
SH
2466 return false;
2467 }
2468 /* bbbb aaaa */
7327145f 2469 if (req->overlap_offset >= offset + bytes) {
d83947ac
SH
2470 return false;
2471 }
2472 return true;
f4658285
SH
2473}
2474
28de2dcd 2475static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
f4658285 2476{
2dbafdc0 2477 BlockDriverState *bs = self->bs;
f4658285
SH
2478 BdrvTrackedRequest *req;
2479 bool retry;
28de2dcd 2480 bool waited = false;
f4658285 2481
2dbafdc0 2482 if (!bs->serialising_in_flight) {
28de2dcd 2483 return false;
2dbafdc0
KW
2484 }
2485
f4658285
SH
2486 do {
2487 retry = false;
2488 QLIST_FOREACH(req, &bs->tracked_requests, list) {
2dbafdc0 2489 if (req == self || (!req->serialising && !self->serialising)) {
65afd211
KW
2490 continue;
2491 }
7327145f
KW
2492 if (tracked_request_overlaps(req, self->overlap_offset,
2493 self->overlap_bytes))
2494 {
5f8b6491
SH
2495 /* Hitting this means there was a reentrant request, for
2496 * example, a block driver issuing nested requests. This must
2497 * never happen since it means deadlock.
2498 */
2499 assert(qemu_coroutine_self() != req->co);
2500
6460440f
KW
2501 /* If the request is already (indirectly) waiting for us, or
2502 * will wait for us as soon as it wakes up, then just go on
2503 * (instead of producing a deadlock in the former case). */
2504 if (!req->waiting_for) {
2505 self->waiting_for = req;
2506 qemu_co_queue_wait(&req->wait_queue);
2507 self->waiting_for = NULL;
2508 retry = true;
28de2dcd 2509 waited = true;
6460440f
KW
2510 break;
2511 }
f4658285
SH
2512 }
2513 }
2514 } while (retry);
28de2dcd
KW
2515
2516 return waited;
f4658285
SH
2517}
2518
756e6736
KW
2519/*
2520 * Return values:
2521 * 0 - success
2522 * -EINVAL - backing format specified, but no file
2523 * -ENOSPC - can't update the backing file because no space is left in the
2524 * image file header
2525 * -ENOTSUP - format driver doesn't support changing the backing file
2526 */
2527int bdrv_change_backing_file(BlockDriverState *bs,
2528 const char *backing_file, const char *backing_fmt)
2529{
2530 BlockDriver *drv = bs->drv;
469ef350 2531 int ret;
756e6736 2532
5f377794
PB
2533 /* Backing file format doesn't make sense without a backing file */
2534 if (backing_fmt && !backing_file) {
2535 return -EINVAL;
2536 }
2537
756e6736 2538 if (drv->bdrv_change_backing_file != NULL) {
469ef350 2539 ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
756e6736 2540 } else {
469ef350 2541 ret = -ENOTSUP;
756e6736 2542 }
469ef350
PB
2543
2544 if (ret == 0) {
2545 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2546 pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2547 }
2548 return ret;
756e6736
KW
2549}
2550
6ebdcee2
JC
2551/*
2552 * Finds the image layer in the chain that has 'bs' as its backing file.
2553 *
2554 * active is the current topmost image.
2555 *
2556 * Returns NULL if bs is not found in active's image chain,
2557 * or if active == bs.
2558 */
2559BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2560 BlockDriverState *bs)
2561{
2562 BlockDriverState *overlay = NULL;
2563 BlockDriverState *intermediate;
2564
2565 assert(active != NULL);
2566 assert(bs != NULL);
2567
2568 /* if bs is the same as active, then by definition it has no overlay
2569 */
2570 if (active == bs) {
2571 return NULL;
2572 }
2573
2574 intermediate = active;
2575 while (intermediate->backing_hd) {
2576 if (intermediate->backing_hd == bs) {
2577 overlay = intermediate;
2578 break;
2579 }
2580 intermediate = intermediate->backing_hd;
2581 }
2582
2583 return overlay;
2584}
2585
2586typedef struct BlkIntermediateStates {
2587 BlockDriverState *bs;
2588 QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2589} BlkIntermediateStates;
2590
2591
2592/*
2593 * Drops images above 'base' up to and including 'top', and sets the image
2594 * above 'top' to have base as its backing file.
2595 *
2596 * Requires that the overlay to 'top' is opened r/w, so that the backing file
2597 * information in 'bs' can be properly updated.
2598 *
2599 * E.g., this will convert the following chain:
2600 * bottom <- base <- intermediate <- top <- active
2601 *
2602 * to
2603 *
2604 * bottom <- base <- active
2605 *
2606 * It is allowed for bottom==base, in which case it converts:
2607 *
2608 * base <- intermediate <- top <- active
2609 *
2610 * to
2611 *
2612 * base <- active
2613 *
2614 * Error conditions:
2615 * if active == top, that is considered an error
2616 *
2617 */
2618int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2619 BlockDriverState *base)
2620{
2621 BlockDriverState *intermediate;
2622 BlockDriverState *base_bs = NULL;
2623 BlockDriverState *new_top_bs = NULL;
2624 BlkIntermediateStates *intermediate_state, *next;
2625 int ret = -EIO;
2626
2627 QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2628 QSIMPLEQ_INIT(&states_to_delete);
2629
2630 if (!top->drv || !base->drv) {
2631 goto exit;
2632 }
2633
2634 new_top_bs = bdrv_find_overlay(active, top);
2635
2636 if (new_top_bs == NULL) {
2637 /* we could not find the image above 'top', this is an error */
2638 goto exit;
2639 }
2640
2641 /* special case of new_top_bs->backing_hd already pointing to base - nothing
2642 * to do, no intermediate images */
2643 if (new_top_bs->backing_hd == base) {
2644 ret = 0;
2645 goto exit;
2646 }
2647
2648 intermediate = top;
2649
2650 /* now we will go down through the list, and add each BDS we find
2651 * into our deletion queue, until we hit the 'base'
2652 */
2653 while (intermediate) {
2654 intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
2655 intermediate_state->bs = intermediate;
2656 QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2657
2658 if (intermediate->backing_hd == base) {
2659 base_bs = intermediate->backing_hd;
2660 break;
2661 }
2662 intermediate = intermediate->backing_hd;
2663 }
2664 if (base_bs == NULL) {
2665 /* something went wrong, we did not end at the base. safely
2666 * unravel everything, and exit with error */
2667 goto exit;
2668 }
2669
2670 /* success - we can delete the intermediate states, and link top->base */
2671 ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
2672 base_bs->drv ? base_bs->drv->format_name : "");
2673 if (ret) {
2674 goto exit;
2675 }
920beae1 2676 bdrv_set_backing_hd(new_top_bs, base_bs);
6ebdcee2
JC
2677
2678 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2679 /* so that bdrv_close() does not recursively close the chain */
920beae1 2680 bdrv_set_backing_hd(intermediate_state->bs, NULL);
4f6fd349 2681 bdrv_unref(intermediate_state->bs);
6ebdcee2
JC
2682 }
2683 ret = 0;
2684
2685exit:
2686 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2687 g_free(intermediate_state);
2688 }
2689 return ret;
2690}
2691
2692
71d0770c
AL
2693static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2694 size_t size)
2695{
2696 int64_t len;
2697
1dd3a447
KW
2698 if (size > INT_MAX) {
2699 return -EIO;
2700 }
2701
71d0770c
AL
2702 if (!bdrv_is_inserted(bs))
2703 return -ENOMEDIUM;
2704
2705 if (bs->growable)
2706 return 0;
2707
2708 len = bdrv_getlength(bs);
2709
fbb7b4e0
KW
2710 if (offset < 0)
2711 return -EIO;
2712
2713 if ((offset > len) || (len - offset < size))
71d0770c
AL
2714 return -EIO;
2715
2716 return 0;
2717}
2718
2719static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2720 int nb_sectors)
2721{
54db38a4 2722 if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
8f4754ed
KW
2723 return -EIO;
2724 }
2725
eb5a3165
JS
2726 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2727 nb_sectors * BDRV_SECTOR_SIZE);
71d0770c
AL
2728}
2729
1c9805a3
SH
2730typedef struct RwCo {
2731 BlockDriverState *bs;
775aa8b6 2732 int64_t offset;
1c9805a3
SH
2733 QEMUIOVector *qiov;
2734 bool is_write;
2735 int ret;
4105eaaa 2736 BdrvRequestFlags flags;
1c9805a3
SH
2737} RwCo;
2738
2739static void coroutine_fn bdrv_rw_co_entry(void *opaque)
fc01f7e7 2740{
1c9805a3 2741 RwCo *rwco = opaque;
ea2384d3 2742
1c9805a3 2743 if (!rwco->is_write) {
775aa8b6
KW
2744 rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2745 rwco->qiov->size, rwco->qiov,
4105eaaa 2746 rwco->flags);
775aa8b6
KW
2747 } else {
2748 rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2749 rwco->qiov->size, rwco->qiov,
2750 rwco->flags);
1c9805a3
SH
2751 }
2752}
e7a8a783 2753
1c9805a3 2754/*
8d3b1a2d 2755 * Process a vectored synchronous request using coroutines
1c9805a3 2756 */
775aa8b6
KW
2757static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2758 QEMUIOVector *qiov, bool is_write,
2759 BdrvRequestFlags flags)
1c9805a3 2760{
1c9805a3
SH
2761 Coroutine *co;
2762 RwCo rwco = {
2763 .bs = bs,
775aa8b6 2764 .offset = offset,
8d3b1a2d 2765 .qiov = qiov,
1c9805a3
SH
2766 .is_write = is_write,
2767 .ret = NOT_DONE,
4105eaaa 2768 .flags = flags,
1c9805a3 2769 };
e7a8a783 2770
498e386c
ZYW
2771 /**
2772 * In sync call context, when the vcpu is blocked, this throttling timer
2773 * will not fire; so the I/O throttling function has to be disabled here
2774 * if it has been enabled.
2775 */
2776 if (bs->io_limits_enabled) {
2777 fprintf(stderr, "Disabling I/O throttling on '%s' due "
2778 "to synchronous I/O.\n", bdrv_get_device_name(bs));
2779 bdrv_io_limits_disable(bs);
2780 }
2781
1c9805a3
SH
2782 if (qemu_in_coroutine()) {
2783 /* Fast-path if already in coroutine context */
2784 bdrv_rw_co_entry(&rwco);
2785 } else {
2572b37a
SH
2786 AioContext *aio_context = bdrv_get_aio_context(bs);
2787
1c9805a3
SH
2788 co = qemu_coroutine_create(bdrv_rw_co_entry);
2789 qemu_coroutine_enter(co, &rwco);
2790 while (rwco.ret == NOT_DONE) {
2572b37a 2791 aio_poll(aio_context, true);
1c9805a3
SH
2792 }
2793 }
2794 return rwco.ret;
2795}
b338082b 2796
8d3b1a2d
KW
2797/*
2798 * Process a synchronous request using coroutines
2799 */
2800static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
4105eaaa 2801 int nb_sectors, bool is_write, BdrvRequestFlags flags)
8d3b1a2d
KW
2802{
2803 QEMUIOVector qiov;
2804 struct iovec iov = {
2805 .iov_base = (void *)buf,
2806 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2807 };
2808
da15ee51
KW
2809 if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2810 return -EINVAL;
2811 }
2812
8d3b1a2d 2813 qemu_iovec_init_external(&qiov, &iov, 1);
775aa8b6
KW
2814 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2815 &qiov, is_write, flags);
8d3b1a2d
KW
2816}
2817
1c9805a3
SH
2818/* return < 0 if error. See bdrv_write() for the return codes */
2819int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2820 uint8_t *buf, int nb_sectors)
2821{
4105eaaa 2822 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
fc01f7e7
FB
2823}
2824
07d27a44
MA
2825/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2826int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2827 uint8_t *buf, int nb_sectors)
2828{
2829 bool enabled;
2830 int ret;
2831
2832 enabled = bs->io_limits_enabled;
2833 bs->io_limits_enabled = false;
4e7395e8 2834 ret = bdrv_read(bs, sector_num, buf, nb_sectors);
07d27a44
MA
2835 bs->io_limits_enabled = enabled;
2836 return ret;
2837}
2838
5fafdf24 2839/* Return < 0 if error. Important errors are:
19cb3738
FB
2840 -EIO generic I/O error (may happen for all errors)
2841 -ENOMEDIUM No media inserted.
2842 -EINVAL Invalid sector number or nb_sectors
2843 -EACCES Trying to write a read-only device
2844*/
5fafdf24 2845int bdrv_write(BlockDriverState *bs, int64_t sector_num,
fc01f7e7
FB
2846 const uint8_t *buf, int nb_sectors)
2847{
4105eaaa 2848 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
83f64091
FB
2849}
2850
aa7bfbff
PL
2851int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2852 int nb_sectors, BdrvRequestFlags flags)
4105eaaa
PL
2853{
2854 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
aa7bfbff 2855 BDRV_REQ_ZERO_WRITE | flags);
8d3b1a2d
KW
2856}
2857
d75cbb5e
PL
2858/*
2859 * Completely zero out a block device with the help of bdrv_write_zeroes.
2860 * The operation is sped up by checking the block status and only writing
2861 * zeroes to the device if they currently do not return zeroes. Optional
2862 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2863 *
2864 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2865 */
2866int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2867{
9ce10c0b 2868 int64_t target_size;
d75cbb5e
PL
2869 int64_t ret, nb_sectors, sector_num = 0;
2870 int n;
2871
9ce10c0b
KW
2872 target_size = bdrv_getlength(bs);
2873 if (target_size < 0) {
2874 return target_size;
2875 }
2876 target_size /= BDRV_SECTOR_SIZE;
2877
d75cbb5e
PL
2878 for (;;) {
2879 nb_sectors = target_size - sector_num;
2880 if (nb_sectors <= 0) {
2881 return 0;
2882 }
2883 if (nb_sectors > INT_MAX) {
2884 nb_sectors = INT_MAX;
2885 }
2886 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
3d94ce60
PL
2887 if (ret < 0) {
2888 error_report("error getting block status at sector %" PRId64 ": %s",
2889 sector_num, strerror(-ret));
2890 return ret;
2891 }
d75cbb5e
PL
2892 if (ret & BDRV_BLOCK_ZERO) {
2893 sector_num += n;
2894 continue;
2895 }
2896 ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2897 if (ret < 0) {
2898 error_report("error writing zeroes at sector %" PRId64 ": %s",
2899 sector_num, strerror(-ret));
2900 return ret;
2901 }
2902 sector_num += n;
2903 }
2904}
2905
a3ef6571 2906int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
83f64091 2907{
a3ef6571
KW
2908 QEMUIOVector qiov;
2909 struct iovec iov = {
2910 .iov_base = (void *)buf,
2911 .iov_len = bytes,
2912 };
9a8c4cce 2913 int ret;
83f64091 2914
a3ef6571
KW
2915 if (bytes < 0) {
2916 return -EINVAL;
83f64091
FB
2917 }
2918
a3ef6571
KW
2919 qemu_iovec_init_external(&qiov, &iov, 1);
2920 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2921 if (ret < 0) {
2922 return ret;
83f64091 2923 }
a3ef6571
KW
2924
2925 return bytes;
83f64091
FB
2926}
2927
8d3b1a2d 2928int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
83f64091 2929{
9a8c4cce 2930 int ret;
83f64091 2931
8407d5d7
KW
2932 ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2933 if (ret < 0) {
2934 return ret;
83f64091
FB
2935 }
2936
8d3b1a2d
KW
2937 return qiov->size;
2938}
2939
2940int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
8407d5d7 2941 const void *buf, int bytes)
8d3b1a2d
KW
2942{
2943 QEMUIOVector qiov;
2944 struct iovec iov = {
2945 .iov_base = (void *) buf,
8407d5d7 2946 .iov_len = bytes,
8d3b1a2d
KW
2947 };
2948
8407d5d7
KW
2949 if (bytes < 0) {
2950 return -EINVAL;
2951 }
2952
8d3b1a2d
KW
2953 qemu_iovec_init_external(&qiov, &iov, 1);
2954 return bdrv_pwritev(bs, offset, &qiov);
83f64091 2955}
83f64091 2956
f08145fe
KW
2957/*
2958 * Writes to the file and ensures that no writes are reordered across this
2959 * request (acts as a barrier)
2960 *
2961 * Returns 0 on success, -errno in error cases.
2962 */
2963int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2964 const void *buf, int count)
2965{
2966 int ret;
2967
2968 ret = bdrv_pwrite(bs, offset, buf, count);
2969 if (ret < 0) {
2970 return ret;
2971 }
2972
f05fa4ad
PB
2973 /* No flush needed for cache modes that already do it */
2974 if (bs->enable_write_cache) {
f08145fe
KW
2975 bdrv_flush(bs);
2976 }
2977
2978 return 0;
2979}
2980
470c0504 2981static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
ab185921
SH
2982 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2983{
2984 /* Perform I/O through a temporary buffer so that users who scribble over
2985 * their read buffer while the operation is in progress do not end up
2986 * modifying the image file. This is critical for zero-copy guest I/O
2987 * where anything might happen inside guest memory.
2988 */
2989 void *bounce_buffer;
2990
79c053bd 2991 BlockDriver *drv = bs->drv;
ab185921
SH
2992 struct iovec iov;
2993 QEMUIOVector bounce_qiov;
2994 int64_t cluster_sector_num;
2995 int cluster_nb_sectors;
2996 size_t skip_bytes;
2997 int ret;
2998
2999 /* Cover entire cluster so no additional backing file I/O is required when
3000 * allocating cluster in the image file.
3001 */
343bded4
PB
3002 bdrv_round_to_clusters(bs, sector_num, nb_sectors,
3003 &cluster_sector_num, &cluster_nb_sectors);
ab185921 3004
470c0504
SH
3005 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
3006 cluster_sector_num, cluster_nb_sectors);
ab185921
SH
3007
3008 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
3009 iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
3010 qemu_iovec_init_external(&bounce_qiov, &iov, 1);
3011
79c053bd
SH
3012 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
3013 &bounce_qiov);
ab185921
SH
3014 if (ret < 0) {
3015 goto err;
3016 }
3017
79c053bd
SH
3018 if (drv->bdrv_co_write_zeroes &&
3019 buffer_is_zero(bounce_buffer, iov.iov_len)) {
621f0589 3020 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
aa7bfbff 3021 cluster_nb_sectors, 0);
79c053bd 3022 } else {
f05fa4ad
PB
3023 /* This does not change the data on the disk, it is not necessary
3024 * to flush even in cache=writethrough mode.
3025 */
79c053bd 3026 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
ab185921 3027 &bounce_qiov);
79c053bd
SH
3028 }
3029
ab185921
SH
3030 if (ret < 0) {
3031 /* It might be okay to ignore write errors for guest requests. If this
3032 * is a deliberate copy-on-read then we don't want to ignore the error.
3033 * Simply report it in all cases.
3034 */
3035 goto err;
3036 }
3037
3038 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
03396148
MT
3039 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
3040 nb_sectors * BDRV_SECTOR_SIZE);
ab185921
SH
3041
3042err:
3043 qemu_vfree(bounce_buffer);
3044 return ret;
3045}
3046
c5fbe571 3047/*
d0c7f642
KW
3048 * Forwards an already correctly aligned request to the BlockDriver. This
3049 * handles copy on read and zeroing after EOF; any other features must be
3050 * implemented by the caller.
c5fbe571 3051 */
d0c7f642 3052static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
65afd211 3053 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
ec746e10 3054 int64_t align, QEMUIOVector *qiov, int flags)
da1fa91d
KW
3055{
3056 BlockDriver *drv = bs->drv;
dbffbdcf 3057 int ret;
da1fa91d 3058
d0c7f642
KW
3059 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3060 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
da1fa91d 3061
d0c7f642
KW
3062 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3063 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3064
3065 /* Handle Copy on Read and associated serialisation */
470c0504 3066 if (flags & BDRV_REQ_COPY_ON_READ) {
7327145f
KW
3067 /* If we touch the same cluster it counts as an overlap. This
3068 * guarantees that allocating writes will be serialized and not race
3069 * with each other for the same cluster. For example, in copy-on-read
3070 * it ensures that the CoR read and write operations are atomic and
3071 * guest writes cannot interleave between them. */
3072 mark_request_serialising(req, bdrv_get_cluster_size(bs));
470c0504
SH
3073 }
3074
2dbafdc0 3075 wait_serialising_requests(req);
f4658285 3076
470c0504 3077 if (flags & BDRV_REQ_COPY_ON_READ) {
ab185921
SH
3078 int pnum;
3079
bdad13b9 3080 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
ab185921
SH
3081 if (ret < 0) {
3082 goto out;
3083 }
3084
3085 if (!ret || pnum != nb_sectors) {
470c0504 3086 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
ab185921
SH
3087 goto out;
3088 }
3089 }
3090
d0c7f642 3091 /* Forward the request to the BlockDriver */
893a8f62
MK
3092 if (!(bs->zero_beyond_eof && bs->growable)) {
3093 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3094 } else {
3095 /* Read zeros after EOF of growable BDSes */
3096 int64_t len, total_sectors, max_nb_sectors;
3097
3098 len = bdrv_getlength(bs);
3099 if (len < 0) {
3100 ret = len;
3101 goto out;
3102 }
3103
d055a1fe 3104 total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE);
5f5bcd80
KW
3105 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
3106 align >> BDRV_SECTOR_BITS);
893a8f62
MK
3107 if (max_nb_sectors > 0) {
3108 ret = drv->bdrv_co_readv(bs, sector_num,
3109 MIN(nb_sectors, max_nb_sectors), qiov);
3110 } else {
3111 ret = 0;
3112 }
3113
3114 /* Reading beyond end of file is supposed to produce zeroes */
3115 if (ret == 0 && total_sectors < sector_num + nb_sectors) {
3116 uint64_t offset = MAX(0, total_sectors - sector_num);
3117 uint64_t bytes = (sector_num + nb_sectors - offset) *
3118 BDRV_SECTOR_SIZE;
3119 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
3120 }
3121 }
ab185921
SH
3122
3123out:
dbffbdcf 3124 return ret;
da1fa91d
KW
3125}
3126
d0c7f642
KW
3127/*
3128 * Handle a read request in coroutine context
3129 */
1b0288ae
KW
3130static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
3131 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
d0c7f642
KW
3132 BdrvRequestFlags flags)
3133{
3134 BlockDriver *drv = bs->drv;
65afd211
KW
3135 BdrvTrackedRequest req;
3136
1b0288ae
KW
3137 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3138 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3139 uint8_t *head_buf = NULL;
3140 uint8_t *tail_buf = NULL;
3141 QEMUIOVector local_qiov;
3142 bool use_local_qiov = false;
d0c7f642
KW
3143 int ret;
3144
3145 if (!drv) {
3146 return -ENOMEDIUM;
3147 }
1b0288ae 3148 if (bdrv_check_byte_request(bs, offset, bytes)) {
d0c7f642
KW
3149 return -EIO;
3150 }
3151
3152 if (bs->copy_on_read) {
3153 flags |= BDRV_REQ_COPY_ON_READ;
3154 }
3155
3156 /* throttling disk I/O */
3157 if (bs->io_limits_enabled) {
d5103588 3158 bdrv_io_limits_intercept(bs, bytes, false);
1b0288ae
KW
3159 }
3160
3161 /* Align read if necessary by padding qiov */
3162 if (offset & (align - 1)) {
3163 head_buf = qemu_blockalign(bs, align);
3164 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3165 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3166 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3167 use_local_qiov = true;
3168
3169 bytes += offset & (align - 1);
3170 offset = offset & ~(align - 1);
3171 }
3172
3173 if ((offset + bytes) & (align - 1)) {
3174 if (!use_local_qiov) {
3175 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3176 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3177 use_local_qiov = true;
3178 }
3179 tail_buf = qemu_blockalign(bs, align);
3180 qemu_iovec_add(&local_qiov, tail_buf,
3181 align - ((offset + bytes) & (align - 1)));
3182
3183 bytes = ROUND_UP(bytes, align);
3184 }
3185
65afd211 3186 tracked_request_begin(&req, bs, offset, bytes, false);
ec746e10 3187 ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
1b0288ae
KW
3188 use_local_qiov ? &local_qiov : qiov,
3189 flags);
65afd211 3190 tracked_request_end(&req);
1b0288ae
KW
3191
3192 if (use_local_qiov) {
3193 qemu_iovec_destroy(&local_qiov);
3194 qemu_vfree(head_buf);
3195 qemu_vfree(tail_buf);
d0c7f642
KW
3196 }
3197
d0c7f642
KW
3198 return ret;
3199}
3200
1b0288ae
KW
3201static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3202 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3203 BdrvRequestFlags flags)
3204{
3205 if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
3206 return -EINVAL;
3207 }
3208
3209 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3210 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3211}
3212
c5fbe571 3213int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
da1fa91d
KW
3214 int nb_sectors, QEMUIOVector *qiov)
3215{
c5fbe571 3216 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
da1fa91d 3217
470c0504
SH
3218 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3219}
3220
3221int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3222 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3223{
3224 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3225
3226 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3227 BDRV_REQ_COPY_ON_READ);
c5fbe571
SH
3228}
3229
c31cb707
PL
3230/* if no limit is specified in the BlockLimits use a default
3231 * of 32768 512-byte sectors (16 MiB) per request.
3232 */
3233#define MAX_WRITE_ZEROES_DEFAULT 32768
3234
f08f2dda 3235static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
aa7bfbff 3236 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
f08f2dda
SH
3237{
3238 BlockDriver *drv = bs->drv;
3239 QEMUIOVector qiov;
c31cb707
PL
3240 struct iovec iov = {0};
3241 int ret = 0;
f08f2dda 3242
c31cb707
PL
3243 int max_write_zeroes = bs->bl.max_write_zeroes ?
3244 bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
621f0589 3245
c31cb707
PL
3246 while (nb_sectors > 0 && !ret) {
3247 int num = nb_sectors;
3248
b8d71c09
PB
3249 /* Align request. Block drivers can expect the "bulk" of the request
3250 * to be aligned.
3251 */
3252 if (bs->bl.write_zeroes_alignment
3253 && num > bs->bl.write_zeroes_alignment) {
3254 if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3255 /* Make a small request up to the first aligned sector. */
c31cb707 3256 num = bs->bl.write_zeroes_alignment;
b8d71c09
PB
3257 num -= sector_num % bs->bl.write_zeroes_alignment;
3258 } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3259 /* Shorten the request to the last aligned sector. num cannot
3260 * underflow because num > bs->bl.write_zeroes_alignment.
3261 */
3262 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
c31cb707 3263 }
621f0589 3264 }
f08f2dda 3265
c31cb707
PL
3266 /* limit request size */
3267 if (num > max_write_zeroes) {
3268 num = max_write_zeroes;
3269 }
3270
3271 ret = -ENOTSUP;
3272 /* First try the efficient write zeroes operation */
3273 if (drv->bdrv_co_write_zeroes) {
3274 ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3275 }
3276
3277 if (ret == -ENOTSUP) {
3278 /* Fall back to bounce buffer if write zeroes is unsupported */
3279 iov.iov_len = num * BDRV_SECTOR_SIZE;
3280 if (iov.iov_base == NULL) {
b8d71c09
PB
3281 iov.iov_base = qemu_blockalign(bs, num * BDRV_SECTOR_SIZE);
3282 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
c31cb707
PL
3283 }
3284 qemu_iovec_init_external(&qiov, &iov, 1);
f08f2dda 3285
c31cb707 3286 ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
b8d71c09
PB
3287
3288 /* Keep bounce buffer around if it is big enough for all
3289 * all future requests.
3290 */
3291 if (num < max_write_zeroes) {
3292 qemu_vfree(iov.iov_base);
3293 iov.iov_base = NULL;
3294 }
c31cb707
PL
3295 }
3296
3297 sector_num += num;
3298 nb_sectors -= num;
3299 }
f08f2dda
SH
3300
3301 qemu_vfree(iov.iov_base);
3302 return ret;
3303}
3304
c5fbe571 3305/*
b404f720 3306 * Forwards an already correctly aligned write request to the BlockDriver.
c5fbe571 3307 */
b404f720 3308static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
65afd211
KW
3309 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3310 QEMUIOVector *qiov, int flags)
c5fbe571
SH
3311{
3312 BlockDriver *drv = bs->drv;
28de2dcd 3313 bool waited;
6b7cb247 3314 int ret;
da1fa91d 3315
b404f720
KW
3316 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3317 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
f4658285 3318
b404f720
KW
3319 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3320 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
cc0681c4 3321
28de2dcd
KW
3322 waited = wait_serialising_requests(req);
3323 assert(!waited || !req->serialising);
af91f9a7
KW
3324 assert(req->overlap_offset <= offset);
3325 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
244eadef 3326
65afd211 3327 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
d616b224 3328
465bee1d
PL
3329 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
3330 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
3331 qemu_iovec_is_zero(qiov)) {
3332 flags |= BDRV_REQ_ZERO_WRITE;
3333 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
3334 flags |= BDRV_REQ_MAY_UNMAP;
3335 }
3336 }
3337
d616b224
SH
3338 if (ret < 0) {
3339 /* Do nothing, write notifier decided to fail this request */
3340 } else if (flags & BDRV_REQ_ZERO_WRITE) {
9e1cb96d 3341 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
aa7bfbff 3342 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
f08f2dda 3343 } else {
9e1cb96d 3344 BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
f08f2dda
SH
3345 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3346 }
9e1cb96d 3347 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
6b7cb247 3348
f05fa4ad
PB
3349 if (ret == 0 && !bs->enable_write_cache) {
3350 ret = bdrv_co_flush(bs);
3351 }
3352
e4654d2d 3353 bdrv_set_dirty(bs, sector_num, nb_sectors);
da1fa91d
KW
3354
3355 if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
3356 bs->wr_highest_sector = sector_num + nb_sectors - 1;
3357 }
df2a6f29
PB
3358 if (bs->growable && ret >= 0) {
3359 bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3360 }
da1fa91d 3361
6b7cb247 3362 return ret;
da1fa91d
KW
3363}
3364
b404f720
KW
3365/*
3366 * Handle a write request in coroutine context
3367 */
6601553e
KW
3368static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3369 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
b404f720
KW
3370 BdrvRequestFlags flags)
3371{
65afd211 3372 BdrvTrackedRequest req;
3b8242e0
KW
3373 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3374 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3375 uint8_t *head_buf = NULL;
3376 uint8_t *tail_buf = NULL;
3377 QEMUIOVector local_qiov;
3378 bool use_local_qiov = false;
b404f720
KW
3379 int ret;
3380
3381 if (!bs->drv) {
3382 return -ENOMEDIUM;
3383 }
3384 if (bs->read_only) {
3385 return -EACCES;
3386 }
6601553e 3387 if (bdrv_check_byte_request(bs, offset, bytes)) {
b404f720
KW
3388 return -EIO;
3389 }
3390
b404f720
KW
3391 /* throttling disk I/O */
3392 if (bs->io_limits_enabled) {
d5103588 3393 bdrv_io_limits_intercept(bs, bytes, true);
b404f720
KW
3394 }
3395
3b8242e0
KW
3396 /*
3397 * Align write if necessary by performing a read-modify-write cycle.
3398 * Pad qiov with the read parts and be sure to have a tracked request not
3399 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3400 */
65afd211 3401 tracked_request_begin(&req, bs, offset, bytes, true);
3b8242e0
KW
3402
3403 if (offset & (align - 1)) {
3404 QEMUIOVector head_qiov;
3405 struct iovec head_iov;
3406
3407 mark_request_serialising(&req, align);
3408 wait_serialising_requests(&req);
3409
3410 head_buf = qemu_blockalign(bs, align);
3411 head_iov = (struct iovec) {
3412 .iov_base = head_buf,
3413 .iov_len = align,
3414 };
3415 qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3416
9e1cb96d 3417 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3b8242e0
KW
3418 ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3419 align, &head_qiov, 0);
3420 if (ret < 0) {
3421 goto fail;
3422 }
9e1cb96d 3423 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3b8242e0
KW
3424
3425 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3426 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3427 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3428 use_local_qiov = true;
3429
3430 bytes += offset & (align - 1);
3431 offset = offset & ~(align - 1);
3432 }
3433
3434 if ((offset + bytes) & (align - 1)) {
3435 QEMUIOVector tail_qiov;
3436 struct iovec tail_iov;
3437 size_t tail_bytes;
28de2dcd 3438 bool waited;
3b8242e0
KW
3439
3440 mark_request_serialising(&req, align);
28de2dcd
KW
3441 waited = wait_serialising_requests(&req);
3442 assert(!waited || !use_local_qiov);
3b8242e0
KW
3443
3444 tail_buf = qemu_blockalign(bs, align);
3445 tail_iov = (struct iovec) {
3446 .iov_base = tail_buf,
3447 .iov_len = align,
3448 };
3449 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3450
9e1cb96d 3451 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3b8242e0
KW
3452 ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3453 align, &tail_qiov, 0);
3454 if (ret < 0) {
3455 goto fail;
3456 }
9e1cb96d 3457 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3b8242e0
KW
3458
3459 if (!use_local_qiov) {
3460 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3461 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3462 use_local_qiov = true;
3463 }
3464
3465 tail_bytes = (offset + bytes) & (align - 1);
3466 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3467
3468 bytes = ROUND_UP(bytes, align);
3469 }
3470
3471 ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3472 use_local_qiov ? &local_qiov : qiov,
3473 flags);
3474
3475fail:
65afd211 3476 tracked_request_end(&req);
b404f720 3477
3b8242e0
KW
3478 if (use_local_qiov) {
3479 qemu_iovec_destroy(&local_qiov);
3b8242e0 3480 }
99c4a85c
KW
3481 qemu_vfree(head_buf);
3482 qemu_vfree(tail_buf);
3b8242e0 3483
b404f720
KW
3484 return ret;
3485}
3486
6601553e
KW
3487static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3488 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3489 BdrvRequestFlags flags)
3490{
3491 if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3492 return -EINVAL;
3493 }
3494
3495 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3496 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3497}
3498
c5fbe571
SH
3499int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3500 int nb_sectors, QEMUIOVector *qiov)
3501{
3502 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3503
f08f2dda
SH
3504 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3505}
3506
3507int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
aa7bfbff
PL
3508 int64_t sector_num, int nb_sectors,
3509 BdrvRequestFlags flags)
f08f2dda 3510{
94d6ff21 3511 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
f08f2dda 3512
d32f35cb
PL
3513 if (!(bs->open_flags & BDRV_O_UNMAP)) {
3514 flags &= ~BDRV_REQ_MAY_UNMAP;
3515 }
3516
f08f2dda 3517 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
aa7bfbff 3518 BDRV_REQ_ZERO_WRITE | flags);
c5fbe571
SH
3519}
3520
83f64091
FB
3521/**
3522 * Truncate file to 'offset' bytes (needed only for file protocols)
3523 */
3524int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3525{
3526 BlockDriver *drv = bs->drv;
51762288 3527 int ret;
83f64091 3528 if (!drv)
19cb3738 3529 return -ENOMEDIUM;
83f64091
FB
3530 if (!drv->bdrv_truncate)
3531 return -ENOTSUP;
59f2689d
NS
3532 if (bs->read_only)
3533 return -EACCES;
3718d8ab 3534 if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_RESIZE, NULL)) {
8591675f 3535 return -EBUSY;
3718d8ab 3536 }
51762288
SH
3537 ret = drv->bdrv_truncate(bs, offset);
3538 if (ret == 0) {
3539 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
145feb17 3540 bdrv_dev_resize_cb(bs);
51762288
SH
3541 }
3542 return ret;
83f64091
FB
3543}
3544
4a1d5e1f
FZ
3545/**
3546 * Length of a allocated file in bytes. Sparse files are counted by actual
3547 * allocated space. Return < 0 if error or unknown.
3548 */
3549int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3550{
3551 BlockDriver *drv = bs->drv;
3552 if (!drv) {
3553 return -ENOMEDIUM;
3554 }
3555 if (drv->bdrv_get_allocated_file_size) {
3556 return drv->bdrv_get_allocated_file_size(bs);
3557 }
3558 if (bs->file) {
3559 return bdrv_get_allocated_file_size(bs->file);
3560 }
3561 return -ENOTSUP;
3562}
3563
83f64091
FB
3564/**
3565 * Length of a file in bytes. Return < 0 if error or unknown.
3566 */
3567int64_t bdrv_getlength(BlockDriverState *bs)
3568{
3569 BlockDriver *drv = bs->drv;
3570 if (!drv)
19cb3738 3571 return -ENOMEDIUM;
51762288 3572
b94a2610
KW
3573 if (drv->has_variable_length) {
3574 int ret = refresh_total_sectors(bs, bs->total_sectors);
3575 if (ret < 0) {
3576 return ret;
46a4e4e6 3577 }
83f64091 3578 }
46a4e4e6 3579 return bs->total_sectors * BDRV_SECTOR_SIZE;
fc01f7e7
FB
3580}
3581
19cb3738 3582/* return 0 as number of sectors if no device present or error */
96b8f136 3583void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
fc01f7e7 3584{
19cb3738
FB
3585 int64_t length;
3586 length = bdrv_getlength(bs);
3587 if (length < 0)
3588 length = 0;
3589 else
6ea44308 3590 length = length >> BDRV_SECTOR_BITS;
19cb3738 3591 *nb_sectors_ptr = length;
fc01f7e7 3592}
cf98951b 3593
ff06f5f3
PB
3594void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3595 BlockdevOnError on_write_error)
abd7f68d
MA
3596{
3597 bs->on_read_error = on_read_error;
3598 bs->on_write_error = on_write_error;
3599}
3600
1ceee0d5 3601BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
abd7f68d
MA
3602{
3603 return is_read ? bs->on_read_error : bs->on_write_error;
3604}
3605
3e1caa5f
PB
3606BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3607{
3608 BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3609
3610 switch (on_err) {
3611 case BLOCKDEV_ON_ERROR_ENOSPC:
3612 return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
3613 case BLOCKDEV_ON_ERROR_STOP:
3614 return BDRV_ACTION_STOP;
3615 case BLOCKDEV_ON_ERROR_REPORT:
3616 return BDRV_ACTION_REPORT;
3617 case BLOCKDEV_ON_ERROR_IGNORE:
3618 return BDRV_ACTION_IGNORE;
3619 default:
3620 abort();
3621 }
3622}
3623
3624/* This is done by device models because, while the block layer knows
3625 * about the error, it does not know whether an operation comes from
3626 * the device or the block layer (from a job, for example).
3627 */
3628void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3629 bool is_read, int error)
3630{
3631 assert(error >= 0);
32c81a4a 3632 bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
3e1caa5f
PB
3633 if (action == BDRV_ACTION_STOP) {
3634 vm_stop(RUN_STATE_IO_ERROR);
3635 bdrv_iostatus_set_err(bs, error);
3636 }
3637}
3638
b338082b
FB
3639int bdrv_is_read_only(BlockDriverState *bs)
3640{
3641 return bs->read_only;
3642}
3643
985a03b0
TS
3644int bdrv_is_sg(BlockDriverState *bs)
3645{
3646 return bs->sg;
3647}
3648
e900a7b7
CH
3649int bdrv_enable_write_cache(BlockDriverState *bs)
3650{
3651 return bs->enable_write_cache;
3652}
3653
425b0148
PB
3654void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3655{
3656 bs->enable_write_cache = wce;
55b110f2
JC
3657
3658 /* so a reopen() will preserve wce */
3659 if (wce) {
3660 bs->open_flags |= BDRV_O_CACHE_WB;
3661 } else {
3662 bs->open_flags &= ~BDRV_O_CACHE_WB;
3663 }
425b0148
PB
3664}
3665
ea2384d3
FB
3666int bdrv_is_encrypted(BlockDriverState *bs)
3667{
3668 if (bs->backing_hd && bs->backing_hd->encrypted)
3669 return 1;
3670 return bs->encrypted;
3671}
3672
c0f4ce77
AL
3673int bdrv_key_required(BlockDriverState *bs)
3674{
3675 BlockDriverState *backing_hd = bs->backing_hd;
3676
3677 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3678 return 1;
3679 return (bs->encrypted && !bs->valid_key);
3680}
3681
ea2384d3
FB
3682int bdrv_set_key(BlockDriverState *bs, const char *key)
3683{
3684 int ret;
3685 if (bs->backing_hd && bs->backing_hd->encrypted) {
3686 ret = bdrv_set_key(bs->backing_hd, key);
3687 if (ret < 0)
3688 return ret;
3689 if (!bs->encrypted)
3690 return 0;
3691 }
fd04a2ae
SH
3692 if (!bs->encrypted) {
3693 return -EINVAL;
3694 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3695 return -ENOMEDIUM;
3696 }
c0f4ce77 3697 ret = bs->drv->bdrv_set_key(bs, key);
bb5fc20f
AL
3698 if (ret < 0) {
3699 bs->valid_key = 0;
3700 } else if (!bs->valid_key) {
3701 bs->valid_key = 1;
3702 /* call the change callback now, we skipped it on open */
7d4b4ba5 3703 bdrv_dev_change_media_cb(bs, true);
bb5fc20f 3704 }
c0f4ce77 3705 return ret;
ea2384d3
FB
3706}
3707
f8d6bba1 3708const char *bdrv_get_format_name(BlockDriverState *bs)
ea2384d3 3709{
f8d6bba1 3710 return bs->drv ? bs->drv->format_name : NULL;
ea2384d3
FB
3711}
3712
5fafdf24 3713void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
ea2384d3
FB
3714 void *opaque)
3715{
3716 BlockDriver *drv;
e855e4fb
JC
3717 int count = 0;
3718 const char **formats = NULL;
ea2384d3 3719
8a22f02a 3720 QLIST_FOREACH(drv, &bdrv_drivers, list) {
e855e4fb
JC
3721 if (drv->format_name) {
3722 bool found = false;
3723 int i = count;
3724 while (formats && i && !found) {
3725 found = !strcmp(formats[--i], drv->format_name);
3726 }
3727
3728 if (!found) {
3729 formats = g_realloc(formats, (count + 1) * sizeof(char *));
3730 formats[count++] = drv->format_name;
3731 it(opaque, drv->format_name);
3732 }
3733 }
ea2384d3 3734 }
e855e4fb 3735 g_free(formats);
ea2384d3
FB
3736}
3737
dc364f4c 3738/* This function is to find block backend bs */
b338082b
FB
3739BlockDriverState *bdrv_find(const char *name)
3740{
3741 BlockDriverState *bs;
3742
dc364f4c 3743 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1b7bdbc1 3744 if (!strcmp(name, bs->device_name)) {
b338082b 3745 return bs;
1b7bdbc1 3746 }
b338082b
FB
3747 }
3748 return NULL;
3749}
3750
dc364f4c
BC
3751/* This function is to find a node in the bs graph */
3752BlockDriverState *bdrv_find_node(const char *node_name)
3753{
3754 BlockDriverState *bs;
3755
3756 assert(node_name);
3757
3758 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3759 if (!strcmp(node_name, bs->node_name)) {
3760 return bs;
3761 }
3762 }
3763 return NULL;
3764}
3765
c13163fb
BC
3766/* Put this QMP function here so it can access the static graph_bdrv_states. */
3767BlockDeviceInfoList *bdrv_named_nodes_list(void)
3768{
3769 BlockDeviceInfoList *list, *entry;
3770 BlockDriverState *bs;
3771
3772 list = NULL;
3773 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3774 entry = g_malloc0(sizeof(*entry));
3775 entry->value = bdrv_block_device_info(bs);
3776 entry->next = list;
3777 list = entry;
3778 }
3779
3780 return list;
3781}
3782
12d3ba82
BC
3783BlockDriverState *bdrv_lookup_bs(const char *device,
3784 const char *node_name,
3785 Error **errp)
3786{
3787 BlockDriverState *bs = NULL;
3788
12d3ba82
BC
3789 if (device) {
3790 bs = bdrv_find(device);
3791
dd67fa50
BC
3792 if (bs) {
3793 return bs;
12d3ba82 3794 }
12d3ba82
BC
3795 }
3796
dd67fa50
BC
3797 if (node_name) {
3798 bs = bdrv_find_node(node_name);
12d3ba82 3799
dd67fa50
BC
3800 if (bs) {
3801 return bs;
3802 }
12d3ba82
BC
3803 }
3804
dd67fa50
BC
3805 error_setg(errp, "Cannot find device=%s nor node_name=%s",
3806 device ? device : "",
3807 node_name ? node_name : "");
3808 return NULL;
12d3ba82
BC
3809}
3810
2f399b0a
MA
3811BlockDriverState *bdrv_next(BlockDriverState *bs)
3812{
3813 if (!bs) {
3814 return QTAILQ_FIRST(&bdrv_states);
3815 }
dc364f4c 3816 return QTAILQ_NEXT(bs, device_list);
2f399b0a
MA
3817}
3818
51de9760 3819void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
81d0912d
FB
3820{
3821 BlockDriverState *bs;
3822
dc364f4c 3823 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
51de9760 3824 it(opaque, bs);
81d0912d
FB
3825 }
3826}
3827
ea2384d3
FB
3828const char *bdrv_get_device_name(BlockDriverState *bs)
3829{
3830 return bs->device_name;
3831}
3832
c8433287
MA
3833int bdrv_get_flags(BlockDriverState *bs)
3834{
3835 return bs->open_flags;
3836}
3837
f0f0fdfe 3838int bdrv_flush_all(void)
c6ca28d6
AL
3839{
3840 BlockDriverState *bs;
f0f0fdfe 3841 int result = 0;
c6ca28d6 3842
dc364f4c 3843 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
ed78cda3
SH
3844 AioContext *aio_context = bdrv_get_aio_context(bs);
3845 int ret;
3846
3847 aio_context_acquire(aio_context);
3848 ret = bdrv_flush(bs);
f0f0fdfe
KW
3849 if (ret < 0 && !result) {
3850 result = ret;
3851 }
ed78cda3 3852 aio_context_release(aio_context);
1b7bdbc1 3853 }
f0f0fdfe
KW
3854
3855 return result;
c6ca28d6
AL
3856}
3857
3ac21627
PL
3858int bdrv_has_zero_init_1(BlockDriverState *bs)
3859{
3860 return 1;
3861}
3862
f2feebbd
KW
3863int bdrv_has_zero_init(BlockDriverState *bs)
3864{
3865 assert(bs->drv);
3866
11212d8f
PB
3867 /* If BS is a copy on write image, it is initialized to
3868 the contents of the base image, which may not be zeroes. */
3869 if (bs->backing_hd) {
3870 return 0;
3871 }
336c1c12
KW
3872 if (bs->drv->bdrv_has_zero_init) {
3873 return bs->drv->bdrv_has_zero_init(bs);
f2feebbd
KW
3874 }
3875
3ac21627
PL
3876 /* safe default */
3877 return 0;
f2feebbd
KW
3878}
3879
4ce78691
PL
3880bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3881{
3882 BlockDriverInfo bdi;
3883
3884 if (bs->backing_hd) {
3885 return false;
3886 }
3887
3888 if (bdrv_get_info(bs, &bdi) == 0) {
3889 return bdi.unallocated_blocks_are_zero;
3890 }
3891
3892 return false;
3893}
3894
3895bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3896{
3897 BlockDriverInfo bdi;
3898
3899 if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3900 return false;
3901 }
3902
3903 if (bdrv_get_info(bs, &bdi) == 0) {
3904 return bdi.can_write_zeroes_with_unmap;
3905 }
3906
3907 return false;
3908}
3909
b6b8a333 3910typedef struct BdrvCoGetBlockStatusData {
376ae3f1 3911 BlockDriverState *bs;
b35b2bba 3912 BlockDriverState *base;
376ae3f1
SH
3913 int64_t sector_num;
3914 int nb_sectors;
3915 int *pnum;
b6b8a333 3916 int64_t ret;
376ae3f1 3917 bool done;
b6b8a333 3918} BdrvCoGetBlockStatusData;
376ae3f1 3919
f58c7b35
TS
3920/*
3921 * Returns true iff the specified sector is present in the disk image. Drivers
3922 * not implementing the functionality are assumed to not support backing files,
3923 * hence all their sectors are reported as allocated.
3924 *
bd9533e3
SH
3925 * If 'sector_num' is beyond the end of the disk image the return value is 0
3926 * and 'pnum' is set to 0.
3927 *
f58c7b35
TS
3928 * 'pnum' is set to the number of sectors (including and immediately following
3929 * the specified sector) that are known to be in the same
3930 * allocated/unallocated state.
3931 *
bd9533e3
SH
3932 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
3933 * beyond the end of the disk image it will be clamped.
f58c7b35 3934 */
b6b8a333
PB
3935static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3936 int64_t sector_num,
3937 int nb_sectors, int *pnum)
f58c7b35 3938{
617ccb46 3939 int64_t length;
bd9533e3 3940 int64_t n;
5daa74a6 3941 int64_t ret, ret2;
bd9533e3 3942
617ccb46
PB
3943 length = bdrv_getlength(bs);
3944 if (length < 0) {
3945 return length;
3946 }
3947
3948 if (sector_num >= (length >> BDRV_SECTOR_BITS)) {
bd9533e3
SH
3949 *pnum = 0;
3950 return 0;
3951 }
3952
3953 n = bs->total_sectors - sector_num;
3954 if (n < nb_sectors) {
3955 nb_sectors = n;
3956 }
3957
b6b8a333 3958 if (!bs->drv->bdrv_co_get_block_status) {
bd9533e3 3959 *pnum = nb_sectors;
e88ae226 3960 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
918e92d7
PB
3961 if (bs->drv->protocol_name) {
3962 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
3963 }
3964 return ret;
f58c7b35 3965 }
6aebab14 3966
415b5b01
PB
3967 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
3968 if (ret < 0) {
3e0a233d 3969 *pnum = 0;
415b5b01
PB
3970 return ret;
3971 }
3972
92bc50a5
PL
3973 if (ret & BDRV_BLOCK_RAW) {
3974 assert(ret & BDRV_BLOCK_OFFSET_VALID);
3975 return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3976 *pnum, pnum);
3977 }
3978
e88ae226
KW
3979 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
3980 ret |= BDRV_BLOCK_ALLOCATED;
3981 }
3982
c3d86884
PL
3983 if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
3984 if (bdrv_unallocated_blocks_are_zero(bs)) {
f0ad5712 3985 ret |= BDRV_BLOCK_ZERO;
1f9db224 3986 } else if (bs->backing_hd) {
f0ad5712
PB
3987 BlockDriverState *bs2 = bs->backing_hd;
3988 int64_t length2 = bdrv_getlength(bs2);
3989 if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) {
3990 ret |= BDRV_BLOCK_ZERO;
3991 }
3992 }
415b5b01 3993 }
5daa74a6
PB
3994
3995 if (bs->file &&
3996 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
3997 (ret & BDRV_BLOCK_OFFSET_VALID)) {
3998 ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3999 *pnum, pnum);
4000 if (ret2 >= 0) {
4001 /* Ignore errors. This is just providing extra information, it
4002 * is useful but not necessary.
4003 */
4004 ret |= (ret2 & BDRV_BLOCK_ZERO);
4005 }
4006 }
4007
415b5b01 4008 return ret;
060f51c9
SH
4009}
4010
b6b8a333
PB
4011/* Coroutine wrapper for bdrv_get_block_status() */
4012static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
060f51c9 4013{
b6b8a333 4014 BdrvCoGetBlockStatusData *data = opaque;
060f51c9
SH
4015 BlockDriverState *bs = data->bs;
4016
b6b8a333
PB
4017 data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
4018 data->pnum);
060f51c9
SH
4019 data->done = true;
4020}
4021
4022/*
b6b8a333 4023 * Synchronous wrapper around bdrv_co_get_block_status().
060f51c9 4024 *
b6b8a333 4025 * See bdrv_co_get_block_status() for details.
060f51c9 4026 */
b6b8a333
PB
4027int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
4028 int nb_sectors, int *pnum)
060f51c9 4029{
6aebab14 4030 Coroutine *co;
b6b8a333 4031 BdrvCoGetBlockStatusData data = {
6aebab14
SH
4032 .bs = bs,
4033 .sector_num = sector_num,
4034 .nb_sectors = nb_sectors,
4035 .pnum = pnum,
4036 .done = false,
4037 };
4038
bdad13b9
PB
4039 if (qemu_in_coroutine()) {
4040 /* Fast-path if already in coroutine context */
b6b8a333 4041 bdrv_get_block_status_co_entry(&data);
bdad13b9 4042 } else {
2572b37a
SH
4043 AioContext *aio_context = bdrv_get_aio_context(bs);
4044
b6b8a333 4045 co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
bdad13b9
PB
4046 qemu_coroutine_enter(co, &data);
4047 while (!data.done) {
2572b37a 4048 aio_poll(aio_context, true);
bdad13b9 4049 }
6aebab14
SH
4050 }
4051 return data.ret;
f58c7b35
TS
4052}
4053
b6b8a333
PB
4054int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
4055 int nb_sectors, int *pnum)
4056{
4333bb71
PB
4057 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
4058 if (ret < 0) {
4059 return ret;
4060 }
e88ae226 4061 return (ret & BDRV_BLOCK_ALLOCATED);
b6b8a333
PB
4062}
4063
188a7bbf
PB
4064/*
4065 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
4066 *
4067 * Return true if the given sector is allocated in any image between
4068 * BASE and TOP (inclusive). BASE can be NULL to check if the given
4069 * sector is allocated in any image of the chain. Return false otherwise.
4070 *
4071 * 'pnum' is set to the number of sectors (including and immediately following
4072 * the specified sector) that are known to be in the same
4073 * allocated/unallocated state.
4074 *
4075 */
4f578637
PB
4076int bdrv_is_allocated_above(BlockDriverState *top,
4077 BlockDriverState *base,
4078 int64_t sector_num,
4079 int nb_sectors, int *pnum)
188a7bbf
PB
4080{
4081 BlockDriverState *intermediate;
4082 int ret, n = nb_sectors;
4083
4084 intermediate = top;
4085 while (intermediate && intermediate != base) {
4086 int pnum_inter;
bdad13b9
PB
4087 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
4088 &pnum_inter);
188a7bbf
PB
4089 if (ret < 0) {
4090 return ret;
4091 } else if (ret) {
4092 *pnum = pnum_inter;
4093 return 1;
4094 }
4095
4096 /*
4097 * [sector_num, nb_sectors] is unallocated on top but intermediate
4098 * might have
4099 *
4100 * [sector_num+x, nr_sectors] allocated.
4101 */
63ba17d3
VI
4102 if (n > pnum_inter &&
4103 (intermediate == top ||
4104 sector_num + pnum_inter < intermediate->total_sectors)) {
188a7bbf
PB
4105 n = pnum_inter;
4106 }
4107
4108 intermediate = intermediate->backing_hd;
4109 }
4110
4111 *pnum = n;
4112 return 0;
4113}
4114
045df330
AL
4115const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4116{
4117 if (bs->backing_hd && bs->backing_hd->encrypted)
4118 return bs->backing_file;
4119 else if (bs->encrypted)
4120 return bs->filename;
4121 else
4122 return NULL;
4123}
4124
5fafdf24 4125void bdrv_get_backing_filename(BlockDriverState *bs,
83f64091
FB
4126 char *filename, int filename_size)
4127{
3574c608 4128 pstrcpy(filename, filename_size, bs->backing_file);
83f64091
FB
4129}
4130
5fafdf24 4131int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
faea38e7
FB
4132 const uint8_t *buf, int nb_sectors)
4133{
4134 BlockDriver *drv = bs->drv;
4135 if (!drv)
19cb3738 4136 return -ENOMEDIUM;
faea38e7
FB
4137 if (!drv->bdrv_write_compressed)
4138 return -ENOTSUP;
fbb7b4e0
KW
4139 if (bdrv_check_request(bs, sector_num, nb_sectors))
4140 return -EIO;
a55eb92c 4141
e4654d2d 4142 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
a55eb92c 4143
faea38e7
FB
4144 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4145}
3b46e624 4146
faea38e7
FB
4147int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4148{
4149 BlockDriver *drv = bs->drv;
4150 if (!drv)
19cb3738 4151 return -ENOMEDIUM;
faea38e7
FB
4152 if (!drv->bdrv_get_info)
4153 return -ENOTSUP;
4154 memset(bdi, 0, sizeof(*bdi));
4155 return drv->bdrv_get_info(bs, bdi);
4156}
4157
eae041fe
HR
4158ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4159{
4160 BlockDriver *drv = bs->drv;
4161 if (drv && drv->bdrv_get_specific_info) {
4162 return drv->bdrv_get_specific_info(bs);
4163 }
4164 return NULL;
4165}
4166
45566e9c
CH
4167int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4168 int64_t pos, int size)
cf8074b3
KW
4169{
4170 QEMUIOVector qiov;
4171 struct iovec iov = {
4172 .iov_base = (void *) buf,
4173 .iov_len = size,
4174 };
4175
4176 qemu_iovec_init_external(&qiov, &iov, 1);
4177 return bdrv_writev_vmstate(bs, &qiov, pos);
4178}
4179
4180int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
178e08a5
AL
4181{
4182 BlockDriver *drv = bs->drv;
cf8074b3
KW
4183
4184 if (!drv) {
178e08a5 4185 return -ENOMEDIUM;
cf8074b3
KW
4186 } else if (drv->bdrv_save_vmstate) {
4187 return drv->bdrv_save_vmstate(bs, qiov, pos);
4188 } else if (bs->file) {
4189 return bdrv_writev_vmstate(bs->file, qiov, pos);
4190 }
4191
7cdb1f6d 4192 return -ENOTSUP;
178e08a5
AL
4193}
4194
45566e9c
CH
4195int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4196 int64_t pos, int size)
178e08a5
AL
4197{
4198 BlockDriver *drv = bs->drv;
4199 if (!drv)
4200 return -ENOMEDIUM;
7cdb1f6d
MK
4201 if (drv->bdrv_load_vmstate)
4202 return drv->bdrv_load_vmstate(bs, buf, pos, size);
4203 if (bs->file)
4204 return bdrv_load_vmstate(bs->file, buf, pos, size);
4205 return -ENOTSUP;
178e08a5
AL
4206}
4207
8b9b0cc2
KW
4208void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4209{
bf736fe3 4210 if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
8b9b0cc2
KW
4211 return;
4212 }
4213
bf736fe3 4214 bs->drv->bdrv_debug_event(bs, event);
41c695c7
KW
4215}
4216
4217int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4218 const char *tag)
4219{
4220 while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4221 bs = bs->file;
4222 }
4223
4224 if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4225 return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4226 }
4227
4228 return -ENOTSUP;
4229}
4230
4cc70e93
FZ
4231int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4232{
4233 while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4234 bs = bs->file;
4235 }
4236
4237 if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4238 return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4239 }
4240
4241 return -ENOTSUP;
4242}
4243
41c695c7
KW
4244int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4245{
938789ea 4246 while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
41c695c7
KW
4247 bs = bs->file;
4248 }
8b9b0cc2 4249
41c695c7
KW
4250 if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4251 return bs->drv->bdrv_debug_resume(bs, tag);
4252 }
4253
4254 return -ENOTSUP;
4255}
4256
4257bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4258{
4259 while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4260 bs = bs->file;
4261 }
4262
4263 if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4264 return bs->drv->bdrv_debug_is_suspended(bs, tag);
4265 }
4266
4267 return false;
8b9b0cc2
KW
4268}
4269
199630b6
BS
4270int bdrv_is_snapshot(BlockDriverState *bs)
4271{
4272 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4273}
4274
b1b1d783
JC
4275/* backing_file can either be relative, or absolute, or a protocol. If it is
4276 * relative, it must be relative to the chain. So, passing in bs->filename
4277 * from a BDS as backing_file should not be done, as that may be relative to
4278 * the CWD rather than the chain. */
e8a6bb9c
MT
4279BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4280 const char *backing_file)
4281{
b1b1d783
JC
4282 char *filename_full = NULL;
4283 char *backing_file_full = NULL;
4284 char *filename_tmp = NULL;
4285 int is_protocol = 0;
4286 BlockDriverState *curr_bs = NULL;
4287 BlockDriverState *retval = NULL;
4288
4289 if (!bs || !bs->drv || !backing_file) {
e8a6bb9c
MT
4290 return NULL;
4291 }
4292
b1b1d783
JC
4293 filename_full = g_malloc(PATH_MAX);
4294 backing_file_full = g_malloc(PATH_MAX);
4295 filename_tmp = g_malloc(PATH_MAX);
4296
4297 is_protocol = path_has_protocol(backing_file);
4298
4299 for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4300
4301 /* If either of the filename paths is actually a protocol, then
4302 * compare unmodified paths; otherwise make paths relative */
4303 if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4304 if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4305 retval = curr_bs->backing_hd;
4306 break;
4307 }
e8a6bb9c 4308 } else {
b1b1d783
JC
4309 /* If not an absolute filename path, make it relative to the current
4310 * image's filename path */
4311 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4312 backing_file);
4313
4314 /* We are going to compare absolute pathnames */
4315 if (!realpath(filename_tmp, filename_full)) {
4316 continue;
4317 }
4318
4319 /* We need to make sure the backing filename we are comparing against
4320 * is relative to the current image filename (or absolute) */
4321 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4322 curr_bs->backing_file);
4323
4324 if (!realpath(filename_tmp, backing_file_full)) {
4325 continue;
4326 }
4327
4328 if (strcmp(backing_file_full, filename_full) == 0) {
4329 retval = curr_bs->backing_hd;
4330 break;
4331 }
e8a6bb9c
MT
4332 }
4333 }
4334
b1b1d783
JC
4335 g_free(filename_full);
4336 g_free(backing_file_full);
4337 g_free(filename_tmp);
4338 return retval;
e8a6bb9c
MT
4339}
4340
f198fd1c
BC
4341int bdrv_get_backing_file_depth(BlockDriverState *bs)
4342{
4343 if (!bs->drv) {
4344 return 0;
4345 }
4346
4347 if (!bs->backing_hd) {
4348 return 0;
4349 }
4350
4351 return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4352}
4353
79fac568
JC
4354BlockDriverState *bdrv_find_base(BlockDriverState *bs)
4355{
4356 BlockDriverState *curr_bs = NULL;
4357
4358 if (!bs) {
4359 return NULL;
4360 }
4361
4362 curr_bs = bs;
4363
4364 while (curr_bs->backing_hd) {
4365 curr_bs = curr_bs->backing_hd;
4366 }
4367 return curr_bs;
4368}
4369
ea2384d3 4370/**************************************************************/
83f64091 4371/* async I/Os */
ea2384d3 4372
3b69e4b9 4373BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
f141eafe 4374 QEMUIOVector *qiov, int nb_sectors,
3b69e4b9 4375 BlockDriverCompletionFunc *cb, void *opaque)
83f64091 4376{
bbf0a440
SH
4377 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4378
d20d9b7c 4379 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
8c5873d6 4380 cb, opaque, false);
ea2384d3
FB
4381}
4382
f141eafe
AL
4383BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4384 QEMUIOVector *qiov, int nb_sectors,
4385 BlockDriverCompletionFunc *cb, void *opaque)
ea2384d3 4386{
bbf0a440
SH
4387 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4388
d20d9b7c 4389 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
8c5873d6 4390 cb, opaque, true);
83f64091
FB
4391}
4392
d5ef94d4
PB
4393BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4394 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4395 BlockDriverCompletionFunc *cb, void *opaque)
4396{
4397 trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4398
4399 return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4400 BDRV_REQ_ZERO_WRITE | flags,
4401 cb, opaque, true);
4402}
4403
40b4f539
KW
4404
/* Shared completion state for one bdrv_aio_multiwrite() call. */
typedef struct MultiwriteCB {
    /* First negative completion code seen by multiwrite_cb(), else 0 */
    int error;
    /* Submitted (post-merge) requests that have not completed yet */
    int num_requests;
    /* Number of entries in callbacks[] (one per original request) */
    int num_callbacks;
    struct {
        /* Caller's completion function and its argument */
        BlockDriverCompletionFunc *cb;
        void *opaque;
        /* Merged I/O vector to destroy and free after the callback, or NULL */
        QEMUIOVector *free_qiov;
    } callbacks[];
} MultiwriteCB;
4415
4416static void multiwrite_user_cb(MultiwriteCB *mcb)
4417{
4418 int i;
4419
4420 for (i = 0; i < mcb->num_callbacks; i++) {
4421 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
1e1ea48d
SH
4422 if (mcb->callbacks[i].free_qiov) {
4423 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4424 }
7267c094 4425 g_free(mcb->callbacks[i].free_qiov);
40b4f539
KW
4426 }
4427}
4428
4429static void multiwrite_cb(void *opaque, int ret)
4430{
4431 MultiwriteCB *mcb = opaque;
4432
6d519a5f
SH
4433 trace_multiwrite_cb(mcb, ret);
4434
cb6d3ca0 4435 if (ret < 0 && !mcb->error) {
40b4f539 4436 mcb->error = ret;
40b4f539
KW
4437 }
4438
4439 mcb->num_requests--;
4440 if (mcb->num_requests == 0) {
de189a1b 4441 multiwrite_user_cb(mcb);
7267c094 4442 g_free(mcb);
40b4f539
KW
4443 }
4444}
4445
4446static int multiwrite_req_compare(const void *a, const void *b)
4447{
77be4366
CH
4448 const BlockRequest *req1 = a, *req2 = b;
4449
4450 /*
4451 * Note that we can't simply subtract req2->sector from req1->sector
4452 * here as that could overflow the return value.
4453 */
4454 if (req1->sector > req2->sector) {
4455 return 1;
4456 } else if (req1->sector < req2->sector) {
4457 return -1;
4458 } else {
4459 return 0;
4460 }
40b4f539
KW
4461}
4462
4463/*
4464 * Takes a bunch of requests and tries to merge them. Returns the number of
4465 * requests that remain after merging.
4466 */
4467static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4468 int num_reqs, MultiwriteCB *mcb)
4469{
4470 int i, outidx;
4471
4472 // Sort requests by start sector
4473 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4474
4475 // Check if adjacent requests touch the same clusters. If so, combine them,
4476 // filling up gaps with zero sectors.
4477 outidx = 0;
4478 for (i = 1; i < num_reqs; i++) {
4479 int merge = 0;
4480 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4481
b6a127a1 4482 // Handle exactly sequential writes and overlapping writes.
40b4f539
KW
4483 if (reqs[i].sector <= oldreq_last) {
4484 merge = 1;
4485 }
4486
e2a305fb
CH
4487 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4488 merge = 0;
4489 }
4490
40b4f539
KW
4491 if (merge) {
4492 size_t size;
7267c094 4493 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
40b4f539
KW
4494 qemu_iovec_init(qiov,
4495 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4496
4497 // Add the first request to the merged one. If the requests are
4498 // overlapping, drop the last sectors of the first request.
4499 size = (reqs[i].sector - reqs[outidx].sector) << 9;
1b093c48 4500 qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
40b4f539 4501
b6a127a1
PB
4502 // We should need to add any zeros between the two requests
4503 assert (reqs[i].sector <= oldreq_last);
40b4f539
KW
4504
4505 // Add the second request
1b093c48 4506 qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
40b4f539 4507
cbf1dff2 4508 reqs[outidx].nb_sectors = qiov->size >> 9;
40b4f539
KW
4509 reqs[outidx].qiov = qiov;
4510
4511 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4512 } else {
4513 outidx++;
4514 reqs[outidx].sector = reqs[i].sector;
4515 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4516 reqs[outidx].qiov = reqs[i].qiov;
4517 }
4518 }
4519
4520 return outidx + 1;
4521}
4522
4523/*
4524 * Submit multiple AIO write requests at once.
4525 *
4526 * On success, the function returns 0 and all requests in the reqs array have
4527 * been submitted. In error case this function returns -1, and any of the
4528 * requests may or may not be submitted yet. In particular, this means that the
4529 * callback will be called for some of the requests, for others it won't. The
4530 * caller must check the error field of the BlockRequest to wait for the right
4531 * callbacks (if error != 0, no callback will be called).
4532 *
4533 * The implementation may modify the contents of the reqs array, e.g. to merge
4534 * requests. However, the fields opaque and error are left unmodified as they
4535 * are used to signal failure for a single request to the caller.
4536 */
4537int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4538{
40b4f539
KW
4539 MultiwriteCB *mcb;
4540 int i;
4541
301db7c2
RH
4542 /* don't submit writes if we don't have a medium */
4543 if (bs->drv == NULL) {
4544 for (i = 0; i < num_reqs; i++) {
4545 reqs[i].error = -ENOMEDIUM;
4546 }
4547 return -1;
4548 }
4549
40b4f539
KW
4550 if (num_reqs == 0) {
4551 return 0;
4552 }
4553
4554 // Create MultiwriteCB structure
7267c094 4555 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
40b4f539
KW
4556 mcb->num_requests = 0;
4557 mcb->num_callbacks = num_reqs;
4558
4559 for (i = 0; i < num_reqs; i++) {
4560 mcb->callbacks[i].cb = reqs[i].cb;
4561 mcb->callbacks[i].opaque = reqs[i].opaque;
4562 }
4563
4564 // Check for mergable requests
4565 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4566
6d519a5f
SH
4567 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4568
df9309fb
PB
4569 /* Run the aio requests. */
4570 mcb->num_requests = num_reqs;
40b4f539 4571 for (i = 0; i < num_reqs; i++) {
d20d9b7c
PB
4572 bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4573 reqs[i].nb_sectors, reqs[i].flags,
4574 multiwrite_cb, mcb,
4575 true);
40b4f539
KW
4576 }
4577
4578 return 0;
40b4f539
KW
4579}
4580
83f64091 4581void bdrv_aio_cancel(BlockDriverAIOCB *acb)
83f64091 4582{
d7331bed 4583 acb->aiocb_info->cancel(acb);
83f64091
FB
4584}
4585
4586/**************************************************************/
4587/* async block device emulation */
4588
c16b5a2c
CH
/* AIOCB used to emulate AIO with the driver's synchronous read/write. */
typedef struct BlockDriverAIOCBSync {
    BlockDriverAIOCB common;
    /* Bottom half that delivers the completion (bdrv_aio_bh_cb) */
    QEMUBH *bh;
    /* Return value of the synchronous bdrv_read/bdrv_write call */
    int ret;
    /* vector translation state */
    QEMUIOVector *qiov;
    /* Linear buffer the synchronous call operates on */
    uint8_t *bounce;
    int is_write;
} BlockDriverAIOCBSync;
4598
4599static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
4600{
b666d239
KW
4601 BlockDriverAIOCBSync *acb =
4602 container_of(blockacb, BlockDriverAIOCBSync, common);
6a7ad299 4603 qemu_bh_delete(acb->bh);
36afc451 4604 acb->bh = NULL;
c16b5a2c
CH
4605 qemu_aio_release(acb);
4606}
4607
/* AIOCB description for requests emulated via synchronous driver calls. */
static const AIOCBInfo bdrv_em_aiocb_info = {
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
    .cancel             = bdrv_aio_cancel_em,
};
4612
ce1a14dc 4613static void bdrv_aio_bh_cb(void *opaque)
83f64091 4614{
ce1a14dc 4615 BlockDriverAIOCBSync *acb = opaque;
f141eafe 4616
f141eafe 4617 if (!acb->is_write)
03396148 4618 qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
ceb42de8 4619 qemu_vfree(acb->bounce);
ce1a14dc 4620 acb->common.cb(acb->common.opaque, acb->ret);
6a7ad299 4621 qemu_bh_delete(acb->bh);
36afc451 4622 acb->bh = NULL;
ce1a14dc 4623 qemu_aio_release(acb);
83f64091 4624}
beac80cd 4625
f141eafe
AL
4626static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4627 int64_t sector_num,
4628 QEMUIOVector *qiov,
4629 int nb_sectors,
4630 BlockDriverCompletionFunc *cb,
4631 void *opaque,
4632 int is_write)
4633
83f64091 4634{
ce1a14dc 4635 BlockDriverAIOCBSync *acb;
ce1a14dc 4636
d7331bed 4637 acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
f141eafe
AL
4638 acb->is_write = is_write;
4639 acb->qiov = qiov;
e268ca52 4640 acb->bounce = qemu_blockalign(bs, qiov->size);
2572b37a 4641 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
f141eafe
AL
4642
4643 if (is_write) {
d5e6b161 4644 qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
1ed20acf 4645 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
f141eafe 4646 } else {
1ed20acf 4647 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
f141eafe
AL
4648 }
4649
ce1a14dc 4650 qemu_bh_schedule(acb->bh);
f141eafe 4651
ce1a14dc 4652 return &acb->common;
beac80cd
FB
4653}
4654
f141eafe
AL
4655static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4656 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
ce1a14dc 4657 BlockDriverCompletionFunc *cb, void *opaque)
beac80cd 4658{
f141eafe
AL
4659 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4660}
83f64091 4661
f141eafe
AL
4662static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4663 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4664 BlockDriverCompletionFunc *cb, void *opaque)
4665{
4666 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
beac80cd 4667}
beac80cd 4668
68485420
KW
4669
/* AIOCB for requests that are carried out inside a coroutine. */
typedef struct BlockDriverAIOCBCoroutine {
    BlockDriverAIOCB common;
    /* Request parameters; req.error receives the result */
    BlockRequest req;
    bool is_write;
    /*
     * NULL unless a cancel is waiting: bdrv_aio_co_cancel_em() points this
     * at a flag that bdrv_co_em_bh() sets once the callback has run.
     */
    bool *done;
    /* Bottom half that delivers the completion (bdrv_co_em_bh) */
    QEMUBH* bh;
} BlockDriverAIOCBCoroutine;
4677
4678static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
4679{
2572b37a 4680 AioContext *aio_context = bdrv_get_aio_context(blockacb->bs);
d318aea9
KW
4681 BlockDriverAIOCBCoroutine *acb =
4682 container_of(blockacb, BlockDriverAIOCBCoroutine, common);
4683 bool done = false;
4684
4685 acb->done = &done;
4686 while (!done) {
2572b37a 4687 aio_poll(aio_context, true);
d318aea9 4688 }
68485420
KW
4689}
4690
/* AIOCB description for requests carried out inside a coroutine. */
static const AIOCBInfo bdrv_em_co_aiocb_info = {
    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
    .cancel             = bdrv_aio_co_cancel_em,
};
4695
35246a68 4696static void bdrv_co_em_bh(void *opaque)
68485420
KW
4697{
4698 BlockDriverAIOCBCoroutine *acb = opaque;
4699
4700 acb->common.cb(acb->common.opaque, acb->req.error);
d318aea9
KW
4701
4702 if (acb->done) {
4703 *acb->done = true;
4704 }
4705
68485420
KW
4706 qemu_bh_delete(acb->bh);
4707 qemu_aio_release(acb);
4708}
4709
b2a61371
SH
4710/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4711static void coroutine_fn bdrv_co_do_rw(void *opaque)
4712{
4713 BlockDriverAIOCBCoroutine *acb = opaque;
4714 BlockDriverState *bs = acb->common.bs;
4715
4716 if (!acb->is_write) {
4717 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
d20d9b7c 4718 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
b2a61371
SH
4719 } else {
4720 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
d20d9b7c 4721 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
b2a61371
SH
4722 }
4723
2572b37a 4724 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
b2a61371
SH
4725 qemu_bh_schedule(acb->bh);
4726}
4727
68485420
KW
4728static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4729 int64_t sector_num,
4730 QEMUIOVector *qiov,
4731 int nb_sectors,
d20d9b7c 4732 BdrvRequestFlags flags,
68485420
KW
4733 BlockDriverCompletionFunc *cb,
4734 void *opaque,
8c5873d6 4735 bool is_write)
68485420
KW
4736{
4737 Coroutine *co;
4738 BlockDriverAIOCBCoroutine *acb;
4739
d7331bed 4740 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
68485420
KW
4741 acb->req.sector = sector_num;
4742 acb->req.nb_sectors = nb_sectors;
4743 acb->req.qiov = qiov;
d20d9b7c 4744 acb->req.flags = flags;
68485420 4745 acb->is_write = is_write;
d318aea9 4746 acb->done = NULL;
68485420 4747
8c5873d6 4748 co = qemu_coroutine_create(bdrv_co_do_rw);
68485420
KW
4749 qemu_coroutine_enter(co, acb);
4750
4751 return &acb->common;
4752}
4753
07f07615 4754static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
b2e12bc6 4755{
07f07615
PB
4756 BlockDriverAIOCBCoroutine *acb = opaque;
4757 BlockDriverState *bs = acb->common.bs;
b2e12bc6 4758
07f07615 4759 acb->req.error = bdrv_co_flush(bs);
2572b37a 4760 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
b2e12bc6 4761 qemu_bh_schedule(acb->bh);
b2e12bc6
CH
4762}
4763
07f07615 4764BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
016f5cf6
AG
4765 BlockDriverCompletionFunc *cb, void *opaque)
4766{
07f07615 4767 trace_bdrv_aio_flush(bs, opaque);
016f5cf6 4768
07f07615
PB
4769 Coroutine *co;
4770 BlockDriverAIOCBCoroutine *acb;
016f5cf6 4771
d7331bed 4772 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
d318aea9
KW
4773 acb->done = NULL;
4774
07f07615
PB
4775 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4776 qemu_coroutine_enter(co, acb);
016f5cf6 4777
016f5cf6
AG
4778 return &acb->common;
4779}
4780
4265d620
PB
4781static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4782{
4783 BlockDriverAIOCBCoroutine *acb = opaque;
4784 BlockDriverState *bs = acb->common.bs;
4785
4786 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
2572b37a 4787 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4265d620
PB
4788 qemu_bh_schedule(acb->bh);
4789}
4790
4791BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4792 int64_t sector_num, int nb_sectors,
4793 BlockDriverCompletionFunc *cb, void *opaque)
4794{
4795 Coroutine *co;
4796 BlockDriverAIOCBCoroutine *acb;
4797
4798 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4799
d7331bed 4800 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4265d620
PB
4801 acb->req.sector = sector_num;
4802 acb->req.nb_sectors = nb_sectors;
d318aea9 4803 acb->done = NULL;
4265d620
PB
4804 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4805 qemu_coroutine_enter(co, acb);
4806
4807 return &acb->common;
4808}
4809
ea2384d3
FB
4810void bdrv_init(void)
4811{
5efa9d5a 4812 module_call_init(MODULE_INIT_BLOCK);
ea2384d3 4813}
ce1a14dc 4814
eb852011
MA
4815void bdrv_init_with_whitelist(void)
4816{
4817 use_bdrv_whitelist = 1;
4818 bdrv_init();
4819}
4820
d7331bed 4821void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
c16b5a2c 4822 BlockDriverCompletionFunc *cb, void *opaque)
ce1a14dc 4823{
ce1a14dc
PB
4824 BlockDriverAIOCB *acb;
4825
d7331bed
SH
4826 acb = g_slice_alloc(aiocb_info->aiocb_size);
4827 acb->aiocb_info = aiocb_info;
ce1a14dc
PB
4828 acb->bs = bs;
4829 acb->cb = cb;
4830 acb->opaque = opaque;
4831 return acb;
4832}
4833
4834void qemu_aio_release(void *p)
4835{
d37c975f 4836 BlockDriverAIOCB *acb = p;
d7331bed 4837 g_slice_free1(acb->aiocb_info->aiocb_size, acb);
ce1a14dc 4838}
19cb3738 4839
f9f05dc5
KW
4840/**************************************************************/
4841/* Coroutine block device emulation */
4842
/* Hand-off between an AIO completion callback and a yielded coroutine. */
typedef struct CoroutineIOCompletion {
    /* Coroutine to re-enter when the request completes */
    Coroutine *coroutine;
    /* Completion code passed to the callback */
    int ret;
} CoroutineIOCompletion;
4847
4848static void bdrv_co_io_em_complete(void *opaque, int ret)
4849{
4850 CoroutineIOCompletion *co = opaque;
4851
4852 co->ret = ret;
4853 qemu_coroutine_enter(co->coroutine, NULL);
4854}
4855
4856static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4857 int nb_sectors, QEMUIOVector *iov,
4858 bool is_write)
4859{
4860 CoroutineIOCompletion co = {
4861 .coroutine = qemu_coroutine_self(),
4862 };
4863 BlockDriverAIOCB *acb;
4864
4865 if (is_write) {
a652d160
SH
4866 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4867 bdrv_co_io_em_complete, &co);
f9f05dc5 4868 } else {
a652d160
SH
4869 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4870 bdrv_co_io_em_complete, &co);
f9f05dc5
KW
4871 }
4872
59370aaa 4873 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
f9f05dc5
KW
4874 if (!acb) {
4875 return -EIO;
4876 }
4877 qemu_coroutine_yield();
4878
4879 return co.ret;
4880}
4881
4882static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4883 int64_t sector_num, int nb_sectors,
4884 QEMUIOVector *iov)
4885{
4886 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4887}
4888
4889static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4890 int64_t sector_num, int nb_sectors,
4891 QEMUIOVector *iov)
4892{
4893 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4894}
4895
07f07615 4896static void coroutine_fn bdrv_flush_co_entry(void *opaque)
e7a8a783 4897{
07f07615
PB
4898 RwCo *rwco = opaque;
4899
4900 rwco->ret = bdrv_co_flush(rwco->bs);
4901}
4902
4903int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4904{
eb489bb1
KW
4905 int ret;
4906
29cdb251 4907 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
07f07615 4908 return 0;
eb489bb1
KW
4909 }
4910
ca716364 4911 /* Write back cached data to the OS even with cache=unsafe */
bf736fe3 4912 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
eb489bb1
KW
4913 if (bs->drv->bdrv_co_flush_to_os) {
4914 ret = bs->drv->bdrv_co_flush_to_os(bs);
4915 if (ret < 0) {
4916 return ret;
4917 }
4918 }
4919
ca716364
KW
4920 /* But don't actually force it to the disk with cache=unsafe */
4921 if (bs->open_flags & BDRV_O_NO_FLUSH) {
d4c82329 4922 goto flush_parent;
ca716364
KW
4923 }
4924
bf736fe3 4925 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
eb489bb1 4926 if (bs->drv->bdrv_co_flush_to_disk) {
29cdb251 4927 ret = bs->drv->bdrv_co_flush_to_disk(bs);
07f07615
PB
4928 } else if (bs->drv->bdrv_aio_flush) {
4929 BlockDriverAIOCB *acb;
4930 CoroutineIOCompletion co = {
4931 .coroutine = qemu_coroutine_self(),
4932 };
4933
4934 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
4935 if (acb == NULL) {
29cdb251 4936 ret = -EIO;
07f07615
PB
4937 } else {
4938 qemu_coroutine_yield();
29cdb251 4939 ret = co.ret;
07f07615 4940 }
07f07615
PB
4941 } else {
4942 /*
4943 * Some block drivers always operate in either writethrough or unsafe
4944 * mode and don't support bdrv_flush therefore. Usually qemu doesn't
4945 * know how the server works (because the behaviour is hardcoded or
4946 * depends on server-side configuration), so we can't ensure that
4947 * everything is safe on disk. Returning an error doesn't work because
4948 * that would break guests even if the server operates in writethrough
4949 * mode.
4950 *
4951 * Let's hope the user knows what he's doing.
4952 */
29cdb251 4953 ret = 0;
07f07615 4954 }
29cdb251
PB
4955 if (ret < 0) {
4956 return ret;
4957 }
4958
4959 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
4960 * in the case of cache=unsafe, so there are no useless flushes.
4961 */
d4c82329 4962flush_parent:
29cdb251 4963 return bdrv_co_flush(bs->file);
07f07615
PB
4964}
4965
5a8a30db 4966void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
0f15423c 4967{
5a8a30db
KW
4968 Error *local_err = NULL;
4969 int ret;
4970
3456a8d1
KW
4971 if (!bs->drv) {
4972 return;
4973 }
4974
4975 if (bs->drv->bdrv_invalidate_cache) {
5a8a30db 4976 bs->drv->bdrv_invalidate_cache(bs, &local_err);
3456a8d1 4977 } else if (bs->file) {
5a8a30db
KW
4978 bdrv_invalidate_cache(bs->file, &local_err);
4979 }
4980 if (local_err) {
4981 error_propagate(errp, local_err);
4982 return;
0f15423c 4983 }
3456a8d1 4984
5a8a30db
KW
4985 ret = refresh_total_sectors(bs, bs->total_sectors);
4986 if (ret < 0) {
4987 error_setg_errno(errp, -ret, "Could not refresh total sector count");
4988 return;
4989 }
0f15423c
AL
4990}
4991
5a8a30db 4992void bdrv_invalidate_cache_all(Error **errp)
0f15423c
AL
4993{
4994 BlockDriverState *bs;
5a8a30db 4995 Error *local_err = NULL;
0f15423c 4996
dc364f4c 4997 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
ed78cda3
SH
4998 AioContext *aio_context = bdrv_get_aio_context(bs);
4999
5000 aio_context_acquire(aio_context);
5a8a30db 5001 bdrv_invalidate_cache(bs, &local_err);
ed78cda3 5002 aio_context_release(aio_context);
5a8a30db
KW
5003 if (local_err) {
5004 error_propagate(errp, local_err);
5005 return;
5006 }
0f15423c
AL
5007 }
5008}
5009
07789269
BC
5010void bdrv_clear_incoming_migration_all(void)
5011{
5012 BlockDriverState *bs;
5013
dc364f4c 5014 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
ed78cda3
SH
5015 AioContext *aio_context = bdrv_get_aio_context(bs);
5016
5017 aio_context_acquire(aio_context);
07789269 5018 bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
ed78cda3 5019 aio_context_release(aio_context);
07789269
BC
5020 }
5021}
5022
07f07615
PB
5023int bdrv_flush(BlockDriverState *bs)
5024{
5025 Coroutine *co;
5026 RwCo rwco = {
5027 .bs = bs,
5028 .ret = NOT_DONE,
e7a8a783 5029 };
e7a8a783 5030
07f07615
PB
5031 if (qemu_in_coroutine()) {
5032 /* Fast-path if already in coroutine context */
5033 bdrv_flush_co_entry(&rwco);
5034 } else {
2572b37a
SH
5035 AioContext *aio_context = bdrv_get_aio_context(bs);
5036
07f07615
PB
5037 co = qemu_coroutine_create(bdrv_flush_co_entry);
5038 qemu_coroutine_enter(co, &rwco);
5039 while (rwco.ret == NOT_DONE) {
2572b37a 5040 aio_poll(aio_context, true);
07f07615 5041 }
e7a8a783 5042 }
07f07615
PB
5043
5044 return rwco.ret;
e7a8a783
KW
5045}
5046
775aa8b6
KW
5047typedef struct DiscardCo {
5048 BlockDriverState *bs;
5049 int64_t sector_num;
5050 int nb_sectors;
5051 int ret;
5052} DiscardCo;
4265d620
PB
5053static void coroutine_fn bdrv_discard_co_entry(void *opaque)
5054{
775aa8b6 5055 DiscardCo *rwco = opaque;
4265d620
PB
5056
5057 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
5058}
5059
6f14da52
PL
5060/* if no limit is specified in the BlockLimits use a default
5061 * of 32768 512-byte sectors (16 MiB) per request.
5062 */
5063#define MAX_DISCARD_DEFAULT 32768
5064
4265d620
PB
5065int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
5066 int nb_sectors)
5067{
d51e9fe5
PB
5068 int max_discard;
5069
4265d620
PB
5070 if (!bs->drv) {
5071 return -ENOMEDIUM;
5072 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
5073 return -EIO;
5074 } else if (bs->read_only) {
5075 return -EROFS;
df702c9b
PB
5076 }
5077
e4654d2d 5078 bdrv_reset_dirty(bs, sector_num, nb_sectors);
df702c9b 5079
9e8f1835
PB
5080 /* Do nothing if disabled. */
5081 if (!(bs->open_flags & BDRV_O_UNMAP)) {
5082 return 0;
5083 }
5084
d51e9fe5
PB
5085 if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
5086 return 0;
5087 }
6f14da52 5088
d51e9fe5
PB
5089 max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
5090 while (nb_sectors > 0) {
5091 int ret;
5092 int num = nb_sectors;
6f14da52 5093
d51e9fe5
PB
5094 /* align request */
5095 if (bs->bl.discard_alignment &&
5096 num >= bs->bl.discard_alignment &&
5097 sector_num % bs->bl.discard_alignment) {
5098 if (num > bs->bl.discard_alignment) {
5099 num = bs->bl.discard_alignment;
6f14da52 5100 }
d51e9fe5
PB
5101 num -= sector_num % bs->bl.discard_alignment;
5102 }
6f14da52 5103
d51e9fe5
PB
5104 /* limit request size */
5105 if (num > max_discard) {
5106 num = max_discard;
5107 }
6f14da52 5108
d51e9fe5 5109 if (bs->drv->bdrv_co_discard) {
6f14da52 5110 ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
d51e9fe5
PB
5111 } else {
5112 BlockDriverAIOCB *acb;
5113 CoroutineIOCompletion co = {
5114 .coroutine = qemu_coroutine_self(),
5115 };
5116
5117 acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
5118 bdrv_co_io_em_complete, &co);
5119 if (acb == NULL) {
5120 return -EIO;
5121 } else {
5122 qemu_coroutine_yield();
5123 ret = co.ret;
6f14da52 5124 }
6f14da52 5125 }
7ce21016 5126 if (ret && ret != -ENOTSUP) {
d51e9fe5 5127 return ret;
4265d620 5128 }
d51e9fe5
PB
5129
5130 sector_num += num;
5131 nb_sectors -= num;
4265d620 5132 }
d51e9fe5 5133 return 0;
4265d620
PB
5134}
5135
5136int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
5137{
5138 Coroutine *co;
775aa8b6 5139 DiscardCo rwco = {
4265d620
PB
5140 .bs = bs,
5141 .sector_num = sector_num,
5142 .nb_sectors = nb_sectors,
5143 .ret = NOT_DONE,
5144 };
5145
5146 if (qemu_in_coroutine()) {
5147 /* Fast-path if already in coroutine context */
5148 bdrv_discard_co_entry(&rwco);
5149 } else {
2572b37a
SH
5150 AioContext *aio_context = bdrv_get_aio_context(bs);
5151
4265d620
PB
5152 co = qemu_coroutine_create(bdrv_discard_co_entry);
5153 qemu_coroutine_enter(co, &rwco);
5154 while (rwco.ret == NOT_DONE) {
2572b37a 5155 aio_poll(aio_context, true);
4265d620
PB
5156 }
5157 }
5158
5159 return rwco.ret;
5160}
5161
19cb3738
FB
5162/**************************************************************/
5163/* removable device support */
5164
5165/**
5166 * Return TRUE if the media is present
5167 */
5168int bdrv_is_inserted(BlockDriverState *bs)
5169{
5170 BlockDriver *drv = bs->drv;
a1aff5bf 5171
19cb3738
FB
5172 if (!drv)
5173 return 0;
5174 if (!drv->bdrv_is_inserted)
a1aff5bf
MA
5175 return 1;
5176 return drv->bdrv_is_inserted(bs);
19cb3738
FB
5177}
5178
5179/**
8e49ca46
MA
5180 * Return whether the media changed since the last call to this
5181 * function, or -ENOTSUP if we don't know. Most drivers don't know.
19cb3738
FB
5182 */
5183int bdrv_media_changed(BlockDriverState *bs)
5184{
5185 BlockDriver *drv = bs->drv;
19cb3738 5186
8e49ca46
MA
5187 if (drv && drv->bdrv_media_changed) {
5188 return drv->bdrv_media_changed(bs);
5189 }
5190 return -ENOTSUP;
19cb3738
FB
5191}
5192
5193/**
5194 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
5195 */
f36f3949 5196void bdrv_eject(BlockDriverState *bs, bool eject_flag)
19cb3738
FB
5197{
5198 BlockDriver *drv = bs->drv;
19cb3738 5199
822e1cd1
MA
5200 if (drv && drv->bdrv_eject) {
5201 drv->bdrv_eject(bs, eject_flag);
19cb3738 5202 }
6f382ed2
LC
5203
5204 if (bs->device_name[0] != '\0') {
5205 bdrv_emit_qmp_eject_event(bs, eject_flag);
5206 }
19cb3738
FB
5207}
5208
19cb3738
FB
5209/**
5210 * Lock or unlock the media (if it is locked, the user won't be able
5211 * to eject it manually).
5212 */
025e849a 5213void bdrv_lock_medium(BlockDriverState *bs, bool locked)
19cb3738
FB
5214{
5215 BlockDriver *drv = bs->drv;
5216
025e849a 5217 trace_bdrv_lock_medium(bs, locked);
b8c6d095 5218
025e849a
MA
5219 if (drv && drv->bdrv_lock_medium) {
5220 drv->bdrv_lock_medium(bs, locked);
19cb3738
FB
5221 }
5222}
985a03b0
TS
5223
5224/* needed for generic scsi interface */
5225
5226int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5227{
5228 BlockDriver *drv = bs->drv;
5229
5230 if (drv && drv->bdrv_ioctl)
5231 return drv->bdrv_ioctl(bs, req, buf);
5232 return -ENOTSUP;
5233}
7d780669 5234
221f715d
AL
5235BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5236 unsigned long int req, void *buf,
5237 BlockDriverCompletionFunc *cb, void *opaque)
7d780669 5238{
221f715d 5239 BlockDriver *drv = bs->drv;
7d780669 5240
221f715d
AL
5241 if (drv && drv->bdrv_aio_ioctl)
5242 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5243 return NULL;
7d780669 5244}
e268ca52 5245
1b7fd729 5246void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
7b6f9300 5247{
1b7fd729 5248 bs->guest_block_size = align;
7b6f9300 5249}
7cd1e32a 5250
e268ca52
AL
5251void *qemu_blockalign(BlockDriverState *bs, size_t size)
5252{
339064d5 5253 return qemu_memalign(bdrv_opt_mem_align(bs), size);
e268ca52 5254}
7cd1e32a 5255
c53b1c51
SH
5256/*
5257 * Check if all memory in this vector is sector aligned.
5258 */
5259bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5260{
5261 int i;
339064d5 5262 size_t alignment = bdrv_opt_mem_align(bs);
c53b1c51
SH
5263
5264 for (i = 0; i < qiov->niov; i++) {
339064d5 5265 if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
c53b1c51 5266 return false;
1ff735bd 5267 }
339064d5 5268 if (qiov->iov[i].iov_len % alignment) {
1ff735bd 5269 return false;
c53b1c51
SH
5270 }
5271 }
5272
5273 return true;
5274}
5275
b8afb520
FZ
5276BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
5277 Error **errp)
7cd1e32a
LS
5278{
5279 int64_t bitmap_size;
e4654d2d 5280 BdrvDirtyBitmap *bitmap;
a55eb92c 5281
50717e94
PB
5282 assert((granularity & (granularity - 1)) == 0);
5283
e4654d2d
FZ
5284 granularity >>= BDRV_SECTOR_BITS;
5285 assert(granularity);
b8afb520
FZ
5286 bitmap_size = bdrv_getlength(bs);
5287 if (bitmap_size < 0) {
5288 error_setg_errno(errp, -bitmap_size, "could not get length of device");
5289 errno = -bitmap_size;
5290 return NULL;
5291 }
5292 bitmap_size >>= BDRV_SECTOR_BITS;
e4654d2d
FZ
5293 bitmap = g_malloc0(sizeof(BdrvDirtyBitmap));
5294 bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
5295 QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
5296 return bitmap;
5297}
5298
5299void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5300{
5301 BdrvDirtyBitmap *bm, *next;
5302 QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5303 if (bm == bitmap) {
5304 QLIST_REMOVE(bitmap, list);
5305 hbitmap_free(bitmap->bitmap);
5306 g_free(bitmap);
5307 return;
a55eb92c 5308 }
7cd1e32a
LS
5309 }
5310}
5311
21b56835
FZ
5312BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5313{
5314 BdrvDirtyBitmap *bm;
5315 BlockDirtyInfoList *list = NULL;
5316 BlockDirtyInfoList **plist = &list;
5317
5318 QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5319 BlockDirtyInfo *info = g_malloc0(sizeof(BlockDirtyInfo));
5320 BlockDirtyInfoList *entry = g_malloc0(sizeof(BlockDirtyInfoList));
5321 info->count = bdrv_get_dirty_count(bs, bm);
5322 info->granularity =
5323 ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5324 entry->value = info;
5325 *plist = entry;
5326 plist = &entry->next;
5327 }
5328
5329 return list;
5330}
5331
e4654d2d 5332int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
7cd1e32a 5333{
e4654d2d
FZ
5334 if (bitmap) {
5335 return hbitmap_get(bitmap->bitmap, sector);
7cd1e32a
LS
5336 } else {
5337 return 0;
5338 }
5339}
5340
e4654d2d
FZ
5341void bdrv_dirty_iter_init(BlockDriverState *bs,
5342 BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
1755da16 5343{
e4654d2d 5344 hbitmap_iter_init(hbi, bitmap->bitmap, 0);
1755da16
PB
5345}
5346
5347void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5348 int nr_sectors)
5349{
e4654d2d
FZ
5350 BdrvDirtyBitmap *bitmap;
5351 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5352 hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5353 }
1755da16
PB
5354}
5355
e4654d2d 5356void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
7cd1e32a 5357{
e4654d2d
FZ
5358 BdrvDirtyBitmap *bitmap;
5359 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5360 hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5361 }
7cd1e32a 5362}
aaa0eb75 5363
e4654d2d 5364int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
aaa0eb75 5365{
e4654d2d 5366 return hbitmap_count(bitmap->bitmap);
aaa0eb75 5367}
f88e1a42 5368
9fcb0251
FZ
5369/* Get a reference to bs */
5370void bdrv_ref(BlockDriverState *bs)
5371{
5372 bs->refcnt++;
5373}
5374
5375/* Release a previously grabbed reference to bs.
5376 * If after releasing, reference count is zero, the BlockDriverState is
5377 * deleted. */
5378void bdrv_unref(BlockDriverState *bs)
5379{
5380 assert(bs->refcnt > 0);
5381 if (--bs->refcnt == 0) {
5382 bdrv_delete(bs);
5383 }
5384}
5385
fbe40ff7
FZ
5386struct BdrvOpBlocker {
5387 Error *reason;
5388 QLIST_ENTRY(BdrvOpBlocker) list;
5389};
5390
5391bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
5392{
5393 BdrvOpBlocker *blocker;
5394 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5395 if (!QLIST_EMPTY(&bs->op_blockers[op])) {
5396 blocker = QLIST_FIRST(&bs->op_blockers[op]);
5397 if (errp) {
5398 error_setg(errp, "Device '%s' is busy: %s",
5399 bs->device_name, error_get_pretty(blocker->reason));
5400 }
5401 return true;
5402 }
5403 return false;
5404}
5405
5406void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
5407{
5408 BdrvOpBlocker *blocker;
5409 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5410
5411 blocker = g_malloc0(sizeof(BdrvOpBlocker));
5412 blocker->reason = reason;
5413 QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
5414}
5415
5416void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
5417{
5418 BdrvOpBlocker *blocker, *next;
5419 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5420 QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
5421 if (blocker->reason == reason) {
5422 QLIST_REMOVE(blocker, list);
5423 g_free(blocker);
5424 }
5425 }
5426}
5427
5428void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
5429{
5430 int i;
5431 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5432 bdrv_op_block(bs, i, reason);
5433 }
5434}
5435
5436void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
5437{
5438 int i;
5439 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5440 bdrv_op_unblock(bs, i, reason);
5441 }
5442}
5443
5444bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
5445{
5446 int i;
5447
5448 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5449 if (!QLIST_EMPTY(&bs->op_blockers[i])) {
5450 return false;
5451 }
5452 }
5453 return true;
5454}
5455
28a7282a
LC
5456void bdrv_iostatus_enable(BlockDriverState *bs)
5457{
d6bf279e 5458 bs->iostatus_enabled = true;
58e21ef5 5459 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
28a7282a
LC
5460}
5461
5462/* The I/O status is only enabled if the drive explicitly
5463 * enables it _and_ the VM is configured to stop on errors */
5464bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5465{
d6bf279e 5466 return (bs->iostatus_enabled &&
92aa5c6d
PB
5467 (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5468 bs->on_write_error == BLOCKDEV_ON_ERROR_STOP ||
5469 bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
28a7282a
LC
5470}
5471
5472void bdrv_iostatus_disable(BlockDriverState *bs)
5473{
d6bf279e 5474 bs->iostatus_enabled = false;
28a7282a
LC
5475}
5476
5477void bdrv_iostatus_reset(BlockDriverState *bs)
5478{
5479 if (bdrv_iostatus_is_enabled(bs)) {
58e21ef5 5480 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
3bd293c3
PB
5481 if (bs->job) {
5482 block_job_iostatus_reset(bs->job);
5483 }
28a7282a
LC
5484 }
5485}
5486
28a7282a
LC
5487void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5488{
3e1caa5f
PB
5489 assert(bdrv_iostatus_is_enabled(bs));
5490 if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
58e21ef5
LC
5491 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5492 BLOCK_DEVICE_IO_STATUS_FAILED;
28a7282a
LC
5493 }
5494}
5495
a597e79c
CH
5496void
5497bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
5498 enum BlockAcctType type)
5499{
5500 assert(type < BDRV_MAX_IOTYPE);
5501
5502 cookie->bytes = bytes;
c488c7f6 5503 cookie->start_time_ns = get_clock();
a597e79c
CH
5504 cookie->type = type;
5505}
5506
5507void
5508bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
5509{
5510 assert(cookie->type < BDRV_MAX_IOTYPE);
5511
5512 bs->nr_bytes[cookie->type] += cookie->bytes;
5513 bs->nr_ops[cookie->type]++;
c488c7f6 5514 bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
a597e79c
CH
5515}
5516
d92ada22
LC
5517void bdrv_img_create(const char *filename, const char *fmt,
5518 const char *base_filename, const char *base_fmt,
f382d43a
MR
5519 char *options, uint64_t img_size, int flags,
5520 Error **errp, bool quiet)
f88e1a42
JS
5521{
5522 QEMUOptionParameter *param = NULL, *create_options = NULL;
d220894e 5523 QEMUOptionParameter *backing_fmt, *backing_file, *size;
f88e1a42 5524 BlockDriver *drv, *proto_drv;
96df67d1 5525 BlockDriver *backing_drv = NULL;
cc84d90f 5526 Error *local_err = NULL;
f88e1a42
JS
5527 int ret = 0;
5528
5529 /* Find driver and parse its options */
5530 drv = bdrv_find_format(fmt);
5531 if (!drv) {
71c79813 5532 error_setg(errp, "Unknown file format '%s'", fmt);
d92ada22 5533 return;
f88e1a42
JS
5534 }
5535
98289620 5536 proto_drv = bdrv_find_protocol(filename, true);
f88e1a42 5537 if (!proto_drv) {
71c79813 5538 error_setg(errp, "Unknown protocol '%s'", filename);
d92ada22 5539 return;
f88e1a42
JS
5540 }
5541
5542 create_options = append_option_parameters(create_options,
5543 drv->create_options);
5544 create_options = append_option_parameters(create_options,
5545 proto_drv->create_options);
5546
5547 /* Create parameter list with default values */
5548 param = parse_option_parameters("", create_options, param);
5549
5550 set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
5551
5552 /* Parse -o options */
5553 if (options) {
5554 param = parse_option_parameters(options, create_options, param);
5555 if (param == NULL) {
71c79813 5556 error_setg(errp, "Invalid options for file format '%s'.", fmt);
f88e1a42
JS
5557 goto out;
5558 }
5559 }
5560
5561 if (base_filename) {
5562 if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
5563 base_filename)) {
71c79813
LC
5564 error_setg(errp, "Backing file not supported for file format '%s'",
5565 fmt);
f88e1a42
JS
5566 goto out;
5567 }
5568 }
5569
5570 if (base_fmt) {
5571 if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
71c79813
LC
5572 error_setg(errp, "Backing file format not supported for file "
5573 "format '%s'", fmt);
f88e1a42
JS
5574 goto out;
5575 }
5576 }
5577
792da93a
JS
5578 backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
5579 if (backing_file && backing_file->value.s) {
5580 if (!strcmp(filename, backing_file->value.s)) {
71c79813
LC
5581 error_setg(errp, "Error: Trying to create an image with the "
5582 "same filename as the backing file");
792da93a
JS
5583 goto out;
5584 }
5585 }
5586
f88e1a42
JS
5587 backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
5588 if (backing_fmt && backing_fmt->value.s) {
96df67d1
SH
5589 backing_drv = bdrv_find_format(backing_fmt->value.s);
5590 if (!backing_drv) {
71c79813
LC
5591 error_setg(errp, "Unknown backing file format '%s'",
5592 backing_fmt->value.s);
f88e1a42
JS
5593 goto out;
5594 }
5595 }
5596
5597 // The size for the image must always be specified, with one exception:
5598 // If we are using a backing file, we can obtain the size from there
d220894e
KW
5599 size = get_option_parameter(param, BLOCK_OPT_SIZE);
5600 if (size && size->value.n == -1) {
f88e1a42 5601 if (backing_file && backing_file->value.s) {
66f6b814 5602 BlockDriverState *bs;
f88e1a42 5603 uint64_t size;
f88e1a42 5604 char buf[32];
63090dac
PB
5605 int back_flags;
5606
5607 /* backing files always opened read-only */
5608 back_flags =
5609 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
f88e1a42 5610
f67503e5 5611 bs = NULL;
ddf5636d 5612 ret = bdrv_open(&bs, backing_file->value.s, NULL, NULL, back_flags,
cc84d90f 5613 backing_drv, &local_err);
f88e1a42 5614 if (ret < 0) {
cc84d90f
HR
5615 error_setg_errno(errp, -ret, "Could not open '%s': %s",
5616 backing_file->value.s,
5617 error_get_pretty(local_err));
5618 error_free(local_err);
5619 local_err = NULL;
f88e1a42
JS
5620 goto out;
5621 }
5622 bdrv_get_geometry(bs, &size);
5623 size *= 512;
5624
5625 snprintf(buf, sizeof(buf), "%" PRId64, size);
5626 set_option_parameter(param, BLOCK_OPT_SIZE, buf);
66f6b814
HR
5627
5628 bdrv_unref(bs);
f88e1a42 5629 } else {
71c79813 5630 error_setg(errp, "Image creation needs a size parameter");
f88e1a42
JS
5631 goto out;
5632 }
5633 }
5634
f382d43a
MR
5635 if (!quiet) {
5636 printf("Formatting '%s', fmt=%s ", filename, fmt);
5637 print_option_parameters(param);
5638 puts("");
5639 }
cc84d90f
HR
5640 ret = bdrv_create(drv, filename, param, &local_err);
5641 if (ret == -EFBIG) {
5642 /* This is generally a better message than whatever the driver would
5643 * deliver (especially because of the cluster_size_hint), since that
5644 * is most probably not much different from "image too large". */
5645 const char *cluster_size_hint = "";
5646 if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) {
5647 cluster_size_hint = " (try using a larger cluster size)";
f88e1a42 5648 }
cc84d90f
HR
5649 error_setg(errp, "The image size is too large for file format '%s'"
5650 "%s", fmt, cluster_size_hint);
5651 error_free(local_err);
5652 local_err = NULL;
f88e1a42
JS
5653 }
5654
5655out:
5656 free_option_parameters(create_options);
5657 free_option_parameters(param);
5658
84d18f06 5659 if (local_err) {
cc84d90f
HR
5660 error_propagate(errp, local_err);
5661 }
f88e1a42 5662}
85d126f3
SH
5663
5664AioContext *bdrv_get_aio_context(BlockDriverState *bs)
5665{
dcd04228
SH
5666 return bs->aio_context;
5667}
5668
5669void bdrv_detach_aio_context(BlockDriverState *bs)
5670{
5671 if (!bs->drv) {
5672 return;
5673 }
5674
13af91eb
SH
5675 if (bs->io_limits_enabled) {
5676 throttle_detach_aio_context(&bs->throttle_state);
5677 }
dcd04228
SH
5678 if (bs->drv->bdrv_detach_aio_context) {
5679 bs->drv->bdrv_detach_aio_context(bs);
5680 }
5681 if (bs->file) {
5682 bdrv_detach_aio_context(bs->file);
5683 }
5684 if (bs->backing_hd) {
5685 bdrv_detach_aio_context(bs->backing_hd);
5686 }
5687
5688 bs->aio_context = NULL;
5689}
5690
5691void bdrv_attach_aio_context(BlockDriverState *bs,
5692 AioContext *new_context)
5693{
5694 if (!bs->drv) {
5695 return;
5696 }
5697
5698 bs->aio_context = new_context;
5699
5700 if (bs->backing_hd) {
5701 bdrv_attach_aio_context(bs->backing_hd, new_context);
5702 }
5703 if (bs->file) {
5704 bdrv_attach_aio_context(bs->file, new_context);
5705 }
5706 if (bs->drv->bdrv_attach_aio_context) {
5707 bs->drv->bdrv_attach_aio_context(bs, new_context);
5708 }
13af91eb
SH
5709 if (bs->io_limits_enabled) {
5710 throttle_attach_aio_context(&bs->throttle_state, new_context);
5711 }
dcd04228
SH
5712}
5713
5714void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
5715{
5716 bdrv_drain_all(); /* ensure there are no in-flight requests */
5717
5718 bdrv_detach_aio_context(bs);
5719
5720 /* This function executes in the old AioContext so acquire the new one in
5721 * case it runs in a different thread.
5722 */
5723 aio_context_acquire(new_context);
5724 bdrv_attach_aio_context(bs, new_context);
5725 aio_context_release(new_context);
85d126f3 5726}
d616b224
SH
5727
5728void bdrv_add_before_write_notifier(BlockDriverState *bs,
5729 NotifierWithReturn *notifier)
5730{
5731 notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
5732}
6f176b48
HR
5733
5734int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options)
5735{
5736 if (bs->drv->bdrv_amend_options == NULL) {
5737 return -ENOTSUP;
5738 }
5739 return bs->drv->bdrv_amend_options(bs, options);
5740}
f6186f49 5741
b5042a36
BC
5742/* This function will be called by the bdrv_recurse_is_first_non_filter method
5743 * of block filter and by bdrv_is_first_non_filter.
5744 * It is used to test if the given bs is the candidate or recurse more in the
5745 * node graph.
212a5a8f 5746 */
b5042a36 5747bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
212a5a8f 5748 BlockDriverState *candidate)
f6186f49 5749{
b5042a36
BC
5750 /* return false if basic checks fails */
5751 if (!bs || !bs->drv) {
212a5a8f 5752 return false;
f6186f49
BC
5753 }
5754
b5042a36
BC
5755 /* the code reached a non block filter driver -> check if the bs is
5756 * the same as the candidate. It's the recursion termination condition.
5757 */
5758 if (!bs->drv->is_filter) {
5759 return bs == candidate;
212a5a8f 5760 }
b5042a36 5761 /* Down this path the driver is a block filter driver */
212a5a8f 5762
b5042a36
BC
5763 /* If the block filter recursion method is defined use it to recurse down
5764 * the node graph.
5765 */
5766 if (bs->drv->bdrv_recurse_is_first_non_filter) {
212a5a8f 5767 return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
f6186f49
BC
5768 }
5769
b5042a36
BC
5770 /* the driver is a block filter but don't allow to recurse -> return false
5771 */
5772 return false;
f6186f49
BC
5773}
5774
212a5a8f
BC
5775/* This function checks if the candidate is the first non filter bs down it's
5776 * bs chain. Since we don't have pointers to parents it explore all bs chains
5777 * from the top. Some filters can choose not to pass down the recursion.
5778 */
5779bool bdrv_is_first_non_filter(BlockDriverState *candidate)
f6186f49 5780{
212a5a8f
BC
5781 BlockDriverState *bs;
5782
5783 /* walk down the bs forest recursively */
5784 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5785 bool perm;
5786
b5042a36 5787 /* try to recurse in this top level bs */
e6dc8a1f 5788 perm = bdrv_recurse_is_first_non_filter(bs, candidate);
212a5a8f
BC
5789
5790 /* candidate is the first non filter */
5791 if (perm) {
5792 return true;
5793 }
5794 }
5795
5796 return false;
f6186f49 5797}