]> git.proxmox.com Git - mirror_qemu.git/blame - block.c
block: Add bitmap disabled status
[mirror_qemu.git] / block.c
CommitLineData
fc01f7e7
FB
1/*
2 * QEMU System Emulator block driver
5fafdf24 3 *
fc01f7e7 4 * Copyright (c) 2003 Fabrice Bellard
5fafdf24 5 *
fc01f7e7
FB
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
3990d09a 24#include "config-host.h"
faf07963 25#include "qemu-common.h"
6d519a5f 26#include "trace.h"
737e150e
PB
27#include "block/block_int.h"
28#include "block/blockjob.h"
1de7afc9 29#include "qemu/module.h"
7b1b5d19 30#include "qapi/qmp/qjson.h"
bfb197e0 31#include "sysemu/block-backend.h"
9c17d615 32#include "sysemu/sysemu.h"
de50a20a 33#include "sysemu/qtest.h"
1de7afc9 34#include "qemu/notify.h"
737e150e 35#include "block/coroutine.h"
c13163fb 36#include "block/qapi.h"
b2023818 37#include "qmp-commands.h"
1de7afc9 38#include "qemu/timer.h"
a5ee7bd4 39#include "qapi-event.h"
fc01f7e7 40
71e72a19 41#ifdef CONFIG_BSD
7674e7bf
FB
42#include <sys/types.h>
43#include <sys/stat.h>
44#include <sys/ioctl.h>
72cf2d4f 45#include <sys/queue.h>
c5e97233 46#ifndef __DragonFly__
7674e7bf
FB
47#include <sys/disk.h>
48#endif
c5e97233 49#endif
7674e7bf 50
49dc768d
AL
51#ifdef _WIN32
52#include <windows.h>
53#endif
54
e4654d2d
FZ
/* A dirty bitmap tracks which sectors of a BlockDriverState have been
 * written since the bitmap was created or last reset. */
struct BdrvDirtyBitmap {
    HBitmap *bitmap;                    /* backing bit store, one bit per chunk */
    char *name;                         /* optional user-visible name, may be NULL */
    bool disabled;                      /* true: writes are not recorded */
    QLIST_ENTRY(BdrvDirtyBitmap) list;  /* entry in bs->dirty_bitmaps */
};
61
1c9805a3
SH
62#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
63
7c84b1b8 64static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
f141eafe 65 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
097310b5 66 BlockCompletionFunc *cb, void *opaque);
7c84b1b8 67static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
f141eafe 68 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
097310b5 69 BlockCompletionFunc *cb, void *opaque);
f9f05dc5
KW
70static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
71 int64_t sector_num, int nb_sectors,
72 QEMUIOVector *iov);
73static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
74 int64_t sector_num, int nb_sectors,
75 QEMUIOVector *iov);
775aa8b6
KW
76static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
77 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
470c0504 78 BdrvRequestFlags flags);
775aa8b6
KW
79static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
80 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
f08f2dda 81 BdrvRequestFlags flags);
7c84b1b8
MA
82static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
83 int64_t sector_num,
84 QEMUIOVector *qiov,
85 int nb_sectors,
86 BdrvRequestFlags flags,
097310b5 87 BlockCompletionFunc *cb,
7c84b1b8
MA
88 void *opaque,
89 bool is_write);
b2a61371 90static void coroutine_fn bdrv_co_do_rw(void *opaque);
621f0589 91static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
aa7bfbff 92 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
ec530c81 93
1b7bdbc1
SH
94static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
95 QTAILQ_HEAD_INITIALIZER(bdrv_states);
7ee930d0 96
dc364f4c
BC
97static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
98 QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);
99
8a22f02a
SH
100static QLIST_HEAD(, BlockDriver) bdrv_drivers =
101 QLIST_HEAD_INITIALIZER(bdrv_drivers);
ea2384d3 102
c4237dfa
VSO
103static void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
104 int nr_sectors);
105static void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
106 int nr_sectors);
eb852011
MA
107/* If non-zero, use only whitelisted block drivers */
108static int use_bdrv_whitelist;
109
9e0b22f4
SH
110#ifdef _WIN32
/* Return non-zero if @filename starts with a drive designator such as "c:". */
static int is_windows_drive_prefix(const char *filename)
{
    char c = filename[0];
    int is_letter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');

    return is_letter && filename[1] == ':';
}
117
118int is_windows_drive(const char *filename)
119{
120 if (is_windows_drive_prefix(filename) &&
121 filename[2] == '\0')
122 return 1;
123 if (strstart(filename, "\\\\.\\", NULL) ||
124 strstart(filename, "//./", NULL))
125 return 1;
126 return 0;
127}
128#endif
129
0563e191 130/* throttling disk I/O limits */
cc0681c4
BC
131void bdrv_set_io_limits(BlockDriverState *bs,
132 ThrottleConfig *cfg)
98f90dba 133{
cc0681c4 134 int i;
98f90dba 135
cc0681c4 136 throttle_config(&bs->throttle_state, cfg);
98f90dba 137
cc0681c4
BC
138 for (i = 0; i < 2; i++) {
139 qemu_co_enter_next(&bs->throttled_reqs[i]);
98f90dba 140 }
cc0681c4
BC
141}
142
143/* this function drain all the throttled IOs */
144static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
145{
146 bool drained = false;
147 bool enabled = bs->io_limits_enabled;
148 int i;
149
150 bs->io_limits_enabled = false;
151
152 for (i = 0; i < 2; i++) {
153 while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
154 drained = true;
155 }
156 }
157
158 bs->io_limits_enabled = enabled;
98f90dba 159
cc0681c4 160 return drained;
98f90dba
ZYW
161}
162
cc0681c4 163void bdrv_io_limits_disable(BlockDriverState *bs)
0563e191 164{
cc0681c4 165 bs->io_limits_enabled = false;
0563e191 166
cc0681c4
BC
167 bdrv_start_throttled_reqs(bs);
168
169 throttle_destroy(&bs->throttle_state);
0563e191
ZYW
170}
171
cc0681c4 172static void bdrv_throttle_read_timer_cb(void *opaque)
0563e191 173{
cc0681c4
BC
174 BlockDriverState *bs = opaque;
175 qemu_co_enter_next(&bs->throttled_reqs[0]);
0563e191
ZYW
176}
177
cc0681c4 178static void bdrv_throttle_write_timer_cb(void *opaque)
0563e191 179{
cc0681c4
BC
180 BlockDriverState *bs = opaque;
181 qemu_co_enter_next(&bs->throttled_reqs[1]);
0563e191
ZYW
182}
183
cc0681c4
BC
184/* should be called before bdrv_set_io_limits if a limit is set */
185void bdrv_io_limits_enable(BlockDriverState *bs)
186{
de50a20a
FZ
187 int clock_type = QEMU_CLOCK_REALTIME;
188
189 if (qtest_enabled()) {
190 /* For testing block IO throttling only */
191 clock_type = QEMU_CLOCK_VIRTUAL;
192 }
cc0681c4
BC
193 assert(!bs->io_limits_enabled);
194 throttle_init(&bs->throttle_state,
13af91eb 195 bdrv_get_aio_context(bs),
de50a20a 196 clock_type,
cc0681c4
BC
197 bdrv_throttle_read_timer_cb,
198 bdrv_throttle_write_timer_cb,
199 bs);
200 bs->io_limits_enabled = true;
201}
202
203/* This function makes an IO wait if needed
204 *
205 * @nb_sectors: the number of sectors of the IO
206 * @is_write: is the IO a write
207 */
98f90dba 208static void bdrv_io_limits_intercept(BlockDriverState *bs,
d5103588 209 unsigned int bytes,
cc0681c4 210 bool is_write)
98f90dba 211{
cc0681c4
BC
212 /* does this io must wait */
213 bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
98f90dba 214
cc0681c4
BC
215 /* if must wait or any request of this type throttled queue the IO */
216 if (must_wait ||
217 !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
218 qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
98f90dba
ZYW
219 }
220
cc0681c4 221 /* the IO will be executed, do the accounting */
d5103588
KW
222 throttle_account(&bs->throttle_state, is_write, bytes);
223
98f90dba 224
cc0681c4
BC
225 /* if the next request must wait -> do nothing */
226 if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
227 return;
98f90dba
ZYW
228 }
229
cc0681c4
BC
230 /* else queue next request for execution */
231 qemu_co_queue_next(&bs->throttled_reqs[is_write]);
98f90dba
ZYW
232}
233
339064d5
KW
234size_t bdrv_opt_mem_align(BlockDriverState *bs)
235{
236 if (!bs || !bs->drv) {
237 /* 4k should be on the safe side */
238 return 4096;
239 }
240
241 return bs->bl.opt_mem_alignment;
242}
243
/* Return non-zero if @path starts with a "<protocol>:" prefix.  A colon that
 * appears after a path separator (or, on Windows, a drive letter) does not
 * count as a protocol. */
int path_has_protocol(const char *path)
{
    const char *colon;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    colon = path + strcspn(path, ":/\\");
#else
    colon = path + strcspn(path, ":/");
#endif

    /* protocol iff the first special character found is ':' */
    return *colon == ':';
}
261
/* Return non-zero if @path is absolute (host-OS rules). */
int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return *path == '/' || *path == '\\';
#else
    return *path == '/';
#endif
}
274
/* If @filename is absolute, copy it into @dest.  Otherwise build a path to it
 * relative to @base_path (the directory part of @base_path, after any
 * "protocol:" prefix, is prepended).  URLs are supported.  The result is
 * truncated to @dest_size; nothing is written when @dest_size <= 0.
 *
 * Fix: braces added to all single-statement if/else bodies per QEMU coding
 * style (avoids dangling-else hazards); logic unchanged. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0) {
        return;
    }
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        /* skip an optional "protocol:" prefix in base_path */
        p = strchr(base_path, ':');
        if (p) {
            p++;
        } else {
            p = base_path;
        }
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            /* on Windows a backslash may be the last separator instead */
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1) {
                p1 = p2;
            }
        }
#endif
        if (p1) {
            p1++;
        } else {
            p1 = base_path;
        }
        if (p1 > p) {
            p = p1;
        }
        len = p - base_path;
        if (len > dest_size - 1) {
            len = dest_size - 1;
        }
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
318
0a82855a
HR
319void bdrv_get_full_backing_filename_from_filename(const char *backed,
320 const char *backing,
9f07429e
HR
321 char *dest, size_t sz,
322 Error **errp)
dc5a1371 323{
9f07429e
HR
324 if (backing[0] == '\0' || path_has_protocol(backing) ||
325 path_is_absolute(backing))
326 {
0a82855a 327 pstrcpy(dest, sz, backing);
9f07429e
HR
328 } else if (backed[0] == '\0' || strstart(backed, "json:", NULL)) {
329 error_setg(errp, "Cannot use relative backing file names for '%s'",
330 backed);
dc5a1371 331 } else {
0a82855a 332 path_combine(dest, sz, backed, backing);
dc5a1371
PB
333 }
334}
335
9f07429e
HR
336void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz,
337 Error **errp)
0a82855a 338{
9f07429e
HR
339 char *backed = bs->exact_filename[0] ? bs->exact_filename : bs->filename;
340
341 bdrv_get_full_backing_filename_from_filename(backed, bs->backing_file,
342 dest, sz, errp);
0a82855a
HR
343}
344
5efa9d5a 345void bdrv_register(BlockDriver *bdrv)
ea2384d3 346{
8c5873d6
SH
347 /* Block drivers without coroutine functions need emulation */
348 if (!bdrv->bdrv_co_readv) {
f9f05dc5
KW
349 bdrv->bdrv_co_readv = bdrv_co_readv_em;
350 bdrv->bdrv_co_writev = bdrv_co_writev_em;
351
f8c35c1d
SH
352 /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
353 * the block driver lacks aio we need to emulate that too.
354 */
f9f05dc5
KW
355 if (!bdrv->bdrv_aio_readv) {
356 /* add AIO emulation layer */
357 bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
358 bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
f9f05dc5 359 }
83f64091 360 }
b2e12bc6 361
8a22f02a 362 QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
ea2384d3 363}
b338082b 364
7f06d47e 365BlockDriverState *bdrv_new_root(void)
b338082b 366{
7f06d47e 367 BlockDriverState *bs = bdrv_new();
e4e9986b 368
e4e9986b 369 QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
e4e9986b
MA
370 return bs;
371}
372
373BlockDriverState *bdrv_new(void)
374{
375 BlockDriverState *bs;
376 int i;
377
5839e53b 378 bs = g_new0(BlockDriverState, 1);
e4654d2d 379 QLIST_INIT(&bs->dirty_bitmaps);
fbe40ff7
FZ
380 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
381 QLIST_INIT(&bs->op_blockers[i]);
382 }
28a7282a 383 bdrv_iostatus_disable(bs);
d7d512f6 384 notifier_list_init(&bs->close_notifiers);
d616b224 385 notifier_with_return_list_init(&bs->before_write_notifiers);
cc0681c4
BC
386 qemu_co_queue_init(&bs->throttled_reqs[0]);
387 qemu_co_queue_init(&bs->throttled_reqs[1]);
9fcb0251 388 bs->refcnt = 1;
dcd04228 389 bs->aio_context = qemu_get_aio_context();
d7d512f6 390
b338082b
FB
391 return bs;
392}
393
d7d512f6
PB
394void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
395{
396 notifier_list_add(&bs->close_notifiers, notify);
397}
398
ea2384d3
FB
399BlockDriver *bdrv_find_format(const char *format_name)
400{
401 BlockDriver *drv1;
8a22f02a
SH
402 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
403 if (!strcmp(drv1->format_name, format_name)) {
ea2384d3 404 return drv1;
8a22f02a 405 }
ea2384d3
FB
406 }
407 return NULL;
408}
409
b64ec4e4 410static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
eb852011 411{
b64ec4e4
FZ
412 static const char *whitelist_rw[] = {
413 CONFIG_BDRV_RW_WHITELIST
414 };
415 static const char *whitelist_ro[] = {
416 CONFIG_BDRV_RO_WHITELIST
eb852011
MA
417 };
418 const char **p;
419
b64ec4e4 420 if (!whitelist_rw[0] && !whitelist_ro[0]) {
eb852011 421 return 1; /* no whitelist, anything goes */
b64ec4e4 422 }
eb852011 423
b64ec4e4 424 for (p = whitelist_rw; *p; p++) {
eb852011
MA
425 if (!strcmp(drv->format_name, *p)) {
426 return 1;
427 }
428 }
b64ec4e4
FZ
429 if (read_only) {
430 for (p = whitelist_ro; *p; p++) {
431 if (!strcmp(drv->format_name, *p)) {
432 return 1;
433 }
434 }
435 }
eb852011
MA
436 return 0;
437}
438
b64ec4e4
FZ
439BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
440 bool read_only)
eb852011
MA
441{
442 BlockDriver *drv = bdrv_find_format(format_name);
b64ec4e4 443 return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
eb852011
MA
444}
445
5b7e1542
ZYW
446typedef struct CreateCo {
447 BlockDriver *drv;
448 char *filename;
83d0521a 449 QemuOpts *opts;
5b7e1542 450 int ret;
cc84d90f 451 Error *err;
5b7e1542
ZYW
452} CreateCo;
453
454static void coroutine_fn bdrv_create_co_entry(void *opaque)
455{
cc84d90f
HR
456 Error *local_err = NULL;
457 int ret;
458
5b7e1542
ZYW
459 CreateCo *cco = opaque;
460 assert(cco->drv);
461
c282e1fd 462 ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err);
84d18f06 463 if (local_err) {
cc84d90f
HR
464 error_propagate(&cco->err, local_err);
465 }
466 cco->ret = ret;
5b7e1542
ZYW
467}
468
0e7e1989 469int bdrv_create(BlockDriver *drv, const char* filename,
83d0521a 470 QemuOpts *opts, Error **errp)
ea2384d3 471{
5b7e1542
ZYW
472 int ret;
473
474 Coroutine *co;
475 CreateCo cco = {
476 .drv = drv,
477 .filename = g_strdup(filename),
83d0521a 478 .opts = opts,
5b7e1542 479 .ret = NOT_DONE,
cc84d90f 480 .err = NULL,
5b7e1542
ZYW
481 };
482
c282e1fd 483 if (!drv->bdrv_create) {
cc84d90f 484 error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
80168bff
LC
485 ret = -ENOTSUP;
486 goto out;
5b7e1542
ZYW
487 }
488
489 if (qemu_in_coroutine()) {
490 /* Fast-path if already in coroutine context */
491 bdrv_create_co_entry(&cco);
492 } else {
493 co = qemu_coroutine_create(bdrv_create_co_entry);
494 qemu_coroutine_enter(co, &cco);
495 while (cco.ret == NOT_DONE) {
b47ec2c4 496 aio_poll(qemu_get_aio_context(), true);
5b7e1542
ZYW
497 }
498 }
499
500 ret = cco.ret;
cc84d90f 501 if (ret < 0) {
84d18f06 502 if (cco.err) {
cc84d90f
HR
503 error_propagate(errp, cco.err);
504 } else {
505 error_setg_errno(errp, -ret, "Could not create image");
506 }
507 }
0e7e1989 508
80168bff
LC
509out:
510 g_free(cco.filename);
5b7e1542 511 return ret;
ea2384d3
FB
512}
513
c282e1fd 514int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
84a12e66
CH
515{
516 BlockDriver *drv;
cc84d90f
HR
517 Error *local_err = NULL;
518 int ret;
84a12e66 519
b65a5e12 520 drv = bdrv_find_protocol(filename, true, errp);
84a12e66 521 if (drv == NULL) {
16905d71 522 return -ENOENT;
84a12e66
CH
523 }
524
c282e1fd 525 ret = bdrv_create(drv, filename, opts, &local_err);
84d18f06 526 if (local_err) {
cc84d90f
HR
527 error_propagate(errp, local_err);
528 }
529 return ret;
84a12e66
CH
530}
531
3baca891 532void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
d34682cd
KW
533{
534 BlockDriver *drv = bs->drv;
3baca891 535 Error *local_err = NULL;
d34682cd
KW
536
537 memset(&bs->bl, 0, sizeof(bs->bl));
538
466ad822 539 if (!drv) {
3baca891 540 return;
466ad822
KW
541 }
542
543 /* Take some limits from the children as a default */
544 if (bs->file) {
3baca891
KW
545 bdrv_refresh_limits(bs->file, &local_err);
546 if (local_err) {
547 error_propagate(errp, local_err);
548 return;
549 }
466ad822 550 bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
2647fab5 551 bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
339064d5
KW
552 bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
553 } else {
554 bs->bl.opt_mem_alignment = 512;
466ad822
KW
555 }
556
557 if (bs->backing_hd) {
3baca891
KW
558 bdrv_refresh_limits(bs->backing_hd, &local_err);
559 if (local_err) {
560 error_propagate(errp, local_err);
561 return;
562 }
466ad822
KW
563 bs->bl.opt_transfer_length =
564 MAX(bs->bl.opt_transfer_length,
565 bs->backing_hd->bl.opt_transfer_length);
2647fab5
PL
566 bs->bl.max_transfer_length =
567 MIN_NON_ZERO(bs->bl.max_transfer_length,
568 bs->backing_hd->bl.max_transfer_length);
339064d5
KW
569 bs->bl.opt_mem_alignment =
570 MAX(bs->bl.opt_mem_alignment,
571 bs->backing_hd->bl.opt_mem_alignment);
466ad822
KW
572 }
573
574 /* Then let the driver override it */
575 if (drv->bdrv_refresh_limits) {
3baca891 576 drv->bdrv_refresh_limits(bs, errp);
d34682cd 577 }
d34682cd
KW
578}
579
892b7de8
ET
580/**
581 * Try to get @bs's logical and physical block size.
582 * On success, store them in @bsz struct and return 0.
583 * On failure return -errno.
584 * @bs must not be empty.
585 */
586int bdrv_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
587{
588 BlockDriver *drv = bs->drv;
589
590 if (drv && drv->bdrv_probe_blocksizes) {
591 return drv->bdrv_probe_blocksizes(bs, bsz);
592 }
593
594 return -ENOTSUP;
595}
596
597/**
598 * Try to get @bs's geometry (cyls, heads, sectors).
599 * On success, store them in @geo struct and return 0.
600 * On failure return -errno.
601 * @bs must not be empty.
602 */
603int bdrv_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
604{
605 BlockDriver *drv = bs->drv;
606
607 if (drv && drv->bdrv_probe_geometry) {
608 return drv->bdrv_probe_geometry(bs, geo);
609 }
610
611 return -ENOTSUP;
612}
613
/*
 * Create a uniquely-named empty temporary file and store its name in
 * @filename (at most @size bytes).  Return 0 on success, -errno otherwise.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater. */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    const char *tmpdir = getenv("TMPDIR");
    int fd;

    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW; /* name would not fit in the buffer */
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        /* don't leave a half-created file behind */
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}
fc01f7e7 649
84a12e66
CH
650/*
651 * Detect host devices. By convention, /dev/cdrom[N] is always
652 * recognized as a host CDROM.
653 */
654static BlockDriver *find_hdev_driver(const char *filename)
655{
656 int score_max = 0, score;
657 BlockDriver *drv = NULL, *d;
658
659 QLIST_FOREACH(d, &bdrv_drivers, list) {
660 if (d->bdrv_probe_device) {
661 score = d->bdrv_probe_device(filename);
662 if (score > score_max) {
663 score_max = score;
664 drv = d;
665 }
666 }
667 }
668
669 return drv;
670}
671
98289620 672BlockDriver *bdrv_find_protocol(const char *filename,
b65a5e12
HR
673 bool allow_protocol_prefix,
674 Error **errp)
83f64091
FB
675{
676 BlockDriver *drv1;
677 char protocol[128];
1cec71e3 678 int len;
83f64091 679 const char *p;
19cb3738 680
66f82cee
KW
681 /* TODO Drivers without bdrv_file_open must be specified explicitly */
682
39508e7a
CH
683 /*
684 * XXX(hch): we really should not let host device detection
685 * override an explicit protocol specification, but moving this
686 * later breaks access to device names with colons in them.
687 * Thanks to the brain-dead persistent naming schemes on udev-
688 * based Linux systems those actually are quite common.
689 */
690 drv1 = find_hdev_driver(filename);
691 if (drv1) {
692 return drv1;
693 }
694
98289620 695 if (!path_has_protocol(filename) || !allow_protocol_prefix) {
ef810437 696 return &bdrv_file;
84a12e66 697 }
98289620 698
9e0b22f4
SH
699 p = strchr(filename, ':');
700 assert(p != NULL);
1cec71e3
AL
701 len = p - filename;
702 if (len > sizeof(protocol) - 1)
703 len = sizeof(protocol) - 1;
704 memcpy(protocol, filename, len);
705 protocol[len] = '\0';
8a22f02a 706 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
5fafdf24 707 if (drv1->protocol_name &&
8a22f02a 708 !strcmp(drv1->protocol_name, protocol)) {
83f64091 709 return drv1;
8a22f02a 710 }
83f64091 711 }
b65a5e12
HR
712
713 error_setg(errp, "Unknown protocol '%s'", protocol);
83f64091
FB
714 return NULL;
715}
716
c6684249
MA
717/*
718 * Guess image format by probing its contents.
719 * This is not a good idea when your image is raw (CVE-2008-2004), but
720 * we do it anyway for backward compatibility.
721 *
722 * @buf contains the image's first @buf_size bytes.
7cddd372
KW
723 * @buf_size is the buffer size in bytes (generally BLOCK_PROBE_BUF_SIZE,
724 * but can be smaller if the image file is smaller)
c6684249
MA
725 * @filename is its filename.
726 *
727 * For all block drivers, call the bdrv_probe() method to get its
728 * probing score.
729 * Return the first block driver with the highest probing score.
730 */
38f3ef57
KW
731BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
732 const char *filename)
c6684249
MA
733{
734 int score_max = 0, score;
735 BlockDriver *drv = NULL, *d;
736
737 QLIST_FOREACH(d, &bdrv_drivers, list) {
738 if (d->bdrv_probe) {
739 score = d->bdrv_probe(buf, buf_size, filename);
740 if (score > score_max) {
741 score_max = score;
742 drv = d;
743 }
744 }
745 }
746
747 return drv;
748}
749
f500a6d3 750static int find_image_format(BlockDriverState *bs, const char *filename,
34b5d2c6 751 BlockDriver **pdrv, Error **errp)
f3a5d3f8 752{
c6684249 753 BlockDriver *drv;
7cddd372 754 uint8_t buf[BLOCK_PROBE_BUF_SIZE];
f500a6d3 755 int ret = 0;
f8ea0b00 756
08a00559 757 /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
8e895599 758 if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
ef810437 759 *pdrv = &bdrv_raw;
c98ac35d 760 return ret;
1a396859 761 }
f8ea0b00 762
83f64091 763 ret = bdrv_pread(bs, 0, buf, sizeof(buf));
83f64091 764 if (ret < 0) {
34b5d2c6
HR
765 error_setg_errno(errp, -ret, "Could not read image for determining its "
766 "format");
c98ac35d
SW
767 *pdrv = NULL;
768 return ret;
83f64091
FB
769 }
770
c6684249 771 drv = bdrv_probe_all(buf, ret, filename);
c98ac35d 772 if (!drv) {
34b5d2c6
HR
773 error_setg(errp, "Could not determine image format: No compatible "
774 "driver found");
c98ac35d
SW
775 ret = -ENOENT;
776 }
777 *pdrv = drv;
778 return ret;
ea2384d3
FB
779}
780
51762288
SH
781/**
782 * Set the current 'total_sectors' value
65a9bb25 783 * Return 0 on success, -errno on error.
51762288
SH
784 */
785static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
786{
787 BlockDriver *drv = bs->drv;
788
396759ad
NB
789 /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
790 if (bs->sg)
791 return 0;
792
51762288
SH
793 /* query actual device if possible, otherwise just trust the hint */
794 if (drv->bdrv_getlength) {
795 int64_t length = drv->bdrv_getlength(bs);
796 if (length < 0) {
797 return length;
798 }
7e382003 799 hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
51762288
SH
800 }
801
802 bs->total_sectors = hint;
803 return 0;
804}
805
9e8f1835
PB
806/**
807 * Set open flags for a given discard mode
808 *
809 * Return 0 on success, -1 if the discard mode was invalid.
810 */
811int bdrv_parse_discard_flags(const char *mode, int *flags)
812{
813 *flags &= ~BDRV_O_UNMAP;
814
815 if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
816 /* do nothing */
817 } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
818 *flags |= BDRV_O_UNMAP;
819 } else {
820 return -1;
821 }
822
823 return 0;
824}
825
c3993cdc
SH
826/**
827 * Set open flags for a given cache mode
828 *
829 * Return 0 on success, -1 if the cache mode was invalid.
830 */
831int bdrv_parse_cache_flags(const char *mode, int *flags)
832{
833 *flags &= ~BDRV_O_CACHE_MASK;
834
835 if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
836 *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
92196b2f
SH
837 } else if (!strcmp(mode, "directsync")) {
838 *flags |= BDRV_O_NOCACHE;
c3993cdc
SH
839 } else if (!strcmp(mode, "writeback")) {
840 *flags |= BDRV_O_CACHE_WB;
841 } else if (!strcmp(mode, "unsafe")) {
842 *flags |= BDRV_O_CACHE_WB;
843 *flags |= BDRV_O_NO_FLUSH;
844 } else if (!strcmp(mode, "writethrough")) {
845 /* this is the default */
846 } else {
847 return -1;
848 }
849
850 return 0;
851}
852
53fec9d3
SH
853/**
854 * The copy-on-read flag is actually a reference count so multiple users may
855 * use the feature without worrying about clobbering its previous state.
856 * Copy-on-read stays enabled until all users have called to disable it.
857 */
858void bdrv_enable_copy_on_read(BlockDriverState *bs)
859{
860 bs->copy_on_read++;
861}
862
863void bdrv_disable_copy_on_read(BlockDriverState *bs)
864{
865 assert(bs->copy_on_read > 0);
866 bs->copy_on_read--;
867}
868
b1e6fc08
KW
869/*
870 * Returns the flags that a temporary snapshot should get, based on the
871 * originally requested flags (the originally requested image will have flags
872 * like a backing file)
873 */
874static int bdrv_temp_snapshot_flags(int flags)
875{
876 return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
877}
878
0b50cc88
KW
879/*
880 * Returns the flags that bs->file should get, based on the given flags for
881 * the parent BDS
882 */
883static int bdrv_inherited_flags(int flags)
884{
885 /* Enable protocol handling, disable format probing for bs->file */
886 flags |= BDRV_O_PROTOCOL;
887
888 /* Our block drivers take care to send flushes and respect unmap policy,
889 * so we can enable both unconditionally on lower layers. */
890 flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;
891
0b50cc88 892 /* Clear flags that only apply to the top layer */
5669b44d 893 flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);
0b50cc88
KW
894
895 return flags;
896}
897
317fc44e
KW
898/*
899 * Returns the flags that bs->backing_hd should get, based on the given flags
900 * for the parent BDS
901 */
902static int bdrv_backing_flags(int flags)
903{
904 /* backing files always opened read-only */
905 flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);
906
907 /* snapshot=on is handled on the top layer */
8bfea15d 908 flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);
317fc44e
KW
909
910 return flags;
911}
912
7b272452
KW
913static int bdrv_open_flags(BlockDriverState *bs, int flags)
914{
915 int open_flags = flags | BDRV_O_CACHE_WB;
916
917 /*
918 * Clear flags that are internal to the block layer before opening the
919 * image.
920 */
20cca275 921 open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);
7b272452
KW
922
923 /*
924 * Snapshots should be writable.
925 */
8bfea15d 926 if (flags & BDRV_O_TEMPORARY) {
7b272452
KW
927 open_flags |= BDRV_O_RDWR;
928 }
929
930 return open_flags;
931}
932
636ea370
KW
933static void bdrv_assign_node_name(BlockDriverState *bs,
934 const char *node_name,
935 Error **errp)
6913c0c2
BC
936{
937 if (!node_name) {
636ea370 938 return;
6913c0c2
BC
939 }
940
9aebf3b8 941 /* Check for empty string or invalid characters */
f5bebbbb 942 if (!id_wellformed(node_name)) {
9aebf3b8 943 error_setg(errp, "Invalid node name");
636ea370 944 return;
6913c0c2
BC
945 }
946
0c5e94ee 947 /* takes care of avoiding namespaces collisions */
7f06d47e 948 if (blk_by_name(node_name)) {
0c5e94ee
BC
949 error_setg(errp, "node-name=%s is conflicting with a device id",
950 node_name);
636ea370 951 return;
0c5e94ee
BC
952 }
953
6913c0c2
BC
954 /* takes care of avoiding duplicates node names */
955 if (bdrv_find_node(node_name)) {
956 error_setg(errp, "Duplicate node name");
636ea370 957 return;
6913c0c2
BC
958 }
959
960 /* copy node name into the bs and insert it into the graph list */
961 pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
962 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
6913c0c2
BC
963}
964
57915332
KW
965/*
966 * Common part for opening disk images and files
b6ad491a
KW
967 *
968 * Removes all processed options from *options.
57915332 969 */
/*
 * Common part for opening disk images and files.
 *
 * @bs      the BlockDriverState to open into (must not have a bs->file yet)
 * @file    already-opened protocol layer, or NULL
 * @options per-driver options; consumed keys are deleted from it
 * @flags   BDRV_O_* open flags
 * @drv     the driver to use (must not be NULL)
 *
 * Returns 0 on success, negative errno on failure; errp is set on failure.
 * On failure, bs->opaque/bs->drv/bs->file are reset so bs is reusable.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    /* Prefer the filename of the already-opened protocol layer, if any */
    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        return -EINVAL;
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    /* Assign the node name, then drop the option so it doesn't count as
     * "unknown" later */
    node_name = qdict_get_try_str(options, "node-name");
    bdrv_assign_node_name(bs, node_name, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }
    qdict_del(options, "node-name");

    /* bdrv_open() with directly using a protocol as drv. This layer is already
     * opened, so assign it to bs (while file becomes a closed BlockDriverState)
     * and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        /* Distinguish "read-only only" drivers from fully unlisted ones */
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                   ? "Driver '%s' can only be used for read-only devices"
                   : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }
    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        /* Prefer the driver's own error message when it provided one */
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    if (bs->encrypted) {
        error_report("Encrypted images are deprecated");
        error_printf("Support for them will be removed in a future release.\n"
                     "You can use 'qemu-img convert' to convert your image"
                     " to an unencrypted one.\n");
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto free_and_fail;
    }

    assert(bdrv_opt_mem_align(bs) != 0);
    assert((bs->request_alignment != 0) || bs->sg);
    return 0;

free_and_fail:
    /* Undo the partial setup; bs->file is not unref'd here — the caller
     * still owns the reference it passed in as 'file' */
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}
1108
5e5c4f63
KW
1109static QDict *parse_json_filename(const char *filename, Error **errp)
1110{
1111 QObject *options_obj;
1112 QDict *options;
1113 int ret;
1114
1115 ret = strstart(filename, "json:", &filename);
1116 assert(ret);
1117
1118 options_obj = qobject_from_json(filename);
1119 if (!options_obj) {
1120 error_setg(errp, "Could not parse the JSON options");
1121 return NULL;
1122 }
1123
1124 if (qobject_type(options_obj) != QTYPE_QDICT) {
1125 qobject_decref(options_obj);
1126 error_setg(errp, "Invalid JSON object given");
1127 return NULL;
1128 }
1129
1130 options = qobject_to_qdict(options_obj);
1131 qdict_flatten(options);
1132
1133 return options;
1134}
1135
/*
 * Fills in default options for opening images and converts the legacy
 * filename/flags pair to option QDict entries.
 *
 * @options   in/out options QDict; "filename" and "driver" entries may be
 *            added, and driver-specific filename parsing may add more
 * @pfilename in/out filename; cleared to NULL for "json:" pseudo-filenames
 * @flags     BDRV_O_* flags; only BDRV_O_PROTOCOL is inspected here
 * @drv       explicitly requested driver, or NULL
 *
 * Returns 0 on success, negative errno on failure (errp set).
 */
static int bdrv_fill_options(QDict **options, const char **pfilename, int flags,
                             BlockDriver *drv, Error **errp)
{
    const char *filename = *pfilename;
    const char *drvname;
    bool protocol = flags & BDRV_O_PROTOCOL;
    bool parse_filename = false;
    Error *local_err = NULL;

    /* Parse json: pseudo-protocol */
    if (filename && g_str_has_prefix(filename, "json:")) {
        QDict *json_options = parse_json_filename(filename, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        /* Options given in the filename have lower priority than options
         * specified directly */
        qdict_join(*options, json_options, false);
        QDECREF(json_options);
        *pfilename = filename = NULL;
    }

    /* Fetch the file name from the options QDict if necessary */
    if (protocol && filename) {
        if (!qdict_haskey(*options, "filename")) {
            qdict_put(*options, "filename", qstring_from_str(filename));
            /* Remember that the filename came from the legacy argument, so
             * the driver may rewrite it into options below */
            parse_filename = true;
        } else {
            error_setg(errp, "Can't specify 'file' and 'filename' options at "
                       "the same time");
            return -EINVAL;
        }
    }

    /* Find the right block driver */
    filename = qdict_get_try_str(*options, "filename");
    drvname = qdict_get_try_str(*options, "driver");

    if (drv) {
        /* An explicit drv argument and a "driver" option are exclusive */
        if (drvname) {
            error_setg(errp, "Driver specified twice");
            return -EINVAL;
        }
        drvname = drv->format_name;
        qdict_put(*options, "driver", qstring_from_str(drvname));
    } else {
        if (!drvname && protocol) {
            if (filename) {
                /* Guess the protocol driver from the filename prefix */
                drv = bdrv_find_protocol(filename, parse_filename, errp);
                if (!drv) {
                    return -EINVAL;
                }

                drvname = drv->format_name;
                qdict_put(*options, "driver", qstring_from_str(drvname));
            } else {
                error_setg(errp, "Must specify either driver or file");
                return -EINVAL;
            }
        } else if (drvname) {
            drv = bdrv_find_format(drvname);
            if (!drv) {
                error_setg(errp, "Unknown driver '%s'", drvname);
                return -ENOENT;
            }
        }
    }

    /* At the protocol level a driver must have been resolved by now */
    assert(drv || !protocol);

    /* Driver-specific filename parsing */
    if (drv && drv->bdrv_parse_filename && parse_filename) {
        drv->bdrv_parse_filename(filename, *options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        /* The parsed form supersedes the raw filename unless the driver
         * still needs it */
        if (!drv->bdrv_needs_filename) {
            qdict_del(*options, "filename");
        }
    }

    return 0;
}
1227
8d24cce1
FZ
/*
 * Change the backing file of @bs to @backing_hd (which may be NULL to
 * detach the current backing file).
 *
 * Manages bs->backing_blocker: created when a backing file is first
 * attached, moved between backing nodes, and freed on detach. Also keeps
 * bs->backing_file/backing_format in sync and refreshes the limits.
 * Note: does not take or drop a reference on @backing_hd; the caller
 * manages refcounts.
 */
void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
{

    if (bs->backing_hd) {
        /* Release the blocker from the old backing node before rewiring */
        assert(bs->backing_blocker);
        bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
    } else if (backing_hd) {
        /* First backing file: create the blocker error object */
        error_setg(&bs->backing_blocker,
                   "node is used as backing hd of '%s'",
                   bdrv_get_device_or_node_name(bs));
    }

    bs->backing_hd = backing_hd;
    if (!backing_hd) {
        error_free(bs->backing_blocker);
        bs->backing_blocker = NULL;
        goto out;
    }
    bs->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
    pstrcpy(bs->backing_format, sizeof(bs->backing_format),
            backing_hd->drv ? backing_hd->drv->format_name : "");

    bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
    /* Otherwise we won't be able to commit due to check in bdrv_commit */
    bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT_TARGET,
                    bs->backing_blocker);
out:
    bdrv_refresh_limits(bs, NULL);
}
1258
31ca6d07
KW
1259/*
1260 * Opens the backing file for a BlockDriverState if not yet open
1261 *
1262 * options is a QDict of options to pass to the block drivers, or NULL for an
1263 * empty set of options. The reference to the QDict is transferred to this
1264 * function (even on failure), so if the caller intends to reuse the dictionary,
1265 * it needs to use QINCREF() before calling bdrv_file_open.
1266 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char *backing_filename = g_malloc0(PATH_MAX);
    int ret = 0;
    BlockDriverState *backing_hd;
    Error *local_err = NULL;

    /* Already open: nothing to do (but the options reference was
     * transferred to us, so drop it) */
    if (bs->backing_hd != NULL) {
        QDECREF(options);
        goto free_exit;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        /* The options fully describe the backing file; no filename needed */
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        /* No backing file configured at all: success with nothing opened */
        QDECREF(options);
        goto free_exit;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX,
                                       &local_err);
        if (local_err) {
            ret = -EINVAL;
            error_propagate(errp, local_err);
            QDECREF(options);
            goto free_exit;
        }
    }

    if (!bs->drv || !bs->drv->supports_backing) {
        ret = -EINVAL;
        error_setg(errp, "Driver doesn't support backing files");
        QDECREF(options);
        goto free_exit;
    }

    backing_hd = bdrv_new();

    /* Default the backing driver to the recorded backing format, unless
     * the caller chose one explicitly */
    if (bs->backing_format[0] != '\0' && !qdict_haskey(options, "driver")) {
        qdict_put(options, "driver", qstring_from_str(bs->backing_format));
    }

    assert(bs->backing_hd == NULL);
    /* bdrv_open() consumes the options reference, even on failure */
    ret = bdrv_open(&backing_hd,
                    *backing_filename ? backing_filename : NULL, NULL, options,
                    bdrv_backing_flags(bs->open_flags), NULL, &local_err);
    if (ret < 0) {
        bdrv_unref(backing_hd);
        backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        goto free_exit;
    }
    bdrv_set_backing_hd(bs, backing_hd);

free_exit:
    g_free(backing_filename);
    return ret;
}
1333
da557aac
HR
1334/*
1335 * Opens a disk image whose options are given as BlockdevRef in another block
1336 * device's options.
1337 *
da557aac
HR
1338 * If allow_none is true, no image will be opened if filename is false and no
1339 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
1340 *
1341 * bdrev_key specifies the key for the image's BlockdevRef in the options QDict.
1342 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
1343 * itself, all options starting with "${bdref_key}." are considered part of the
1344 * BlockdevRef.
1345 *
1346 * The BlockdevRef will be removed from the options QDict.
f67503e5
HR
1347 *
1348 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
da557aac
HR
1349 */
1350int bdrv_open_image(BlockDriverState **pbs, const char *filename,
1351 QDict *options, const char *bdref_key, int flags,
f7d9fd8c 1352 bool allow_none, Error **errp)
da557aac
HR
1353{
1354 QDict *image_options;
1355 int ret;
1356 char *bdref_key_dot;
1357 const char *reference;
1358
f67503e5
HR
1359 assert(pbs);
1360 assert(*pbs == NULL);
1361
da557aac
HR
1362 bdref_key_dot = g_strdup_printf("%s.", bdref_key);
1363 qdict_extract_subqdict(options, &image_options, bdref_key_dot);
1364 g_free(bdref_key_dot);
1365
1366 reference = qdict_get_try_str(options, bdref_key);
1367 if (!filename && !reference && !qdict_size(image_options)) {
1368 if (allow_none) {
1369 ret = 0;
1370 } else {
1371 error_setg(errp, "A block device must be specified for \"%s\"",
1372 bdref_key);
1373 ret = -EINVAL;
1374 }
b20e61e0 1375 QDECREF(image_options);
da557aac
HR
1376 goto done;
1377 }
1378
f7d9fd8c 1379 ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);
da557aac
HR
1380
1381done:
1382 qdict_del(options, bdref_key);
1383 return ret;
1384}
1385
6b8aeca5 1386int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
b998875d
KW
1387{
1388 /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
1ba4b6a5 1389 char *tmp_filename = g_malloc0(PATH_MAX + 1);
b998875d 1390 int64_t total_size;
83d0521a 1391 QemuOpts *opts = NULL;
b998875d
KW
1392 QDict *snapshot_options;
1393 BlockDriverState *bs_snapshot;
1394 Error *local_err;
1395 int ret;
1396
1397 /* if snapshot, we create a temporary backing file and open it
1398 instead of opening 'filename' directly */
1399
1400 /* Get the required size from the image */
f187743a
KW
1401 total_size = bdrv_getlength(bs);
1402 if (total_size < 0) {
6b8aeca5 1403 ret = total_size;
f187743a 1404 error_setg_errno(errp, -total_size, "Could not get image size");
1ba4b6a5 1405 goto out;
f187743a 1406 }
b998875d
KW
1407
1408 /* Create the temporary image */
1ba4b6a5 1409 ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
b998875d
KW
1410 if (ret < 0) {
1411 error_setg_errno(errp, -ret, "Could not get temporary filename");
1ba4b6a5 1412 goto out;
b998875d
KW
1413 }
1414
ef810437 1415 opts = qemu_opts_create(bdrv_qcow2.create_opts, NULL, 0,
c282e1fd 1416 &error_abort);
39101f25 1417 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size, &error_abort);
ef810437 1418 ret = bdrv_create(&bdrv_qcow2, tmp_filename, opts, &local_err);
83d0521a 1419 qemu_opts_del(opts);
b998875d
KW
1420 if (ret < 0) {
1421 error_setg_errno(errp, -ret, "Could not create temporary overlay "
1422 "'%s': %s", tmp_filename,
1423 error_get_pretty(local_err));
1424 error_free(local_err);
1ba4b6a5 1425 goto out;
b998875d
KW
1426 }
1427
1428 /* Prepare a new options QDict for the temporary file */
1429 snapshot_options = qdict_new();
1430 qdict_put(snapshot_options, "file.driver",
1431 qstring_from_str("file"));
1432 qdict_put(snapshot_options, "file.filename",
1433 qstring_from_str(tmp_filename));
1434
e4e9986b 1435 bs_snapshot = bdrv_new();
b998875d
KW
1436
1437 ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
ef810437 1438 flags, &bdrv_qcow2, &local_err);
b998875d
KW
1439 if (ret < 0) {
1440 error_propagate(errp, local_err);
1ba4b6a5 1441 goto out;
b998875d
KW
1442 }
1443
1444 bdrv_append(bs_snapshot, bs);
1ba4b6a5
BC
1445
1446out:
1447 g_free(tmp_filename);
6b8aeca5 1448 return ret;
b998875d
KW
1449}
1450
b6ce07aa
KW
1451/*
1452 * Opens a disk image (raw, qcow2, vmdk, ...)
de9c0cec
KW
1453 *
1454 * options is a QDict of options to pass to the block drivers, or NULL for an
1455 * empty set of options. The reference to the QDict belongs to the block layer
1456 * after the call (even on failure), so if the caller intends to reuse the
1457 * dictionary, it needs to use QINCREF() before calling bdrv_open.
f67503e5
HR
1458 *
1459 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
1460 * If it is not NULL, the referenced BDS will be reused.
ddf5636d
HR
1461 *
1462 * The reference parameter may be used to specify an existing block device which
1463 * should be opened. If specified, neither options nor a filename may be given,
1464 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
b6ce07aa 1465 */
ddf5636d
HR
int bdrv_open(BlockDriverState **pbs, const char *filename,
              const char *reference, QDict *options, int flags,
              BlockDriver *drv, Error **errp)
{
    int ret;
    BlockDriverState *file = NULL, *bs;
    const char *drvname;
    Error *local_err = NULL;
    int snapshot_flags = 0;

    assert(pbs);

    /* Referencing an existing node: just look it up and take a reference.
     * No options or filename may be combined with a reference. */
    if (reference) {
        bool options_non_empty = options ? qdict_size(options) : false;
        QDECREF(options);

        if (*pbs) {
            error_setg(errp, "Cannot reuse an existing BDS when referencing "
                       "another block device");
            return -EINVAL;
        }

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    /* Reuse the caller's BDS or create a fresh one */
    if (*pbs) {
        bs = *pbs;
    } else {
        bs = bdrv_new();
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    ret = bdrv_fill_options(&options, &filename, flags, drv, &local_err);
    if (local_err) {
        goto fail;
    }

    /* Find the right image format driver */
    drv = NULL;
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Unknown driver: '%s'", drvname);
            ret = -EINVAL;
            goto fail;
        }
    }

    assert(drvname || !(flags & BDRV_O_PROTOCOL));
    if (drv && !drv->bdrv_file_open) {
        /* If the user explicitly wants a format driver here, we'll need to add
         * another layer for the protocol in bs->file */
        flags &= ~BDRV_O_PROTOCOL;
    }

    /* bs->options keeps the original set; work on a shallow copy so
     * consumed keys can be deleted without losing the record */
    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Open image file without format layer */
    if ((flags & BDRV_O_PROTOCOL) == 0) {
        if (flags & BDRV_O_RDWR) {
            flags |= BDRV_O_ALLOW_RDWR;
        }
        if (flags & BDRV_O_SNAPSHOT) {
            /* Remember overlay flags for later; open the image itself with
             * backing-file semantics */
            snapshot_flags = bdrv_temp_snapshot_flags(flags);
            flags = bdrv_backing_flags(flags);
        }

        assert(file == NULL);
        ret = bdrv_open_image(&file, filename, options, "file",
                              bdrv_inherited_flags(flags),
                              true, &local_err);
        if (ret < 0) {
            goto fail;
        }
    }

    /* Image format probing */
    bs->probed = !drv;
    if (!drv && file) {
        ret = find_image_format(file, filename, &drv, &local_err);
        if (ret < 0) {
            goto fail;
        }
    } else if (!drv) {
        error_setg(errp, "Must specify either driver or file");
        ret = -EINVAL;
        goto fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto fail;
    }

    /* bdrv_open_common() may have taken over 'file' as bs->file (or
     * swapped it in); only drop our reference if it didn't */
    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    bdrv_refresh_filename(bs);

    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
     * temporary snapshot afterwards. */
    if (snapshot_flags) {
        ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
        if (local_err) {
            goto close_and_fail;
        }
    }

    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        if (flags & BDRV_O_PROTOCOL) {
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
            error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                       "support the option '%s'", drv->format_name,
                       bdrv_get_device_name(bs), entry->key);
        }

        ret = -EINVAL;
        goto close_and_fail;
    }

    if (!bdrv_key_required(bs)) {
        if (bs->blk) {
            blk_dev_change_media_cb(bs->blk, true);
        }
    } else if (!runstate_check(RUN_STATE_PRELAUNCH)
               && !runstate_check(RUN_STATE_INMIGRATE)
               && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
        error_setg(errp,
                   "Guest must be stopped for opening of encrypted image");
        ret = -EBUSY;
        goto close_and_fail;
    }

    QDECREF(options);
    *pbs = bs;
    return 0;

fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (!*pbs) {
        /* If *pbs is NULL, a new BDS has been created in this function and
           needs to be freed now. Otherwise, it does not need to be closed,
           since it has not really been opened yet. */
        bdrv_unref(bs);
    }
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    /* See fail path, but now the BDS has to be always closed */
    if (*pbs) {
        bdrv_close(bs);
    } else {
        bdrv_unref(bs);
    }
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}
1671
e971aa12
JC
/* One element of a BlockReopenQueue: the staged reopen state for a single
 * BlockDriverState. */
typedef struct BlockReopenQueueEntry {
    bool prepared;          /* set once bdrv_reopen_prepare() succeeded, so
                             * only prepared entries get aborted on failure */
    BDRVReopenState state;  /* staged flags and driver-private reopen data */
    QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;  /* queue linkage */
} BlockReopenQueueEntry;
1677
1678/*
1679 * Adds a BlockDriverState to a simple queue for an atomic, transactional
1680 * reopen of multiple devices.
1681 *
1682 * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLE_INIT
1683 * already performed, or alternatively may be NULL a new BlockReopenQueue will
1684 * be created and initialized. This newly created BlockReopenQueue should be
1685 * passed back in for subsequent calls that are intended to be of the same
1686 * atomic 'set'.
1687 *
1688 * bs is the BlockDriverState to add to the reopen queue.
1689 *
1690 * flags contains the open flags for the associated bs
1691 *
1692 * returns a pointer to bs_queue, which is either the newly allocated
1693 * bs_queue, or the existing bs_queue being used.
1694 *
1695 */
1696BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
1697 BlockDriverState *bs, int flags)
1698{
1699 assert(bs != NULL);
1700
1701 BlockReopenQueueEntry *bs_entry;
1702 if (bs_queue == NULL) {
1703 bs_queue = g_new0(BlockReopenQueue, 1);
1704 QSIMPLEQ_INIT(bs_queue);
1705 }
1706
f1f25a2e
KW
1707 /* bdrv_open() masks this flag out */
1708 flags &= ~BDRV_O_PROTOCOL;
1709
e971aa12 1710 if (bs->file) {
f1f25a2e 1711 bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
e971aa12
JC
1712 }
1713
1714 bs_entry = g_new0(BlockReopenQueueEntry, 1);
1715 QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
1716
1717 bs_entry->state.bs = bs;
1718 bs_entry->state.flags = flags;
1719
1720 return bs_queue;
1721}
1722
1723/*
1724 * Reopen multiple BlockDriverStates atomically & transactionally.
1725 *
1726 * The queue passed in (bs_queue) must have been built up previous
1727 * via bdrv_reopen_queue().
1728 *
1729 * Reopens all BDS specified in the queue, with the appropriate
1730 * flags. All devices are prepared for reopen, and failure of any
1731 * device will cause all device changes to be abandonded, and intermediate
1732 * data cleaned up.
1733 *
1734 * If all devices prepare successfully, then the changes are committed
1735 * to all devices.
1736 *
1737 */
1738int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
1739{
1740 int ret = -1;
1741 BlockReopenQueueEntry *bs_entry, *next;
1742 Error *local_err = NULL;
1743
1744 assert(bs_queue != NULL);
1745
1746 bdrv_drain_all();
1747
1748 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1749 if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
1750 error_propagate(errp, local_err);
1751 goto cleanup;
1752 }
1753 bs_entry->prepared = true;
1754 }
1755
1756 /* If we reach this point, we have success and just need to apply the
1757 * changes
1758 */
1759 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1760 bdrv_reopen_commit(&bs_entry->state);
1761 }
1762
1763 ret = 0;
1764
1765cleanup:
1766 QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
1767 if (ret && bs_entry->prepared) {
1768 bdrv_reopen_abort(&bs_entry->state);
1769 }
1770 g_free(bs_entry);
1771 }
1772 g_free(bs_queue);
1773 return ret;
1774}
1775
1776
1777/* Reopen a single BlockDriverState with the specified flags. */
1778int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1779{
1780 int ret = -1;
1781 Error *local_err = NULL;
1782 BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1783
1784 ret = bdrv_reopen_multiple(queue, &local_err);
1785 if (local_err != NULL) {
1786 error_propagate(errp, local_err);
1787 }
1788 return ret;
1789}
1790
1791
1792/*
1793 * Prepares a BlockDriverState for reopen. All changes are staged in the
1794 * 'opaque' field of the BDRVReopenState, which is used and allocated by
1795 * the block driver layer .bdrv_reopen_prepare()
1796 *
1797 * bs is the BlockDriverState to reopen
1798 * flags are the new open flags
1799 * queue is the reopen queue
1800 *
1801 * Returns 0 on success, non-zero on error. On error errp will be set
1802 * as well.
1803 *
1804 * On failure, bdrv_reopen_abort() will be called to clean up any data.
1805 * It is the responsibility of the caller to then call the abort() or
1806 * commit() for any other BDS that have been left in a prepare() state
1807 *
1808 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_setg(errp, "Node '%s' is read only",
                   bdrv_get_device_or_node_name(reopen_state->bs));
        goto error;
    }


    /* Flush pending writes before any state is staged */
    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            /* Prefer the driver's error message; fall back to a generic one */
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_setg(errp, "Block format '%s' used by node '%s' "
                   "does not support reopening files", drv->format_name,
                   bdrv_get_device_or_node_name(reopen_state->bs));
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}
1863
1864/*
1865 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1866 * makes them final by swapping the staging BlockDriverState contents into
1867 * the active BlockDriverState contents.
1868 */
1869void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1870{
1871 BlockDriver *drv;
1872
1873 assert(reopen_state != NULL);
1874 drv = reopen_state->bs->drv;
1875 assert(drv != NULL);
1876
1877 /* If there are any driver level actions to take */
1878 if (drv->bdrv_reopen_commit) {
1879 drv->bdrv_reopen_commit(reopen_state);
1880 }
1881
1882 /* set BDS specific flags now */
1883 reopen_state->bs->open_flags = reopen_state->flags;
1884 reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1885 BDRV_O_CACHE_WB);
1886 reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
355ef4ac 1887
3baca891 1888 bdrv_refresh_limits(reopen_state->bs, NULL);
e971aa12
JC
1889}
1890
1891/*
1892 * Abort the reopen, and delete and free the staged changes in
1893 * reopen_state
1894 */
1895void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1896{
1897 BlockDriver *drv;
1898
1899 assert(reopen_state != NULL);
1900 drv = reopen_state->bs->drv;
1901 assert(drv != NULL);
1902
1903 if (drv->bdrv_reopen_abort) {
1904 drv->bdrv_reopen_abort(reopen_state);
1905 }
1906}
1907
1908
fc01f7e7
FB
/*
 * Close a BlockDriverState: cancel its job, quiesce I/O, detach the
 * backing file, close the driver, and reset the BDS fields so the
 * structure can be reused by a later bdrv_open().
 */
void bdrv_close(BlockDriverState *bs)
{
    BdrvAioNotifier *ban, *ban_next;

    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all(); /* complete I/O */
    bdrv_flush(bs);
    bdrv_drain_all(); /* in case flush left pending I/O */
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        if (bs->backing_hd) {
            /* Detach before unref so the backing blocker is released first */
            BlockDriverState *backing_hd = bs->backing_hd;
            bdrv_set_backing_hd(bs, NULL);
            bdrv_unref(backing_hd);
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->zero_beyond_eof = false;
        QDECREF(bs->options);
        bs->options = NULL;
        QDECREF(bs->full_open_options);
        bs->full_open_options = NULL;

        /* Drop the protocol layer last, after the format driver closed */
        if (bs->file != NULL) {
            bdrv_unref(bs->file);
            bs->file = NULL;
        }
    }

    if (bs->blk) {
        blk_dev_change_media_cb(bs->blk, false);
    }

    /*throttling disk I/O limits*/
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }

    QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
        g_free(ban);
    }
    QLIST_INIT(&bs->aio_notifiers);
}
1964
2bc93fed
MK
1965void bdrv_close_all(void)
1966{
1967 BlockDriverState *bs;
1968
dc364f4c 1969 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
ed78cda3
SH
1970 AioContext *aio_context = bdrv_get_aio_context(bs);
1971
1972 aio_context_acquire(aio_context);
2bc93fed 1973 bdrv_close(bs);
ed78cda3 1974 aio_context_release(aio_context);
2bc93fed
MK
1975 }
1976}
1977
88266f5a
SH
1978/* Check if any requests are in-flight (including throttled requests) */
1979static bool bdrv_requests_pending(BlockDriverState *bs)
1980{
1981 if (!QLIST_EMPTY(&bs->tracked_requests)) {
1982 return true;
1983 }
cc0681c4
BC
1984 if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1985 return true;
1986 }
1987 if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
88266f5a
SH
1988 return true;
1989 }
1990 if (bs->file && bdrv_requests_pending(bs->file)) {
1991 return true;
1992 }
1993 if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1994 return true;
1995 }
1996 return false;
1997}
1998
/* Run one iteration of the drain loop for @bs.
 *
 * Flushes the I/O queue and restarts throttled requests so they can make
 * progress, then polls the node's AioContext once.  Returns true if work
 * was still pending, i.e. the caller must iterate again.
 */
static bool bdrv_drain_one(BlockDriverState *bs)
{
    bool bs_busy;

    bdrv_flush_io_queue(bs);
    bdrv_start_throttled_reqs(bs);
    bs_busy = bdrv_requests_pending(bs);
    /* Poll blocks only while requests are known to be outstanding; the
     * aio_poll() return value also counts as "busy" so completion BHs run. */
    bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy);
    return bs_busy;
}
2009
/*
 * Wait for pending requests to complete on a single BlockDriverState subtree
 *
 * See the warning in bdrv_drain_all(). This function can only be called if
 * you are sure nothing can generate I/O because you have op blockers
 * installed.
 *
 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
 * AioContext.
 */
void bdrv_drain(BlockDriverState *bs)
{
    /* Loop until one full iteration finds no pending work. */
    while (bdrv_drain_one(bs)) {
        /* Keep iterating */
    }
}
2026
/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices---for example a coroutine
 * can be arbitrarily complex and a constant flow of I/O can come until the
 * coroutine is complete.  Because of this, it is not possible to have a
 * function to drain a single device's I/O queue.
 */
void bdrv_drain_all(void)
{
    /* Always run first iteration so any pending completion BHs run */
    bool busy = true;
    BlockDriverState *bs;

    /* Phase 1: pause all block jobs so they stop issuing new I/O while we
     * drain.  Each node's AioContext is taken around the job operation. */
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        if (bs->job) {
            block_job_pause(bs->job);
        }
        aio_context_release(aio_context);
    }

    /* Phase 2: keep sweeping every node until a full pass over all nodes
     * finds no pending work anywhere (draining one node can generate I/O
     * on another, hence the global fixed point). */
    while (busy) {
        busy = false;

        QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
            AioContext *aio_context = bdrv_get_aio_context(bs);

            aio_context_acquire(aio_context);
            busy |= bdrv_drain_one(bs);
            aio_context_release(aio_context);
        }
    }

    /* Phase 3: resume the jobs paused in phase 1. */
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        if (bs->job) {
            block_job_resume(bs->job);
        }
        aio_context_release(aio_context);
    }
}
2077
/* make a BlockDriverState anonymous by removing from bdrv_state and
 * graph_bdrv_state list.
 * Also, NULL terminate the device_name to prevent double remove */
void bdrv_make_anon(BlockDriverState *bs)
{
    /*
     * Take care to remove bs from bdrv_states only when it's actually
     * in it.  Note that bs->device_list.tqe_prev is initially null,
     * and gets set to non-null by QTAILQ_INSERT_TAIL().  Establish
     * the useful invariant "bs in bdrv_states iff bs->tqe_prev" by
     * resetting it to null on remove.
     */
    if (bs->device_list.tqe_prev) {
        QTAILQ_REMOVE(&bdrv_states, bs, device_list);
        bs->device_list.tqe_prev = NULL;
    }
    /* A non-empty node name means the node is on graph_bdrv_states;
     * clearing it afterwards makes a second call a no-op. */
    if (bs->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
    }
    bs->node_name[0] = '\0';
}
2099
e023b2e2
PB
2100static void bdrv_rebind(BlockDriverState *bs)
2101{
2102 if (bs->drv && bs->drv->bdrv_rebind) {
2103 bs->drv->bdrv_rebind(bs);
2104 }
2105}
2106
/* Copy the fields that must stay attached to the guest device (rather than
 * following the image contents) from @bs_src to @bs_dest.  Used three times
 * by bdrv_swap() to undo the effect of the whole-struct swap on exactly
 * these fields.  NOTE(review): this list must be kept in sync with the
 * BlockDriverState layout — a field missing here silently migrates to the
 * other node on swap. */
static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* move some fields that need to stay attached to the device */

    /* dev info */
    bs_dest->guest_block_size   = bs_src->guest_block_size;
    bs_dest->copy_on_read       = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* i/o throttled req */
    memcpy(&bs_dest->throttle_state,
           &bs_src->throttle_state,
           sizeof(ThrottleState));
    bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
    bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
    bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;

    /* r/w error */
    bs_dest->on_read_error      = bs_src->on_read_error;
    bs_dest->on_write_error     = bs_src->on_write_error;

    /* i/o status */
    bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
    bs_dest->iostatus           = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;

    /* reference count */
    bs_dest->refcnt             = bs_src->refcnt;

    /* job */
    bs_dest->job                = bs_src->job;

    /* keep the same entry in bdrv_states */
    bs_dest->device_list = bs_src->device_list;
    bs_dest->blk = bs_src->blk;

    memcpy(bs_dest->op_blockers, bs_src->op_blockers,
           sizeof(bs_dest->op_blockers));
}
8802d1fd 2150
/*
 * Swap bs contents for two image chains while they are live,
 * while keeping required fields on the BlockDriverState that is
 * actually attached to a device.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
 *
 * bs_new must not be attached to a BlockBackend.
 *
 * This function does not create any image files.
 */
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
{
    BlockDriverState tmp;

    /* The code needs to swap the node_name but simply swapping node_list won't
     * work so first remove the nodes from the graph list, do the swap then
     * insert them back if needed.
     */
    if (bs_new->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
    }
    if (bs_old->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
    }

    /* bs_new must be unattached and shouldn't have anything fancy enabled */
    assert(!bs_new->blk);
    assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
    assert(bs_new->job == NULL);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    /* Whole-struct swap; device-attached fields are repaired below. */
    tmp = *bs_new;
    *bs_new = *bs_old;
    *bs_old = tmp;

    /* there are some fields that should not be swapped, move them back */
    bdrv_move_feature_fields(&tmp, bs_old);
    bdrv_move_feature_fields(bs_old, bs_new);
    bdrv_move_feature_fields(bs_new, &tmp);

    /* bs_new must remain unattached */
    assert(!bs_new->blk);

    /* Check a few fields that should remain attached to the device */
    assert(bs_new->job == NULL);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    /* insert the nodes back into the graph node list if needed */
    if (bs_new->node_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
    }
    if (bs_old->node_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
    }

    /* Let format drivers fix up any internal pointers into the BDS. */
    bdrv_rebind(bs_new);
    bdrv_rebind(bs_old);
}
2213
/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new must not be attached to a BlockBackend.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    bdrv_swap(bs_new, bs_top);

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    bdrv_set_backing_hd(bs_top, bs_new);
}
2233
/* Final teardown of a BlockDriverState, called when its refcount drops to
 * zero (see bdrv_unref).  The node must already be idle: no job, no op
 * blockers, no dirty bitmaps. */
static void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->job);
    assert(bdrv_op_blocker_is_empty(bs));
    assert(!bs->refcnt);
    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    bdrv_close(bs);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    g_free(bs);
}
2248
e97fc193
AL
2249/*
2250 * Run consistency checks on an image
2251 *
e076f338 2252 * Returns 0 if the check could be completed (it doesn't mean that the image is
a1c7273b 2253 * free of errors) or -errno when an internal error occurred. The results of the
e076f338 2254 * check are stored in res.
e97fc193 2255 */
4534ff54 2256int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
e97fc193 2257{
908bcd54
HR
2258 if (bs->drv == NULL) {
2259 return -ENOMEDIUM;
2260 }
e97fc193
AL
2261 if (bs->drv->bdrv_check == NULL) {
2262 return -ENOTSUP;
2263 }
2264
e076f338 2265 memset(res, 0, sizeof(*res));
4534ff54 2266 return bs->drv->bdrv_check(bs, res, fix);
e97fc193
AL
2267}
2268
/* Sectors copied per iteration of the bdrv_commit() loop. */
#define COMMIT_BUF_SECTORS 2048

/* commit COW file into the raw image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    int64_t sector, total_sectors, length, backing_length;
    int n, ro, open_flags;
    int ret = 0;
    uint8_t *buf = NULL;

    if (!drv)
        return -ENOMEDIUM;

    /* Nothing to commit into without a backing file. */
    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    /* Refuse if another user (e.g. a block job) has blocked commit. */
    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT_SOURCE, NULL) ||
        bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT_TARGET, NULL)) {
        return -EBUSY;
    }

    ro = bs->backing_hd->read_only;
    open_flags =  bs->backing_hd->open_flags;

    /* Temporarily reopen a read-only backing file read-write; restored in
     * ro_cleanup below. */
    if (ro) {
        if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
            return -EACCES;
        }
    }

    length = bdrv_getlength(bs);
    if (length < 0) {
        ret = length;
        goto ro_cleanup;
    }

    backing_length = bdrv_getlength(bs->backing_hd);
    if (backing_length < 0) {
        ret = backing_length;
        goto ro_cleanup;
    }

    /* If our top snapshot is larger than the backing file image,
     * grow the backing file image if possible.  If not possible,
     * we must return an error */
    if (length > backing_length) {
        ret = bdrv_truncate(bs->backing_hd, length);
        if (ret < 0) {
            goto ro_cleanup;
        }
    }

    total_sectors = length >> BDRV_SECTOR_BITS;

    /* qemu_try_blockalign() for bs will choose an alignment that works for
     * bs->backing_hd as well, so no need to compare the alignment manually. */
    buf = qemu_try_blockalign(bs, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
    if (buf == NULL) {
        ret = -ENOMEM;
        goto ro_cleanup;
    }

    /* Copy down only the ranges actually allocated in the top image;
     * bdrv_is_allocated() sets n to the length of the current run. */
    for (sector = 0; sector < total_sectors; sector += n) {
        ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
        if (ret < 0) {
            goto ro_cleanup;
        }
        if (ret) {
            ret = bdrv_read(bs, sector, buf, n);
            if (ret < 0) {
                goto ro_cleanup;
            }

            ret = bdrv_write(bs->backing_hd, sector, buf, n);
            if (ret < 0) {
                goto ro_cleanup;
            }
        }
    }

    /* Drop the now-redundant data from the top image if the format
     * supports it. */
    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        if (ret < 0) {
            goto ro_cleanup;
        }
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd) {
        bdrv_flush(bs->backing_hd);
    }

    ret = 0;
ro_cleanup:
    qemu_vfree(buf);

    if (ro) {
        /* ignoring error return here */
        bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
    }

    return ret;
}
2378
/* Commit every open COW image into its backing file.
 *
 * Stops and returns the first error; nodes without a driver or a backing
 * file are skipped.  Each node's AioContext is held across its commit.
 */
int bdrv_commit_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        if (bs->drv && bs->backing_hd) {
            int ret = bdrv_commit(bs);
            if (ret < 0) {
                aio_context_release(aio_context);
                return ret;
            }
        }
        aio_context_release(aio_context);
    }
    return 0;
}
2398
/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    /* A serialising request holds a slot in serialising_in_flight;
     * release it so wait_serialising_requests() can fast-path again. */
    if (req->serialising) {
        req->bs->serialising_in_flight--;
    }

    QLIST_REMOVE(req, list);
    /* Wake every request that blocked on this one in
     * wait_serialising_requests(). */
    qemu_co_queue_restart_all(&req->wait_queue);
}
2413
/**
 * Add an active request to the tracked requests list
 *
 * Initialises @req for a request of @bytes bytes at byte @offset and links
 * it onto bs->tracked_requests.  The overlap range initially equals the
 * request range; mark_request_serialising() may widen it later.
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  unsigned int bytes, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset         = offset,
        .bytes          = bytes,
        .is_write       = is_write,
        .co             = qemu_coroutine_self(),
        .serialising    = false,
        .overlap_offset = offset,
        .overlap_bytes  = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}
2437
/* Mark @req as serialising and widen its overlap range to @align boundaries
 * (@align is expected to be a power of two, as the bit-mask rounding shows).
 * Serialising requests force overlapping requests to wait, see
 * wait_serialising_requests(). */
static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
{
    int64_t overlap_offset = req->offset & ~(align - 1);
    unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
                               - overlap_offset;

    /* Count each request at most once, even if marked repeatedly. */
    if (!req->serialising) {
        req->bs->serialising_in_flight++;
        req->serialising = true;
    }

    /* Only ever grow the overlap range, never shrink it. */
    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
}
2452
d83947ac
SH
2453/**
2454 * Round a region to cluster boundaries
2455 */
343bded4
PB
2456void bdrv_round_to_clusters(BlockDriverState *bs,
2457 int64_t sector_num, int nb_sectors,
2458 int64_t *cluster_sector_num,
2459 int *cluster_nb_sectors)
d83947ac
SH
2460{
2461 BlockDriverInfo bdi;
2462
2463 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2464 *cluster_sector_num = sector_num;
2465 *cluster_nb_sectors = nb_sectors;
2466 } else {
2467 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2468 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2469 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2470 nb_sectors, c);
2471 }
2472}
2473
7327145f 2474static int bdrv_get_cluster_size(BlockDriverState *bs)
793ed47a
KW
2475{
2476 BlockDriverInfo bdi;
7327145f 2477 int ret;
793ed47a 2478
7327145f
KW
2479 ret = bdrv_get_info(bs, &bdi);
2480 if (ret < 0 || bdi.cluster_size == 0) {
2481 return bs->request_alignment;
793ed47a 2482 } else {
7327145f 2483 return bdi.cluster_size;
793ed47a
KW
2484 }
2485}
2486
f4658285 2487static bool tracked_request_overlaps(BdrvTrackedRequest *req,
793ed47a
KW
2488 int64_t offset, unsigned int bytes)
2489{
d83947ac 2490 /* aaaa bbbb */
7327145f 2491 if (offset >= req->overlap_offset + req->overlap_bytes) {
d83947ac
SH
2492 return false;
2493 }
2494 /* bbbb aaaa */
7327145f 2495 if (req->overlap_offset >= offset + bytes) {
d83947ac
SH
2496 return false;
2497 }
2498 return true;
f4658285
SH
2499}
2500
/* Block the calling coroutine until no serialising request overlaps @self.
 *
 * Returns true if we actually had to wait at least once (callers use this
 * to know the request ordering may have changed under them).  Must be
 * called from coroutine context.
 */
static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    BdrvTrackedRequest *req;
    bool retry;
    bool waited = false;

    /* Fast path: nothing serialising means nothing can conflict. */
    if (!bs->serialising_in_flight) {
        return false;
    }

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            /* Only pairs where at least one side serialises conflict. */
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case). */
                if (!req->waiting_for) {
                    self->waiting_for = req;
                    qemu_co_queue_wait(&req->wait_queue);
                    self->waiting_for = NULL;
                    retry = true;
                    waited = true;
                    /* The list may have changed while we slept: rescan. */
                    break;
                }
            }
        }
    } while (retry);

    return waited;
}
2544
/*
 * Return values:
 * 0        - success
 * -EINVAL  - backing format specified, but no file
 * -ENOSPC  - can't update the backing file because no space is left in the
 *            image file header
 * -ENOTSUP - format driver doesn't support changing the backing file
 */
int bdrv_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    BlockDriver *drv = bs->drv;
    int ret;

    /* Backing file format doesn't make sense without a backing file */
    if (backing_fmt && !backing_file) {
        return -EINVAL;
    }

    if (drv->bdrv_change_backing_file != NULL) {
        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
    } else {
        ret = -ENOTSUP;
    }

    /* Keep the in-memory copies in sync with the on-disk header; NULL
     * arguments are stored as empty strings (GCC ?: extension). */
    if (ret == 0) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
    }
    return ret;
}
2576
6ebdcee2
JC
2577/*
2578 * Finds the image layer in the chain that has 'bs' as its backing file.
2579 *
2580 * active is the current topmost image.
2581 *
2582 * Returns NULL if bs is not found in active's image chain,
2583 * or if active == bs.
4caf0fcd
JC
2584 *
2585 * Returns the bottommost base image if bs == NULL.
6ebdcee2
JC
2586 */
2587BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2588 BlockDriverState *bs)
2589{
4caf0fcd
JC
2590 while (active && bs != active->backing_hd) {
2591 active = active->backing_hd;
6ebdcee2
JC
2592 }
2593
4caf0fcd
JC
2594 return active;
2595}
6ebdcee2 2596
/* Given a BDS, searches for the base layer. */
/* (Implemented as bdrv_find_overlay with bs == NULL, which walks to the
 * bottommost image of the chain.) */
BlockDriverState *bdrv_find_base(BlockDriverState *bs)
{
    return bdrv_find_overlay(bs, NULL);
}
2602
/* Queue node used by bdrv_drop_intermediate() to remember the BDSes that
 * will be dropped once the backing link has been rewritten. */
typedef struct BlkIntermediateStates {
    BlockDriverState *bs;
    QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
} BlkIntermediateStates;


/*
 * Drops images above 'base' up to and including 'top', and sets the image
 * above 'top' to have base as its backing file.
 *
 * Requires that the overlay to 'top' is opened r/w, so that the backing file
 * information in 'bs' can be properly updated.
 *
 * E.g., this will convert the following chain:
 * bottom <- base <- intermediate <- top <- active
 *
 * to
 *
 * bottom <- base <- active
 *
 * It is allowed for bottom==base, in which case it converts:
 *
 * base <- intermediate <- top <- active
 *
 * to
 *
 * base <- active
 *
 * If backing_file_str is non-NULL, it will be used when modifying top's
 * overlay image metadata.
 *
 * Error conditions:
 *  if active == top, that is considered an error
 *
 */
int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
                           BlockDriverState *base, const char *backing_file_str)
{
    BlockDriverState *intermediate;
    BlockDriverState *base_bs = NULL;
    BlockDriverState *new_top_bs = NULL;
    BlkIntermediateStates *intermediate_state, *next;
    int ret = -EIO;

    QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
    QSIMPLEQ_INIT(&states_to_delete);

    if (!top->drv || !base->drv) {
        goto exit;
    }

    new_top_bs = bdrv_find_overlay(active, top);

    if (new_top_bs == NULL) {
        /* we could not find the image above 'top', this is an error */
        goto exit;
    }

    /* special case of new_top_bs->backing_hd already pointing to base - nothing
     * to do, no intermediate images */
    if (new_top_bs->backing_hd == base) {
        ret = 0;
        goto exit;
    }

    intermediate = top;

    /* now we will go down through the list, and add each BDS we find
     * into our deletion queue, until we hit the 'base'
     */
    while (intermediate) {
        intermediate_state = g_new0(BlkIntermediateStates, 1);
        intermediate_state->bs = intermediate;
        QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);

        if (intermediate->backing_hd == base) {
            base_bs = intermediate->backing_hd;
            break;
        }
        intermediate = intermediate->backing_hd;
    }
    if (base_bs == NULL) {
        /* something went wrong, we did not end at the base. safely
         * unravel everything, and exit with error */
        goto exit;
    }

    /* success - we can delete the intermediate states, and link top->base */
    backing_file_str = backing_file_str ? backing_file_str : base_bs->filename;
    ret = bdrv_change_backing_file(new_top_bs, backing_file_str,
                                   base_bs->drv ? base_bs->drv->format_name : "");
    if (ret) {
        goto exit;
    }
    bdrv_set_backing_hd(new_top_bs, base_bs);

    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        /* so that bdrv_close() does not recursively close the chain */
        bdrv_set_backing_hd(intermediate_state->bs, NULL);
        bdrv_unref(intermediate_state->bs);
    }
    ret = 0;

exit:
    /* Free the bookkeeping nodes on both success and error paths. */
    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        g_free(intermediate_state);
    }
    return ret;
}
2712
2713
/* Validate a byte-granularity request: bounded size, medium present,
 * non-negative offset.  Returns 0 on success or a negative errno.
 * NOTE(review): offset + size is not checked for int64 overflow here —
 * presumably guarded by the size cap and callers; confirm before reuse. */
static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
        return -EIO;
    }

    if (!bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    if (offset < 0) {
        return -EIO;
    }

    return 0;
}
2731
/* Sector-granularity wrapper around bdrv_check_byte_request(): rejects
 * negative or oversized sector counts, then validates the byte range. */
static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
        return -EIO;
    }

    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}
2742
/* Parameter/result bundle for a synchronous request run in a coroutine. */
typedef struct RwCo {
    BlockDriverState *bs;
    int64_t offset;         /* byte offset of the request */
    QEMUIOVector *qiov;     /* buffer; qiov->size is the request length */
    bool is_write;
    int ret;                /* result; NOT_DONE until the coroutine finishes */
    BdrvRequestFlags flags;
} RwCo;

/* Coroutine entry point for bdrv_prwv_co(): dispatch to the coroutine
 * read or write path and store the result in the RwCo. */
static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
                                      rwco->qiov->size, rwco->qiov,
                                      rwco->flags);
    } else {
        rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
                                       rwco->qiov->size, rwco->qiov,
                                       rwco->flags);
    }
}
e7a8a783 2766
/*
 * Process a vectored synchronous request using coroutines
 */
/* Returns the request's completion status (0/-errno style from the
 * coroutine read/write path). */
static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
                        QEMUIOVector *qiov, bool is_write,
                        BdrvRequestFlags flags)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .offset = offset,
        .qiov = qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
        .flags = flags,
    };

    /**
     * In sync call context, when the vcpu is blocked, this throttling timer
     * will not fire; so the I/O throttling function has to be disabled here
     * if it has been enabled.
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        /* Spawn the coroutine and poll its AioContext until it signals
         * completion by overwriting rwco.ret. */
        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }
    return rwco.ret;
}
b338082b 2809
/*
 * Process a synchronous request using coroutines
 */
/* Sector-based convenience wrapper: builds a single-element QEMUIOVector
 * over @buf and forwards to bdrv_prwv_co() in byte terms. */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write, BdrvRequestFlags flags)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };

    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
                        &qiov, is_write, flags);
}
2830
/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
}
2837
/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
                          uint8_t *buf, int nb_sectors)
{
    bool enabled;
    int ret;

    /* Save and clear the throttling flag, then restore it afterwards so
     * only this one read bypasses the limits. */
    enabled = bs->io_limits_enabled;
    bs->io_limits_enabled = false;
    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
    bs->io_limits_enabled = enabled;
    return ret;
}
2851
/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    /* Cast away const: the write path does not modify the buffer. */
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
}
2863
/* Write zeroes to nb_sectors starting at sector_num.  @flags is passed
 * through (e.g. BDRV_REQ_MAY_UNMAP) on top of BDRV_REQ_ZERO_WRITE; no
 * data buffer is needed. */
int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
                      int nb_sectors, BdrvRequestFlags flags)
{
    return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
                      BDRV_REQ_ZERO_WRITE | flags);
}
2870
/*
 * Completely zero out a block device with the help of bdrv_write_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
 */
int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
{
    int64_t target_sectors, ret, nb_sectors, sector_num = 0;
    int n;

    target_sectors = bdrv_nb_sectors(bs);
    if (target_sectors < 0) {
        return target_sectors;
    }

    for (;;) {
        /* Process at most one maximum-sized request per iteration. */
        nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS);
        if (nb_sectors <= 0) {
            return 0;
        }
        ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
        if (ret < 0) {
            error_report("error getting block status at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        /* Skip ranges that already read back as zeroes. */
        if (ret & BDRV_BLOCK_ZERO) {
            sector_num += n;
            continue;
        }
        ret = bdrv_write_zeroes(bs, sector_num, n, flags);
        if (ret < 0) {
            error_report("error writing zeroes at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        sector_num += n;
    }
}
2913
a3ef6571 2914int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
83f64091 2915{
a3ef6571
KW
2916 QEMUIOVector qiov;
2917 struct iovec iov = {
2918 .iov_base = (void *)buf,
2919 .iov_len = bytes,
2920 };
9a8c4cce 2921 int ret;
83f64091 2922
a3ef6571
KW
2923 if (bytes < 0) {
2924 return -EINVAL;
83f64091
FB
2925 }
2926
a3ef6571
KW
2927 qemu_iovec_init_external(&qiov, &iov, 1);
2928 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2929 if (ret < 0) {
2930 return ret;
83f64091 2931 }
a3ef6571
KW
2932
2933 return bytes;
83f64091
FB
2934}
2935
8d3b1a2d 2936int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
83f64091 2937{
9a8c4cce 2938 int ret;
83f64091 2939
8407d5d7
KW
2940 ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2941 if (ret < 0) {
2942 return ret;
83f64091
FB
2943 }
2944
8d3b1a2d
KW
2945 return qiov->size;
2946}
2947
/* Synchronous byte-granularity write of @bytes bytes from @buf at @offset.
 * Returns the number of bytes written on success, or a negative errno. */
int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base   = (void *) buf,
        .iov_len    = bytes,
    };

    if (bytes < 0) {
        return -EINVAL;
    }

    /* Wrap the caller's buffer in a single-element iovec and reuse the
     * vectored write path. */
    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_pwritev(bs, offset, &qiov);
}
83f64091 2964
/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
                     const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(bs, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    /* No flush needed for cache modes that already do it */
    if (bs->enable_write_cache) {
        bdrv_flush(bs);
    }

    return 0;
}
2988
/*
 * Copy-on-read implementation: read a whole cluster through a private bounce
 * buffer, write it back into the image file, then copy the requested slice
 * into the caller's qiov.  Must be called in coroutine context.
 *
 * Returns 0 on success, -errno on failure (including -ENOMEM if the bounce
 * buffer cannot be allocated).
 */
static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
    if (bounce_buffer == NULL) {
        ret = -ENOMEM;
        goto err;    /* qemu_vfree(NULL) below is a no-op */
    }

    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    /* Read the full cluster from the image (or its backing chain) */
    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    /* Prefer an efficient zero write when the cluster is all zeroes and the
     * driver supports it; otherwise write the bounce buffer back verbatim. */
    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                      cluster_nb_sectors, 0);
    } else {
        /* This does not change the data on the disk, it is not necessary
         * to flush even in cache=writethrough mode.
         */
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    /* Copy only the slice the caller actually asked for out of the cluster */
    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
                        nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}
3059
/*
 * Forwards an already correctly aligned request to the BlockDriver. This
 * handles copy on read and zeroing after EOF; any other features must be
 * implemented by the caller.
 *
 * @offset and @bytes must be multiples of BDRV_SECTOR_SIZE (asserted);
 * @req is the already-begun tracked request covering this I/O.
 * Returns 0 on success, -errno on failure.
 */
static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    int ret;

    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert(!qiov || bytes == qiov->size);

    /* Handle Copy on Read and associated serialisation */
    if (flags & BDRV_REQ_COPY_ON_READ) {
        /* If we touch the same cluster it counts as an overlap.  This
         * guarantees that allocating writes will be serialized and not race
         * with each other for the same cluster.  For example, in copy-on-read
         * it ensures that the CoR read and write operations are atomic and
         * guest writes cannot interleave between them. */
        mark_request_serialising(req, bdrv_get_cluster_size(bs));
    }

    wait_serialising_requests(req);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int pnum;

        /* Only go through the CoR path if part of the range is unallocated */
        ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
            goto out;
        }
    }

    /* Forward the request to the BlockDriver */
    if (!bs->zero_beyond_eof) {
        ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
    } else {
        /* Read zeros after EOF */
        int64_t total_sectors, max_nb_sectors;

        total_sectors = bdrv_nb_sectors(bs);
        if (total_sectors < 0) {
            ret = total_sectors;
            goto out;
        }

        /* Largest in-bounds read, rounded up to the request alignment */
        max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
                                  align >> BDRV_SECTOR_BITS);
        if (nb_sectors < max_nb_sectors) {
            ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
        } else if (max_nb_sectors > 0) {
            /* Clamp the driver read to the in-bounds prefix of qiov */
            QEMUIOVector local_qiov;

            qemu_iovec_init(&local_qiov, qiov->niov);
            qemu_iovec_concat(&local_qiov, qiov, 0,
                              max_nb_sectors * BDRV_SECTOR_SIZE);

            ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors,
                                     &local_qiov);

            qemu_iovec_destroy(&local_qiov);
        } else {
            /* Entire request is beyond EOF: nothing to read from the driver */
            ret = 0;
        }

        /* Reading beyond end of file is supposed to produce zeroes */
        if (ret == 0 && total_sectors < sector_num + nb_sectors) {
            /* NOTE: these locals shadow the byte-based parameters; here
             * they are measured in sectors relative to the request start. */
            uint64_t offset = MAX(0, total_sectors - sector_num);
            uint64_t bytes = (sector_num + nb_sectors - offset) *
                              BDRV_SECTOR_SIZE;
            qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
        }
    }

out:
    return ret;
}
3149
fc3959e4
FZ
3150static inline uint64_t bdrv_get_align(BlockDriverState *bs)
3151{
3152 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3153 return MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3154}
3155
3156static inline bool bdrv_req_is_aligned(BlockDriverState *bs,
3157 int64_t offset, size_t bytes)
3158{
3159 int64_t align = bdrv_get_align(bs);
3160 return !(offset & (align - 1) || (bytes & (align - 1)));
3161}
3162
/*
 * Handle a read request in coroutine context
 *
 * Checks the request, applies copy-on-read and I/O throttling, pads the
 * request to the block driver's alignment by prepending/appending bounce
 * buffers to @qiov, then forwards it inside a tracked request.
 * Returns 0 on success, -errno on failure.
 */
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;

    uint64_t align = bdrv_get_align(bs);
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    }

    if (bs->copy_on_read) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, bytes, false);
    }

    /* Align read if necessary by padding qiov */
    if (offset & (align - 1)) {
        /* Unaligned head: read the preceding bytes into a scratch buffer
         * placed before the caller's data in local_qiov. */
        head_buf = qemu_blockalign(bs, align);
        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        /* Unaligned tail: append a scratch buffer to absorb the overshoot */
        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }
        tail_buf = qemu_blockalign(bs, align);
        qemu_iovec_add(&local_qiov, tail_buf,
                       align - ((offset + bytes) & (align - 1)));

        bytes = ROUND_UP(bytes, align);
    }

    /* Track the (now aligned) request for serialisation purposes */
    tracked_request_begin(&req, bs, offset, bytes, false);
    ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
                              use_local_qiov ? &local_qiov : qiov,
                              flags);
    tracked_request_end(&req);

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
        qemu_vfree(head_buf);
        qemu_vfree(tail_buf);
    }

    return ret;
}
3237
1b0288ae
KW
3238static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3239 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3240 BdrvRequestFlags flags)
3241{
75af1f34 3242 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
1b0288ae
KW
3243 return -EINVAL;
3244 }
3245
3246 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3247 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3248}
3249
c5fbe571 3250int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
da1fa91d
KW
3251 int nb_sectors, QEMUIOVector *qiov)
3252{
c5fbe571 3253 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
da1fa91d 3254
470c0504
SH
3255 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3256}
3257
3258int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3259 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3260{
3261 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3262
3263 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3264 BDRV_REQ_COPY_ON_READ);
c5fbe571
SH
3265}
3266
/* Cap (in bytes) on the zeroed bounce buffer used when a driver has no
 * efficient write-zeroes operation. */
#define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768

/*
 * Write zeroes to [sector_num, sector_num + nb_sectors), chunked to the
 * driver's alignment and size limits.  Tries the driver's efficient
 * bdrv_co_write_zeroes first and falls back to writing a zeroed bounce
 * buffer.  Returns 0 on success, -errno on failure.
 */
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov = {0};  /* iov_base == NULL means "no bounce buffer yet" */
    int ret = 0;

    int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes,
                                        BDRV_REQUEST_MAX_SECTORS);

    /* Loop ends on completion or on the first error (ret != 0) */
    while (nb_sectors > 0 && !ret) {
        int num = nb_sectors;

        /* Align request.  Block drivers can expect the "bulk" of the request
         * to be aligned.
         */
        if (bs->bl.write_zeroes_alignment
            && num > bs->bl.write_zeroes_alignment) {
            if (sector_num % bs->bl.write_zeroes_alignment != 0) {
                /* Make a small request up to the first aligned sector. */
                num = bs->bl.write_zeroes_alignment;
                num -= sector_num % bs->bl.write_zeroes_alignment;
            } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
                /* Shorten the request to the last aligned sector.  num cannot
                 * underflow because num > bs->bl.write_zeroes_alignment.
                 */
                num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
            }
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_write_zeroes) {
            ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
        }

        if (ret == -ENOTSUP) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length,
                                            MAX_WRITE_ZEROES_BOUNCE_BUFFER);
            num = MIN(num, max_xfer_len);
            iov.iov_len = num * BDRV_SECTOR_SIZE;
            if (iov.iov_base == NULL) {
                /* (Re)allocate and zero the bounce buffer lazily */
                iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
                if (iov.iov_base == NULL) {
                    ret = -ENOMEM;
                    goto fail;
                }
                memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
            }
            qemu_iovec_init_external(&qiov, &iov, 1);

            ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);

            /* Keep bounce buffer around if it is big enough for all
             * future requests.
             */
            if (num < max_xfer_len) {
                qemu_vfree(iov.iov_base);
                iov.iov_base = NULL;
            }
        }

        sector_num += num;
        nb_sectors -= num;
    }

fail:
    qemu_vfree(iov.iov_base);
    return ret;
}
3346
/*
 * Forwards an already correctly aligned write request to the BlockDriver.
 *
 * Runs the before-write notifiers, optionally converts an all-zero payload
 * into a zero write (detect-zeroes), flushes in writethrough mode, and
 * updates dirty bitmaps and accounting.  @offset and @bytes must be
 * sector-aligned (asserted).  Returns 0 on success, -errno on failure.
 */
static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    bool waited;
    int ret;

    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert(!qiov || bytes == qiov->size);

    /* The request must stay inside the region the caller serialised */
    waited = wait_serialising_requests(req);
    assert(!waited || !req->serialising);
    assert(req->overlap_offset <= offset);
    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);

    /* Notifiers may veto the request by returning an error */
    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);

    if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
        !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
        qemu_iovec_is_zero(qiov)) {
        /* detect-zeroes: turn an all-zero data write into a zero write */
        flags |= BDRV_REQ_ZERO_WRITE;
        if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
            flags |= BDRV_REQ_MAY_UNMAP;
        }
    }

    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
    } else {
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }
    BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);

    /* Writethrough cache mode: make the write durable before returning */
    if (ret == 0 && !bs->enable_write_cache) {
        ret = bdrv_co_flush(bs);
    }

    bdrv_set_dirty(bs, sector_num, nb_sectors);

    block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);

    if (ret >= 0) {
        /* A successful write may have grown the image */
        bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
    }

    return ret;
}
3406
/*
 * Handle a write request in coroutine context
 *
 * Validates the request, applies throttling, and if the request is not
 * aligned to the driver's requirement performs a read-modify-write cycle:
 * the unaligned head and/or tail are read into scratch buffers, merged
 * with the caller's data in local_qiov, and the widened request is
 * forwarded to bdrv_aligned_pwritev().  Returns 0 on success, -errno on
 * failure.
 */
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BdrvTrackedRequest req;
    uint64_t align = bdrv_get_align(bs);
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EACCES;
    }

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, bytes, true);
    }

    /*
     * Align write if necessary by performing a read-modify-write cycle.
     * Pad qiov with the read parts and be sure to have a tracked request not
     * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
     */
    tracked_request_begin(&req, bs, offset, bytes, true);

    if (offset & (align - 1)) {
        /* RMW head: read the aligned block containing the request start */
        QEMUIOVector head_qiov;
        struct iovec head_iov;

        mark_request_serialising(&req, align);
        wait_serialising_requests(&req);

        head_buf = qemu_blockalign(bs, align);
        head_iov = (struct iovec) {
            .iov_base   = head_buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&head_qiov, &head_iov, 1);

        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
        ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
                                  align, &head_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);

        /* Prepend the read head bytes to the caller's data */
        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        /* RMW tail: read the aligned block containing the request end */
        QEMUIOVector tail_qiov;
        struct iovec tail_iov;
        size_t tail_bytes;
        bool waited;

        mark_request_serialising(&req, align);
        waited = wait_serialising_requests(&req);
        assert(!waited || !use_local_qiov);

        tail_buf = qemu_blockalign(bs, align);
        tail_iov = (struct iovec) {
            .iov_base   = tail_buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);

        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
        ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
                                  align, &tail_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);

        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }

        /* Append only the part of the tail block beyond the caller's data */
        tail_bytes = (offset + bytes) & (align - 1);
        qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);

        bytes = ROUND_UP(bytes, align);
    }

    if (use_local_qiov) {
        /* Local buffer may have non-zero data. */
        flags &= ~BDRV_REQ_ZERO_WRITE;
    }
    ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
                               use_local_qiov ? &local_qiov : qiov,
                               flags);

fail:
    tracked_request_end(&req);

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
    }
    qemu_vfree(head_buf);
    qemu_vfree(tail_buf);

    return ret;
}
3533
6601553e
KW
3534static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3535 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3536 BdrvRequestFlags flags)
3537{
75af1f34 3538 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
6601553e
KW
3539 return -EINVAL;
3540 }
3541
3542 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3543 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3544}
3545
c5fbe571
SH
3546int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3547 int nb_sectors, QEMUIOVector *qiov)
3548{
3549 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3550
f08f2dda
SH
3551 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3552}
3553
/*
 * Zero a sector range in coroutine context.
 *
 * Aligned requests take the efficient BDRV_REQ_ZERO_WRITE path with a NULL
 * qiov; unaligned requests fall back to writing an explicit zero-filled
 * buffer (so the RMW padding in the write path sees real data).
 * BDRV_REQ_MAY_UNMAP is honoured only if the image was opened with
 * BDRV_O_UNMAP.  Returns 0 on success, -errno on failure.
 */
int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors,
                                      BdrvRequestFlags flags)
{
    int ret;

    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);

    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        flags &= ~BDRV_REQ_MAY_UNMAP;
    }
    if (bdrv_req_is_aligned(bs, sector_num << BDRV_SECTOR_BITS,
                            nb_sectors << BDRV_SECTOR_BITS)) {
        ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                                BDRV_REQ_ZERO_WRITE | flags);
    } else {
        /* Unaligned: write an explicit buffer of zeroes */
        uint8_t *buf;
        QEMUIOVector local_qiov;
        size_t bytes = nb_sectors << BDRV_SECTOR_BITS;

        buf = qemu_memalign(bdrv_opt_mem_align(bs), bytes);
        memset(buf, 0, bytes);
        qemu_iovec_init(&local_qiov, 1);
        qemu_iovec_add(&local_qiov, buf, bytes);

        ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, &local_qiov,
                                BDRV_REQ_ZERO_WRITE | flags);
        qemu_vfree(buf);
    }
    return ret;
}
3585
83f64091
FB
3586/**
3587 * Truncate file to 'offset' bytes (needed only for file protocols)
3588 */
3589int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3590{
3591 BlockDriver *drv = bs->drv;
51762288 3592 int ret;
83f64091 3593 if (!drv)
19cb3738 3594 return -ENOMEDIUM;
83f64091
FB
3595 if (!drv->bdrv_truncate)
3596 return -ENOTSUP;
59f2689d
NS
3597 if (bs->read_only)
3598 return -EACCES;
9c75e168 3599
51762288
SH
3600 ret = drv->bdrv_truncate(bs, offset);
3601 if (ret == 0) {
3602 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
a7f53e26
MA
3603 if (bs->blk) {
3604 blk_dev_resize_cb(bs->blk);
3605 }
51762288
SH
3606 }
3607 return ret;
83f64091
FB
3608}
3609
4a1d5e1f
FZ
3610/**
3611 * Length of a allocated file in bytes. Sparse files are counted by actual
3612 * allocated space. Return < 0 if error or unknown.
3613 */
3614int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3615{
3616 BlockDriver *drv = bs->drv;
3617 if (!drv) {
3618 return -ENOMEDIUM;
3619 }
3620 if (drv->bdrv_get_allocated_file_size) {
3621 return drv->bdrv_get_allocated_file_size(bs);
3622 }
3623 if (bs->file) {
3624 return bdrv_get_allocated_file_size(bs->file);
3625 }
3626 return -ENOTSUP;
3627}
3628
83f64091 3629/**
65a9bb25 3630 * Return number of sectors on success, -errno on error.
83f64091 3631 */
65a9bb25 3632int64_t bdrv_nb_sectors(BlockDriverState *bs)
83f64091
FB
3633{
3634 BlockDriver *drv = bs->drv;
65a9bb25 3635
83f64091 3636 if (!drv)
19cb3738 3637 return -ENOMEDIUM;
51762288 3638
b94a2610
KW
3639 if (drv->has_variable_length) {
3640 int ret = refresh_total_sectors(bs, bs->total_sectors);
3641 if (ret < 0) {
3642 return ret;
46a4e4e6 3643 }
83f64091 3644 }
65a9bb25
MA
3645 return bs->total_sectors;
3646}
3647
3648/**
3649 * Return length in bytes on success, -errno on error.
3650 * The length is always a multiple of BDRV_SECTOR_SIZE.
3651 */
3652int64_t bdrv_getlength(BlockDriverState *bs)
3653{
3654 int64_t ret = bdrv_nb_sectors(bs);
3655
3656 return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE;
fc01f7e7
FB
3657}
3658
19cb3738 3659/* return 0 as number of sectors if no device present or error */
96b8f136 3660void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
fc01f7e7 3661{
65a9bb25
MA
3662 int64_t nb_sectors = bdrv_nb_sectors(bs);
3663
3664 *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
fc01f7e7 3665}
cf98951b 3666
ff06f5f3
PB
3667void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3668 BlockdevOnError on_write_error)
abd7f68d
MA
3669{
3670 bs->on_read_error = on_read_error;
3671 bs->on_write_error = on_write_error;
3672}
3673
1ceee0d5 3674BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
abd7f68d
MA
3675{
3676 return is_read ? bs->on_read_error : bs->on_write_error;
3677}
3678
3e1caa5f
PB
3679BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3680{
3681 BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3682
3683 switch (on_err) {
3684 case BLOCKDEV_ON_ERROR_ENOSPC:
a589569f
WX
3685 return (error == ENOSPC) ?
3686 BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
3e1caa5f 3687 case BLOCKDEV_ON_ERROR_STOP:
a589569f 3688 return BLOCK_ERROR_ACTION_STOP;
3e1caa5f 3689 case BLOCKDEV_ON_ERROR_REPORT:
a589569f 3690 return BLOCK_ERROR_ACTION_REPORT;
3e1caa5f 3691 case BLOCKDEV_ON_ERROR_IGNORE:
a589569f 3692 return BLOCK_ERROR_ACTION_IGNORE;
3e1caa5f
PB
3693 default:
3694 abort();
3695 }
3696}
3697
c7c2ff0c
LC
3698static void send_qmp_error_event(BlockDriverState *bs,
3699 BlockErrorAction action,
3700 bool is_read, int error)
3701{
573742a5 3702 IoOperationType optype;
c7c2ff0c 3703
573742a5
PM
3704 optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
3705 qapi_event_send_block_io_error(bdrv_get_device_name(bs), optype, action,
c7c2ff0c 3706 bdrv_iostatus_is_enabled(bs),
624ff573
LC
3707 error == ENOSPC, strerror(error),
3708 &error_abort);
c7c2ff0c
LC
3709}
3710
/* This is done by device models because, while the block layer knows
 * about the error, it does not know whether an operation comes from
 * the device or the block layer (from a job, for example).
 *
 * For BLOCK_ERROR_ACTION_STOP the ordering below (iostatus, then
 * vmstop-request-prepare, then event, then vmstop request) is deliberate
 * and must not be changed -- see the inline comments.
 */
void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
                       bool is_read, int error)
{
    assert(error >= 0);

    if (action == BLOCK_ERROR_ACTION_STOP) {
        /* First set the iostatus, so that "info block" returns an iostatus
         * that matches the events raised so far (an additional error iostatus
         * is fine, but not a lost one).
         */
        bdrv_iostatus_set_err(bs, error);

        /* Then raise the request to stop the VM and the event.
         * qemu_system_vmstop_request_prepare has two effects.  First,
         * it ensures that the STOP event always comes after the
         * BLOCK_IO_ERROR event.  Second, it ensures that even if management
         * can observe the STOP event and do a "cont" before the STOP
         * event is issued, the VM will not stop.  In this case, vm_start()
         * also ensures that the STOP/RESUME pair of events is emitted.
         */
        qemu_system_vmstop_request_prepare();
        send_qmp_error_event(bs, action, is_read, error);
        qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
    } else {
        send_qmp_error_event(bs, action, is_read, error);
    }
}
3742
/* Return non-zero if @bs was opened read-only. */
int bdrv_is_read_only(BlockDriverState *bs)
{
    return bs->read_only;
}
3747
/* Return non-zero if @bs is a SCSI generic (pass-through) device. */
int bdrv_is_sg(BlockDriverState *bs)
{
    return bs->sg;
}
3752
/* Return non-zero if the write cache (writeback mode) is enabled. */
int bdrv_enable_write_cache(BlockDriverState *bs)
{
    return bs->enable_write_cache;
}
3757
425b0148
PB
3758void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3759{
3760 bs->enable_write_cache = wce;
55b110f2
JC
3761
3762 /* so a reopen() will preserve wce */
3763 if (wce) {
3764 bs->open_flags |= BDRV_O_CACHE_WB;
3765 } else {
3766 bs->open_flags &= ~BDRV_O_CACHE_WB;
3767 }
425b0148
PB
3768}
3769
ea2384d3
FB
3770int bdrv_is_encrypted(BlockDriverState *bs)
3771{
3772 if (bs->backing_hd && bs->backing_hd->encrypted)
3773 return 1;
3774 return bs->encrypted;
3775}
3776
c0f4ce77
AL
3777int bdrv_key_required(BlockDriverState *bs)
3778{
3779 BlockDriverState *backing_hd = bs->backing_hd;
3780
3781 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3782 return 1;
3783 return (bs->encrypted && !bs->valid_key);
3784}
3785
/*
 * Set the encryption key for @bs, recursing into the backing file first.
 * If only the backing file is encrypted, the key applies to it alone.
 * Updates bs->valid_key and fires the media-change callback the first
 * time a valid key is set (it was skipped at open time).
 * Returns 0 on success, -EINVAL if not encrypted, -ENOMEDIUM if the
 * driver cannot set keys, or the driver's error.
 */
int bdrv_set_key(BlockDriverState *bs, const char *key)
{
    int ret;
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        ret = bdrv_set_key(bs->backing_hd, key);
        if (ret < 0)
            return ret;
        if (!bs->encrypted)
            return 0;
    }
    if (!bs->encrypted) {
        return -EINVAL;
    } else if (!bs->drv || !bs->drv->bdrv_set_key) {
        return -ENOMEDIUM;
    }
    ret = bs->drv->bdrv_set_key(bs, key);
    if (ret < 0) {
        bs->valid_key = 0;
    } else if (!bs->valid_key) {
        /* Transition from "no valid key" to "valid key" */
        bs->valid_key = 1;
        if (bs->blk) {
            /* call the change callback now, we skipped it on open */
            blk_dev_change_media_cb(bs->blk, true);
        }
    }
    return ret;
}
3813
/*
 * Provide an encryption key for @bs.
 * If @key is non-null:
 *     If @bs is not encrypted, fail.
 *     Else if the key is invalid, fail.
 *     Else set @bs's key to @key, replacing the existing key, if any.
 * If @key is null:
 *     If @bs is encrypted and still lacks a key, fail.
 *     Else do nothing.
 * On failure, store an error object through @errp if non-null.
 */
void bdrv_add_key(BlockDriverState *bs, const char *key, Error **errp)
{
    if (key) {
        if (!bdrv_is_encrypted(bs)) {
            error_setg(errp, "Node '%s' is not encrypted",
                      bdrv_get_device_or_node_name(bs));
        } else if (bdrv_set_key(bs, key) < 0) {
            error_set(errp, QERR_INVALID_PASSWORD);
        }
    } else {
        if (bdrv_key_required(bs)) {
            /* DEVICE_ENCRYPTED error class lets management know a key
             * prompt is needed */
            error_set(errp, ERROR_CLASS_DEVICE_ENCRYPTED,
                      "'%s' (%s) is encrypted",
                      bdrv_get_device_or_node_name(bs),
                      bdrv_get_encrypted_filename(bs));
        }
    }
}
3843
/* Return the format name of @bs's driver, or NULL if no driver is open. */
const char *bdrv_get_format_name(BlockDriverState *bs)
{
    return bs->drv ? bs->drv->format_name : NULL;
}
3848
/* qsort() comparator for an array of strings (const char *).
 *
 * qsort hands us pointers to the array *elements*, i.e. const char **,
 * so we must dereference before comparing; calling strcmp() on the
 * element pointers themselves compared raw pointer bytes and produced
 * a meaningless sort order. */
static int qsort_strcmp(const void *a, const void *b)
{
    return strcmp(*(const char * const *)a, *(const char * const *)b);
}
3853
5fafdf24 3854void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
ea2384d3
FB
3855 void *opaque)
3856{
3857 BlockDriver *drv;
e855e4fb 3858 int count = 0;
ada42401 3859 int i;
e855e4fb 3860 const char **formats = NULL;
ea2384d3 3861
8a22f02a 3862 QLIST_FOREACH(drv, &bdrv_drivers, list) {
e855e4fb
JC
3863 if (drv->format_name) {
3864 bool found = false;
3865 int i = count;
3866 while (formats && i && !found) {
3867 found = !strcmp(formats[--i], drv->format_name);
3868 }
3869
3870 if (!found) {
5839e53b 3871 formats = g_renew(const char *, formats, count + 1);
e855e4fb 3872 formats[count++] = drv->format_name;
e855e4fb
JC
3873 }
3874 }
ea2384d3 3875 }
ada42401
SH
3876
3877 qsort(formats, count, sizeof(formats[0]), qsort_strcmp);
3878
3879 for (i = 0; i < count; i++) {
3880 it(opaque, formats[i]);
3881 }
3882
e855e4fb 3883 g_free(formats);
ea2384d3
FB
3884}
3885
dc364f4c
BC
3886/* This function is to find a node in the bs graph */
3887BlockDriverState *bdrv_find_node(const char *node_name)
3888{
3889 BlockDriverState *bs;
3890
3891 assert(node_name);
3892
3893 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3894 if (!strcmp(node_name, bs->node_name)) {
3895 return bs;
3896 }
3897 }
3898 return NULL;
3899}
3900
c13163fb 3901/* Put this QMP function here so it can access the static graph_bdrv_states. */
d5a8ee60 3902BlockDeviceInfoList *bdrv_named_nodes_list(Error **errp)
c13163fb
BC
3903{
3904 BlockDeviceInfoList *list, *entry;
3905 BlockDriverState *bs;
3906
3907 list = NULL;
3908 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
d5a8ee60
AG
3909 BlockDeviceInfo *info = bdrv_block_device_info(bs, errp);
3910 if (!info) {
3911 qapi_free_BlockDeviceInfoList(list);
3912 return NULL;
3913 }
c13163fb 3914 entry = g_malloc0(sizeof(*entry));
d5a8ee60 3915 entry->value = info;
c13163fb
BC
3916 entry->next = list;
3917 list = entry;
3918 }
3919
3920 return list;
3921}
3922
12d3ba82
BC
3923BlockDriverState *bdrv_lookup_bs(const char *device,
3924 const char *node_name,
3925 Error **errp)
3926{
7f06d47e
MA
3927 BlockBackend *blk;
3928 BlockDriverState *bs;
12d3ba82 3929
12d3ba82 3930 if (device) {
7f06d47e 3931 blk = blk_by_name(device);
12d3ba82 3932
7f06d47e
MA
3933 if (blk) {
3934 return blk_bs(blk);
12d3ba82 3935 }
12d3ba82
BC
3936 }
3937
dd67fa50
BC
3938 if (node_name) {
3939 bs = bdrv_find_node(node_name);
12d3ba82 3940
dd67fa50
BC
3941 if (bs) {
3942 return bs;
3943 }
12d3ba82
BC
3944 }
3945
dd67fa50
BC
3946 error_setg(errp, "Cannot find device=%s nor node_name=%s",
3947 device ? device : "",
3948 node_name ? node_name : "");
3949 return NULL;
12d3ba82
BC
3950}
3951
5a6684d2
JC
3952/* If 'base' is in the same chain as 'top', return true. Otherwise,
3953 * return false. If either argument is NULL, return false. */
3954bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
3955{
3956 while (top && top != base) {
3957 top = top->backing_hd;
3958 }
3959
3960 return top != NULL;
3961}
3962
04df765a
FZ
3963BlockDriverState *bdrv_next_node(BlockDriverState *bs)
3964{
3965 if (!bs) {
3966 return QTAILQ_FIRST(&graph_bdrv_states);
3967 }
3968 return QTAILQ_NEXT(bs, node_list);
3969}
3970
2f399b0a
MA
3971BlockDriverState *bdrv_next(BlockDriverState *bs)
3972{
3973 if (!bs) {
3974 return QTAILQ_FIRST(&bdrv_states);
3975 }
dc364f4c 3976 return QTAILQ_NEXT(bs, device_list);
2f399b0a
MA
3977}
3978
20a9e77d
FZ
3979const char *bdrv_get_node_name(const BlockDriverState *bs)
3980{
3981 return bs->node_name;
3982}
3983
7f06d47e 3984/* TODO check what callers really want: bs->node_name or blk_name() */
bfb197e0 3985const char *bdrv_get_device_name(const BlockDriverState *bs)
ea2384d3 3986{
bfb197e0 3987 return bs->blk ? blk_name(bs->blk) : "";
ea2384d3
FB
3988}
3989
9b2aa84f
AG
3990/* This can be used to identify nodes that might not have a device
3991 * name associated. Since node and device names live in the same
3992 * namespace, the result is unambiguous. The exception is if both are
3993 * absent, then this returns an empty (non-null) string. */
3994const char *bdrv_get_device_or_node_name(const BlockDriverState *bs)
3995{
3996 return bs->blk ? blk_name(bs->blk) : bs->node_name;
3997}
3998
c8433287
MA
3999int bdrv_get_flags(BlockDriverState *bs)
4000{
4001 return bs->open_flags;
4002}
4003
f0f0fdfe 4004int bdrv_flush_all(void)
c6ca28d6
AL
4005{
4006 BlockDriverState *bs;
f0f0fdfe 4007 int result = 0;
c6ca28d6 4008
dc364f4c 4009 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
ed78cda3
SH
4010 AioContext *aio_context = bdrv_get_aio_context(bs);
4011 int ret;
4012
4013 aio_context_acquire(aio_context);
4014 ret = bdrv_flush(bs);
f0f0fdfe
KW
4015 if (ret < 0 && !result) {
4016 result = ret;
4017 }
ed78cda3 4018 aio_context_release(aio_context);
1b7bdbc1 4019 }
f0f0fdfe
KW
4020
4021 return result;
c6ca28d6
AL
4022}
4023
3ac21627
PL
4024int bdrv_has_zero_init_1(BlockDriverState *bs)
4025{
4026 return 1;
4027}
4028
f2feebbd
KW
4029int bdrv_has_zero_init(BlockDriverState *bs)
4030{
4031 assert(bs->drv);
4032
11212d8f
PB
4033 /* If BS is a copy on write image, it is initialized to
4034 the contents of the base image, which may not be zeroes. */
4035 if (bs->backing_hd) {
4036 return 0;
4037 }
336c1c12
KW
4038 if (bs->drv->bdrv_has_zero_init) {
4039 return bs->drv->bdrv_has_zero_init(bs);
f2feebbd
KW
4040 }
4041
3ac21627
PL
4042 /* safe default */
4043 return 0;
f2feebbd
KW
4044}
4045
4ce78691
PL
4046bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
4047{
4048 BlockDriverInfo bdi;
4049
4050 if (bs->backing_hd) {
4051 return false;
4052 }
4053
4054 if (bdrv_get_info(bs, &bdi) == 0) {
4055 return bdi.unallocated_blocks_are_zero;
4056 }
4057
4058 return false;
4059}
4060
4061bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
4062{
4063 BlockDriverInfo bdi;
4064
4065 if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
4066 return false;
4067 }
4068
4069 if (bdrv_get_info(bs, &bdi) == 0) {
4070 return bdi.can_write_zeroes_with_unmap;
4071 }
4072
4073 return false;
4074}
4075
/* State passed between bdrv_get_block_status() and its coroutine entry. */
typedef struct BdrvCoGetBlockStatusData {
    BlockDriverState *bs;    /* node to query */
    BlockDriverState *base;  /* NOTE(review): not read in this path; presumably
                              * for the *_above variants — confirm */
    int64_t sector_num;      /* first sector to query */
    int nb_sectors;          /* upper bound for *pnum */
    int *pnum;               /* out: sectors known to share the same state */
    int64_t ret;             /* BDRV_BLOCK_* bits or negative errno */
    bool done;               /* set when the coroutine has finished */
} BdrvCoGetBlockStatusData;
376ae3f1 4085
/*
 * Returns the allocation status of the specified sectors.
 * Drivers not implementing the functionality are assumed to not support
 * backing files, hence all their sectors are reported as allocated.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 */
static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
                                                     int64_t sector_num,
                                                     int nb_sectors, int *pnum)
{
    int64_t total_sectors;
    int64_t n;
    int64_t ret, ret2;

    total_sectors = bdrv_nb_sectors(bs);
    if (total_sectors < 0) {
        /* Negative value is a -errno from the driver; propagate it. */
        return total_sectors;
    }

    if (sector_num >= total_sectors) {
        *pnum = 0;
        return 0;
    }

    /* Clamp the request to the end of the image. */
    n = total_sectors - sector_num;
    if (n < nb_sectors) {
        nb_sectors = n;
    }

    if (!bs->drv->bdrv_co_get_block_status) {
        /* Driver cannot report status: report everything as allocated data,
         * with a valid offset when this is a protocol driver. */
        *pnum = nb_sectors;
        ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
        if (bs->drv->protocol_name) {
            ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
        }
        return ret;
    }

    ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
    if (ret < 0) {
        *pnum = 0;
        return ret;
    }

    if (ret & BDRV_BLOCK_RAW) {
        /* Raw pass-through: the answer lives in bs->file at the returned
         * offset. */
        assert(ret & BDRV_BLOCK_OFFSET_VALID);
        return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
                                     *pnum, pnum);
    }

    if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
        ret |= BDRV_BLOCK_ALLOCATED;
    }

    /* Neither data nor zero reported: try to refine "unallocated" into
     * "reads as zero" using image properties or a short backing file. */
    if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
        if (bdrv_unallocated_blocks_are_zero(bs)) {
            ret |= BDRV_BLOCK_ZERO;
        } else if (bs->backing_hd) {
            BlockDriverState *bs2 = bs->backing_hd;
            int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
            if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
                /* Reading past the end of the backing file yields zeroes. */
                ret |= BDRV_BLOCK_ZERO;
            }
        }
    }

    if (bs->file &&
        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
        (ret & BDRV_BLOCK_OFFSET_VALID)) {
        int file_pnum;

        ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
                                        *pnum, &file_pnum);
        if (ret2 >= 0) {
            /* Ignore errors. This is just providing extra information, it
             * is useful but not necessary.
             */
            if (!file_pnum) {
                /* !file_pnum indicates an offset at or beyond the EOF; it is
                 * perfectly valid for the format block driver to point to such
                 * offsets, so catch it and mark everything as zero */
                ret |= BDRV_BLOCK_ZERO;
            } else {
                /* Limit request to the range reported by the protocol driver */
                *pnum = file_pnum;
                ret |= (ret2 & BDRV_BLOCK_ZERO);
            }
        }
    }

    return ret;
}
4187
b6b8a333
PB
4188/* Coroutine wrapper for bdrv_get_block_status() */
4189static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
060f51c9 4190{
b6b8a333 4191 BdrvCoGetBlockStatusData *data = opaque;
060f51c9
SH
4192 BlockDriverState *bs = data->bs;
4193
b6b8a333
PB
4194 data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
4195 data->pnum);
060f51c9
SH
4196 data->done = true;
4197}
4198
4199/*
b6b8a333 4200 * Synchronous wrapper around bdrv_co_get_block_status().
060f51c9 4201 *
b6b8a333 4202 * See bdrv_co_get_block_status() for details.
060f51c9 4203 */
b6b8a333
PB
4204int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
4205 int nb_sectors, int *pnum)
060f51c9 4206{
6aebab14 4207 Coroutine *co;
b6b8a333 4208 BdrvCoGetBlockStatusData data = {
6aebab14
SH
4209 .bs = bs,
4210 .sector_num = sector_num,
4211 .nb_sectors = nb_sectors,
4212 .pnum = pnum,
4213 .done = false,
4214 };
4215
bdad13b9
PB
4216 if (qemu_in_coroutine()) {
4217 /* Fast-path if already in coroutine context */
b6b8a333 4218 bdrv_get_block_status_co_entry(&data);
bdad13b9 4219 } else {
2572b37a
SH
4220 AioContext *aio_context = bdrv_get_aio_context(bs);
4221
b6b8a333 4222 co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
bdad13b9
PB
4223 qemu_coroutine_enter(co, &data);
4224 while (!data.done) {
2572b37a 4225 aio_poll(aio_context, true);
bdad13b9 4226 }
6aebab14
SH
4227 }
4228 return data.ret;
f58c7b35
TS
4229}
4230
b6b8a333
PB
4231int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
4232 int nb_sectors, int *pnum)
4233{
4333bb71
PB
4234 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
4235 if (ret < 0) {
4236 return ret;
4237 }
01fb2705 4238 return !!(ret & BDRV_BLOCK_ALLOCATED);
b6b8a333
PB
4239}
4240
188a7bbf
PB
/*
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
 * Return true if the given sector is allocated in any image between
 * BASE and TOP (inclusive).  BASE can be NULL to check if the given
 * sector is allocated in any image of the chain.  Return false otherwise.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 *  the specified sector) that are known to be in the same
 *  allocated/unallocated state.
 *
 */
int bdrv_is_allocated_above(BlockDriverState *top,
                            BlockDriverState *base,
                            int64_t sector_num,
                            int nb_sectors, int *pnum)
{
    BlockDriverState *intermediate;
    int ret, n = nb_sectors;

    /* Walk from top towards base, stopping at the first layer that has
     * the sector allocated. */
    intermediate = top;
    while (intermediate && intermediate != base) {
        int pnum_inter;
        ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
                                &pnum_inter);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            *pnum = pnum_inter;
            return 1;
        }

        /*
         * [sector_num, nb_sectors] is unallocated on top but intermediate
         * might have
         *
         * [sector_num+x, nr_sectors] allocated.
         */
        /* Shrink the reported range, unless the smaller pnum_inter only
         * reflects this intermediate layer being shorter than the request. */
        if (n > pnum_inter &&
            (intermediate == top ||
             sector_num + pnum_inter < intermediate->total_sectors)) {
            n = pnum_inter;
        }

        intermediate = intermediate->backing_hd;
    }

    *pnum = n;
    return 0;
}
4291
045df330
AL
4292const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4293{
4294 if (bs->backing_hd && bs->backing_hd->encrypted)
4295 return bs->backing_file;
4296 else if (bs->encrypted)
4297 return bs->filename;
4298 else
4299 return NULL;
4300}
4301
5fafdf24 4302void bdrv_get_backing_filename(BlockDriverState *bs,
83f64091
FB
4303 char *filename, int filename_size)
4304{
3574c608 4305 pstrcpy(filename, filename_size, bs->backing_file);
83f64091
FB
4306}
4307
5fafdf24 4308int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
faea38e7
FB
4309 const uint8_t *buf, int nb_sectors)
4310{
4311 BlockDriver *drv = bs->drv;
b9c64947
HR
4312 int ret;
4313
4314 if (!drv) {
19cb3738 4315 return -ENOMEDIUM;
b9c64947
HR
4316 }
4317 if (!drv->bdrv_write_compressed) {
faea38e7 4318 return -ENOTSUP;
b9c64947
HR
4319 }
4320 ret = bdrv_check_request(bs, sector_num, nb_sectors);
4321 if (ret < 0) {
4322 return ret;
4323 }
a55eb92c 4324
e4654d2d 4325 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
a55eb92c 4326
faea38e7
FB
4327 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4328}
3b46e624 4329
faea38e7
FB
4330int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4331{
4332 BlockDriver *drv = bs->drv;
4333 if (!drv)
19cb3738 4334 return -ENOMEDIUM;
faea38e7
FB
4335 if (!drv->bdrv_get_info)
4336 return -ENOTSUP;
4337 memset(bdi, 0, sizeof(*bdi));
4338 return drv->bdrv_get_info(bs, bdi);
4339}
4340
eae041fe
HR
4341ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4342{
4343 BlockDriver *drv = bs->drv;
4344 if (drv && drv->bdrv_get_specific_info) {
4345 return drv->bdrv_get_specific_info(bs);
4346 }
4347 return NULL;
4348}
4349
45566e9c
CH
4350int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4351 int64_t pos, int size)
cf8074b3
KW
4352{
4353 QEMUIOVector qiov;
4354 struct iovec iov = {
4355 .iov_base = (void *) buf,
4356 .iov_len = size,
4357 };
4358
4359 qemu_iovec_init_external(&qiov, &iov, 1);
4360 return bdrv_writev_vmstate(bs, &qiov, pos);
4361}
4362
4363int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
178e08a5
AL
4364{
4365 BlockDriver *drv = bs->drv;
cf8074b3
KW
4366
4367 if (!drv) {
178e08a5 4368 return -ENOMEDIUM;
cf8074b3
KW
4369 } else if (drv->bdrv_save_vmstate) {
4370 return drv->bdrv_save_vmstate(bs, qiov, pos);
4371 } else if (bs->file) {
4372 return bdrv_writev_vmstate(bs->file, qiov, pos);
4373 }
4374
7cdb1f6d 4375 return -ENOTSUP;
178e08a5
AL
4376}
4377
45566e9c
CH
4378int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4379 int64_t pos, int size)
178e08a5
AL
4380{
4381 BlockDriver *drv = bs->drv;
4382 if (!drv)
4383 return -ENOMEDIUM;
7cdb1f6d
MK
4384 if (drv->bdrv_load_vmstate)
4385 return drv->bdrv_load_vmstate(bs, buf, pos, size);
4386 if (bs->file)
4387 return bdrv_load_vmstate(bs->file, buf, pos, size);
4388 return -ENOTSUP;
178e08a5
AL
4389}
4390
8b9b0cc2
KW
4391void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4392{
bf736fe3 4393 if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
8b9b0cc2
KW
4394 return;
4395 }
4396
bf736fe3 4397 bs->drv->bdrv_debug_event(bs, event);
41c695c7
KW
4398}
4399
4400int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4401 const char *tag)
4402{
4403 while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4404 bs = bs->file;
4405 }
4406
4407 if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4408 return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4409 }
4410
4411 return -ENOTSUP;
4412}
4413
4cc70e93
FZ
4414int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4415{
4416 while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4417 bs = bs->file;
4418 }
4419
4420 if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4421 return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4422 }
4423
4424 return -ENOTSUP;
4425}
4426
41c695c7
KW
4427int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4428{
938789ea 4429 while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
41c695c7
KW
4430 bs = bs->file;
4431 }
8b9b0cc2 4432
41c695c7
KW
4433 if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4434 return bs->drv->bdrv_debug_resume(bs, tag);
4435 }
4436
4437 return -ENOTSUP;
4438}
4439
4440bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4441{
4442 while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4443 bs = bs->file;
4444 }
4445
4446 if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4447 return bs->drv->bdrv_debug_is_suspended(bs, tag);
4448 }
4449
4450 return false;
8b9b0cc2
KW
4451}
4452
199630b6
BS
4453int bdrv_is_snapshot(BlockDriverState *bs)
4454{
4455 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4456}
4457
b1b1d783
JC
/* backing_file can either be relative, or absolute, or a protocol.  If it is
 * relative, it must be relative to the chain.  So, passing in bs->filename
 * from a BDS as backing_file should not be done, as that may be relative to
 * the CWD rather than the chain. */
BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
        const char *backing_file)
{
    char *filename_full = NULL;
    char *backing_file_full = NULL;
    char *filename_tmp = NULL;
    int is_protocol = 0;
    BlockDriverState *curr_bs = NULL;
    BlockDriverState *retval = NULL;

    if (!bs || !bs->drv || !backing_file) {
        return NULL;
    }

    filename_full = g_malloc(PATH_MAX);
    backing_file_full = g_malloc(PATH_MAX);
    filename_tmp = g_malloc(PATH_MAX);

    is_protocol = path_has_protocol(backing_file);

    /* Walk down the backing chain, comparing each layer's backing_file
     * against the requested one. */
    for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {

        /* If either of the filename paths is actually a protocol, then
         * compare unmodified paths; otherwise make paths relative */
        if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
            if (strcmp(backing_file, curr_bs->backing_file) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        } else {
            /* If not an absolute filename path, make it relative to the current
             * image's filename path */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         backing_file);

            /* We are going to compare absolute pathnames */
            if (!realpath(filename_tmp, filename_full)) {
                /* Unresolvable path: skip this layer rather than fail. */
                continue;
            }

            /* We need to make sure the backing filename we are comparing against
             * is relative to the current image filename (or absolute) */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         curr_bs->backing_file);

            if (!realpath(filename_tmp, backing_file_full)) {
                continue;
            }

            if (strcmp(backing_file_full, filename_full) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        }
    }

    g_free(filename_full);
    g_free(backing_file_full);
    g_free(filename_tmp);
    return retval;
}
4523
f198fd1c
BC
4524int bdrv_get_backing_file_depth(BlockDriverState *bs)
4525{
4526 if (!bs->drv) {
4527 return 0;
4528 }
4529
4530 if (!bs->backing_hd) {
4531 return 0;
4532 }
4533
4534 return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4535}
4536
ea2384d3 4537/**************************************************************/
83f64091 4538/* async I/Os */
ea2384d3 4539
7c84b1b8
MA
/* Submit an async vectored read; completion is reported via cb(opaque, ret). */
BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                           QEMUIOVector *qiov, int nb_sectors,
                           BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, false);
}

/* Submit an async vectored write; completion is reported via cb(opaque, ret). */
BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                            QEMUIOVector *qiov, int nb_sectors,
                            BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, true);
}

/* Submit an async write-zeroes (no payload qiov; BDRV_REQ_ZERO_WRITE is
 * OR'ed into the caller-provided flags). */
BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
        BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
                                 BDRV_REQ_ZERO_WRITE | flags,
                                 cb, opaque, true);
}
4570
40b4f539
KW
4571
/* Shared completion state for one bdrv_aio_multiwrite() submission. */
typedef struct MultiwriteCB {
    int error;          /* first error seen among the sub-requests */
    int num_requests;   /* sub-requests still in flight */
    int num_callbacks;  /* original number of caller requests */
    struct {
        BlockCompletionFunc *cb;   /* caller's completion callback */
        void *opaque;              /* caller's opaque */
        QEMUIOVector *free_qiov;   /* merged qiov to destroy/free, or NULL */
    } callbacks[];      /* flexible array, one entry per caller request */
} MultiwriteCB;
4582
4583static void multiwrite_user_cb(MultiwriteCB *mcb)
4584{
4585 int i;
4586
4587 for (i = 0; i < mcb->num_callbacks; i++) {
4588 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
1e1ea48d
SH
4589 if (mcb->callbacks[i].free_qiov) {
4590 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4591 }
7267c094 4592 g_free(mcb->callbacks[i].free_qiov);
40b4f539
KW
4593 }
4594}
4595
4596static void multiwrite_cb(void *opaque, int ret)
4597{
4598 MultiwriteCB *mcb = opaque;
4599
6d519a5f
SH
4600 trace_multiwrite_cb(mcb, ret);
4601
cb6d3ca0 4602 if (ret < 0 && !mcb->error) {
40b4f539 4603 mcb->error = ret;
40b4f539
KW
4604 }
4605
4606 mcb->num_requests--;
4607 if (mcb->num_requests == 0) {
de189a1b 4608 multiwrite_user_cb(mcb);
7267c094 4609 g_free(mcb);
40b4f539
KW
4610 }
4611}
4612
4613static int multiwrite_req_compare(const void *a, const void *b)
4614{
77be4366
CH
4615 const BlockRequest *req1 = a, *req2 = b;
4616
4617 /*
4618 * Note that we can't simply subtract req2->sector from req1->sector
4619 * here as that could overflow the return value.
4620 */
4621 if (req1->sector > req2->sector) {
4622 return 1;
4623 } else if (req1->sector < req2->sector) {
4624 return -1;
4625 } else {
4626 return 0;
4627 }
40b4f539
KW
4628}
4629
/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
                            int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine
    // them (merged requests are always sequential or overlapping, see the
    // assert below, so no zero padding is needed).
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        // Don't exceed the per-request iovec limit...
        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        // ...or the device's maximum transfer length, if set.
        if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
            reqs[i].nb_sectors > bs->bl.max_transfer_length) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);

            // We should not need to add any zeros between the two requests
            assert (reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);

            // Add tail of first request, if necessary
            if (qiov->size < reqs[outidx].qiov->size) {
                qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
                                  reqs[outidx].qiov->size - qiov->size);
            }

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            // Record the merged qiov so completion can free it.
            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov = reqs[i].qiov;
        }
    }

    block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1);

    return outidx + 1;
}
4702
/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted. In error case this function returns -1, and any of the
 * requests may or may not be submitted yet. In particular, this means that the
 * callback will be called for some of the requests, for others it won't. The
 * caller must check the error field of the BlockRequest to wait for the right
 * callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure (flexible array: one slot per request)
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests. */
    mcb->num_requests = num_reqs;
    for (i = 0; i < num_reqs; i++) {
        bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
                              reqs[i].nb_sectors, reqs[i].flags,
                              multiwrite_cb, mcb,
                              true);
    }

    return 0;
}
4760
/* Synchronously cancel an AIOCB: request async cancellation, then poll the
 * request's AioContext until our extra reference is the only one left
 * (i.e. the request has completed and invoked its callback). */
void bdrv_aio_cancel(BlockAIOCB *acb)
{
    /* Keep the ACB alive while we poll for completion. */
    qemu_aio_ref(acb);
    bdrv_aio_cancel_async(acb);
    while (acb->refcnt > 1) {
        if (acb->aiocb_info->get_aio_context) {
            aio_poll(acb->aiocb_info->get_aio_context(acb), true);
        } else if (acb->bs) {
            aio_poll(bdrv_get_aio_context(acb->bs), true);
        } else {
            /* No context to poll: the request can never make progress. */
            abort();
        }
    }
    qemu_aio_unref(acb);
}
4776
4777/* Async version of aio cancel. The caller is not blocked if the acb implements
4778 * cancel_async, otherwise we do nothing and let the request normally complete.
4779 * In either case the completion callback must be called. */
7c84b1b8 4780void bdrv_aio_cancel_async(BlockAIOCB *acb)
02c50efe
FZ
4781{
4782 if (acb->aiocb_info->cancel_async) {
4783 acb->aiocb_info->cancel_async(acb);
4784 }
83f64091
FB
4785}
4786
4787/**************************************************************/
4788/* async block device emulation */
4789
7c84b1b8
MA
/* AIOCB used to emulate async I/O on top of synchronous driver callbacks:
 * the I/O is done immediately and completion is deferred to a bottom half. */
typedef struct BlockAIOCBSync {
    BlockAIOCB common;   /* must be first: generic AIOCB header */
    QEMUBH *bh;          /* bottom half that delivers the completion */
    int ret;             /* result passed to the completion callback */
    /* vector translation state */
    QEMUIOVector *qiov;  /* caller's scatter/gather list */
    uint8_t *bounce;     /* linear bounce buffer (NULL on alloc failure) */
    int is_write;        /* nonzero for writes */
} BlockAIOCBSync;

static const AIOCBInfo bdrv_em_aiocb_info = {
    .aiocb_size = sizeof(BlockAIOCBSync),
};
4803
/* Bottom half that completes an emulated (bounce-buffer) AIO request. */
static void bdrv_aio_bh_cb(void *opaque)
{
    BlockAIOCBSync *acb = opaque;

    /* Successful read: copy the bounce buffer back into the caller's qiov. */
    if (!acb->is_write && acb->ret >= 0) {
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
    }
    qemu_vfree(acb->bounce);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_unref(acb);
}
beac80cd 4817
7c84b1b8
MA
/* Emulate async vectored I/O using the driver's synchronous bdrv_read/
 * bdrv_write through a linear bounce buffer; completion is delivered via
 * a bottom half (bdrv_aio_bh_cb). */
static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                      int64_t sector_num,
                                      QEMUIOVector *qiov,
                                      int nb_sectors,
                                      BlockCompletionFunc *cb,
                                      void *opaque,
                                      int is_write)

{
    BlockAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    acb->bounce = qemu_try_blockalign(bs, qiov->size);
    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);

    if (acb->bounce == NULL) {
        /* Allocation failed: report -ENOMEM through the normal completion. */
        acb->ret = -ENOMEM;
    } else if (is_write) {
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    qemu_bh_schedule(acb->bh);

    return &acb->common;
}
4848
/* Emulated async read: delegate to the common bounce-buffer helper. */
static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}

/* Emulated async write: delegate to the common bounce-buffer helper. */
static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}
beac80cd 4862
68485420 4863
7c84b1b8
MA
/* AIOCB backed by a coroutine running the actual request. */
typedef struct BlockAIOCBCoroutine {
    BlockAIOCB common;  /* must be first: generic AIOCB header */
    BlockRequest req;   /* request parameters; req.error carries the result */
    bool is_write;      /* direction for bdrv_co_do_rw() */
    bool need_bh;       /* completion must be deferred to a bottom half */
    bool *done;         /* NOTE(review): not set in this chunk — confirm use */
    QEMUBH* bh;         /* deferred-completion bottom half */
} BlockAIOCBCoroutine;

static const AIOCBInfo bdrv_em_co_aiocb_info = {
    .aiocb_size = sizeof(BlockAIOCBCoroutine),
};
4876
0b5a2445
PB
/* Deliver the completion callback unless completion must still be deferred
 * to a bottom half (need_bh set while the submitter is still running). */
static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
{
    if (!acb->need_bh) {
        acb->common.cb(acb->common.opaque, acb->req.error);
        qemu_aio_unref(acb);
    }
}

/* Bottom half scheduled by bdrv_co_maybe_schedule_bh(). */
static void bdrv_co_em_bh(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;

    assert(!acb->need_bh);
    qemu_bh_delete(acb->bh);
    bdrv_co_complete(acb);
}

/* Called by the submitter after entering the coroutine: from now on the
 * callback may run directly.  If the request already finished while we
 * were still in the submitter (error != -EINPROGRESS), complete via a
 * bottom half so the callback never runs from the submitter's stack. */
static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
{
    acb->need_bh = false;
    if (acb->req.error != -EINPROGRESS) {
        BlockDriverState *bs = acb->common.bs;

        acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
        qemu_bh_schedule(acb->bh);
    }
}
4904
b2a61371
SH
4905/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4906static void coroutine_fn bdrv_co_do_rw(void *opaque)
4907{
7c84b1b8 4908 BlockAIOCBCoroutine *acb = opaque;
b2a61371
SH
4909 BlockDriverState *bs = acb->common.bs;
4910
4911 if (!acb->is_write) {
4912 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
d20d9b7c 4913 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
b2a61371
SH
4914 } else {
4915 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
d20d9b7c 4916 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
b2a61371
SH
4917 }
4918
0b5a2445 4919 bdrv_co_complete(acb);
b2a61371
SH
4920}
4921
7c84b1b8
MA
/* Common submission path for the async read/write entry points: set up a
 * coroutine AIOCB, run bdrv_co_do_rw in a coroutine, and arrange for safe
 * completion delivery. */
static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write)
{
    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    /* Defer completion until after qemu_coroutine_enter() returns. */
    acb->need_bh = true;
    acb->req.error = -EINPROGRESS;
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->req.flags = flags;
    acb->is_write = is_write;

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}
4949
07f07615 4950static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
b2e12bc6 4951{
7c84b1b8 4952 BlockAIOCBCoroutine *acb = opaque;
07f07615 4953 BlockDriverState *bs = acb->common.bs;
b2e12bc6 4954
07f07615 4955 acb->req.error = bdrv_co_flush(bs);
0b5a2445 4956 bdrv_co_complete(acb);
b2e12bc6
CH
4957}
4958
/* Asynchronous flush: wrap bdrv_co_flush in a coroutine and report the
 * result through @cb.  The callback is deferred to a bottom half. */
BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
        BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_flush(bs, opaque);

    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->need_bh = true;
    /* Marks the request as still running */
    acb->req.error = -EINPROGRESS;

    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
    qemu_coroutine_enter(co, acb);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}
4977
/* Coroutine entry point backing bdrv_aio_discard: perform the discard via
 * the coroutine interface and record the result in the ACB. */
static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    bdrv_co_complete(acb);
}
4986
/* Asynchronous discard: wrap bdrv_co_discard in a coroutine and report the
 * result through @cb.  The callback is deferred to a bottom half. */
BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->need_bh = true;
    /* Marks the request as still running */
    acb->req.error = -EINPROGRESS;
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}
5007
/* Initialize the block layer by running every registered block-driver
 * module-init function. */
void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}
ce1a14dc 5012
/* Like bdrv_init, but additionally restrict usable formats to the
 * configured driver whitelist. */
void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}
5018
/* Allocate an AIO control block of the size requested by @aiocb_info and
 * initialize its common fields.  The ACB starts with refcount 1; release it
 * with qemu_aio_unref. */
void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
                   BlockCompletionFunc *cb, void *opaque)
{
    BlockAIOCB *acb;

    /* aiocb_size covers the driver-specific ACB, which embeds BlockAIOCB */
    acb = g_slice_alloc(aiocb_info->aiocb_size);
    acb->aiocb_info = aiocb_info;
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    acb->refcnt = 1;
    return acb;
}
5032
/* Take an additional reference on an AIO control block. */
void qemu_aio_ref(void *p)
{
    BlockAIOCB *acb = p;
    acb->refcnt++;
}
5038
/* Drop a reference on an AIO control block; frees it (using the size from
 * its AIOCBInfo) when the last reference goes away. */
void qemu_aio_unref(void *p)
{
    BlockAIOCB *acb = p;
    assert(acb->refcnt > 0);
    if (--acb->refcnt == 0) {
        g_slice_free1(acb->aiocb_info->aiocb_size, acb);
    }
}
19cb3738 5047
/**************************************************************/
/* Coroutine block device emulation */

/* Completion state shared between a yielded coroutine and the AIO
 * completion callback that wakes it. */
typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;   /* coroutine to re-enter on completion */
    int ret;                /* result delivered by the AIO callback */
} CoroutineIOCompletion;

/* AIO completion callback: store the result and resume the waiting
 * coroutine. */
static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}
5063
/* Emulate a coroutine read/write on top of the driver's AIO interface:
 * submit the AIO request, yield until bdrv_co_io_em_complete resumes us,
 * then return the AIO result.  Returns -EIO if the driver failed to even
 * submit the request. */
static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    /* Sleep until bdrv_co_io_em_complete re-enters this coroutine */
    qemu_coroutine_yield();

    return co.ret;
}
5089
/* Coroutine read emulated via the driver's AIO read callback. */
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}
5096
/* Coroutine write emulated via the driver's AIO write callback. */
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}
5103
/* Coroutine entry point for the synchronous bdrv_flush wrapper. */
static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}
5110
/* Flush completed writes down the whole chain for @bs: first the format
 * driver's internal caches (always, even with cache=unsafe), then the disk
 * write cache (skipped for cache=unsafe), and finally the underlying
 * protocol layer (bs->file).  Returns 0 on success or when there is nothing
 * to flush; negative errno on failure. */
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    /* Nothing to flush for a missing, empty or read-only device */
    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        /* Emulate the coroutine flush on top of the driver's AIO flush */
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and don't support bdrv_flush therefore. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    return bdrv_co_flush(bs->file);
}
5173
/* Drop cached metadata/data for @bs after incoming migration: only acts
 * when BDRV_O_INCOMING is set, clears that flag, delegates to the driver
 * (or recurses into bs->file), and refreshes the total sector count.
 * Errors are reported through @errp. */
void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
{
    Error *local_err = NULL;
    int ret;

    if (!bs->drv) {
        return;
    }

    /* Only devices still waiting for migrated state need invalidation */
    if (!(bs->open_flags & BDRV_O_INCOMING)) {
        return;
    }
    bs->open_flags &= ~BDRV_O_INCOMING;

    if (bs->drv->bdrv_invalidate_cache) {
        bs->drv->bdrv_invalidate_cache(bs, &local_err);
    } else if (bs->file) {
        bdrv_invalidate_cache(bs->file, &local_err);
    }
    if (local_err) {
        error_propagate(errp, local_err);
        return;
    }

    /* The image may have been resized by the migration source */
    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        return;
    }
}
5204
/* Invalidate caches of every registered BlockDriverState, taking each
 * device's AioContext around the call.  Stops at the first error. */
void bdrv_invalidate_cache_all(Error **errp)
{
    BlockDriverState *bs;
    Error *local_err = NULL;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_invalidate_cache(bs, &local_err);
        aio_context_release(aio_context);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
    }
}
5222
/* Synchronous flush wrapper around bdrv_co_flush.  When already in
 * coroutine context the coroutine version is called directly; otherwise a
 * coroutine is spawned and the device's AioContext is polled until it
 * finishes.  Returns the flush result. */
int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }

    return rwco.ret;
}
5246
/* Parameter/result bundle passed to the discard coroutine. */
typedef struct DiscardCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    int ret;        /* NOT_DONE until the coroutine finishes */
} DiscardCo;

/* Coroutine entry point for the synchronous bdrv_discard wrapper. */
static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    DiscardCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}
5259
5260int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
5261 int nb_sectors)
5262{
b9c64947 5263 int max_discard, ret;
d51e9fe5 5264
4265d620
PB
5265 if (!bs->drv) {
5266 return -ENOMEDIUM;
b9c64947
HR
5267 }
5268
5269 ret = bdrv_check_request(bs, sector_num, nb_sectors);
5270 if (ret < 0) {
5271 return ret;
4265d620
PB
5272 } else if (bs->read_only) {
5273 return -EROFS;
df702c9b
PB
5274 }
5275
e4654d2d 5276 bdrv_reset_dirty(bs, sector_num, nb_sectors);
df702c9b 5277
9e8f1835
PB
5278 /* Do nothing if disabled. */
5279 if (!(bs->open_flags & BDRV_O_UNMAP)) {
5280 return 0;
5281 }
5282
d51e9fe5
PB
5283 if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
5284 return 0;
5285 }
6f14da52 5286
75af1f34 5287 max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
d51e9fe5
PB
5288 while (nb_sectors > 0) {
5289 int ret;
5290 int num = nb_sectors;
6f14da52 5291
d51e9fe5
PB
5292 /* align request */
5293 if (bs->bl.discard_alignment &&
5294 num >= bs->bl.discard_alignment &&
5295 sector_num % bs->bl.discard_alignment) {
5296 if (num > bs->bl.discard_alignment) {
5297 num = bs->bl.discard_alignment;
6f14da52 5298 }
d51e9fe5
PB
5299 num -= sector_num % bs->bl.discard_alignment;
5300 }
6f14da52 5301
d51e9fe5
PB
5302 /* limit request size */
5303 if (num > max_discard) {
5304 num = max_discard;
5305 }
6f14da52 5306
d51e9fe5 5307 if (bs->drv->bdrv_co_discard) {
6f14da52 5308 ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
d51e9fe5 5309 } else {
7c84b1b8 5310 BlockAIOCB *acb;
d51e9fe5
PB
5311 CoroutineIOCompletion co = {
5312 .coroutine = qemu_coroutine_self(),
5313 };
5314
5315 acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
5316 bdrv_co_io_em_complete, &co);
5317 if (acb == NULL) {
5318 return -EIO;
5319 } else {
5320 qemu_coroutine_yield();
5321 ret = co.ret;
6f14da52 5322 }
6f14da52 5323 }
7ce21016 5324 if (ret && ret != -ENOTSUP) {
d51e9fe5 5325 return ret;
4265d620 5326 }
d51e9fe5
PB
5327
5328 sector_num += num;
5329 nb_sectors -= num;
4265d620 5330 }
d51e9fe5 5331 return 0;
4265d620
PB
5332}
5333
/* Synchronous discard wrapper around bdrv_co_discard.  Runs directly when
 * called from coroutine context; otherwise spawns a coroutine and polls the
 * device's AioContext until it completes. */
int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    DiscardCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }

    return rwco.ret;
}
5359
19cb3738
FB
5360/**************************************************************/
5361/* removable device support */
5362
5363/**
5364 * Return TRUE if the media is present
5365 */
5366int bdrv_is_inserted(BlockDriverState *bs)
5367{
5368 BlockDriver *drv = bs->drv;
a1aff5bf 5369
19cb3738
FB
5370 if (!drv)
5371 return 0;
5372 if (!drv->bdrv_is_inserted)
a1aff5bf
MA
5373 return 1;
5374 return drv->bdrv_is_inserted(bs);
19cb3738
FB
5375}
5376
/**
 * Return whether the media changed since the last call to this
 * function, or -ENOTSUP if we don't know. Most drivers don't know.
 */
int bdrv_media_changed(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_media_changed) {
        return drv->bdrv_media_changed(bs);
    }
    return -ENOTSUP;
}
5390
/**
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray.
 * Additionally emits a DEVICE_TRAY_MOVED QMP event when the BDS has a
 * device name attached.
 */
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
{
    BlockDriver *drv = bs->drv;
    const char *device_name;

    if (drv && drv->bdrv_eject) {
        drv->bdrv_eject(bs, eject_flag);
    }

    /* Anonymous (node-only) BDSes have an empty device name: no event */
    device_name = bdrv_get_device_name(bs);
    if (device_name[0] != '\0') {
        qapi_event_send_device_tray_moved(device_name,
                                          eject_flag, &error_abort);
    }
}
5409
/**
 * Lock or unlock the media (if it is locked, the user won't be able
 * to eject it manually).
 */
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
{
    BlockDriver *drv = bs->drv;

    trace_bdrv_lock_medium(bs, locked);

    /* Drivers without the callback silently ignore the request */
    if (drv && drv->bdrv_lock_medium) {
        drv->bdrv_lock_medium(bs, locked);
    }
}
985a03b0
TS
5424
5425/* needed for generic scsi interface */
5426
5427int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5428{
5429 BlockDriver *drv = bs->drv;
5430
5431 if (drv && drv->bdrv_ioctl)
5432 return drv->bdrv_ioctl(bs, req, buf);
5433 return -ENOTSUP;
5434}
7d780669 5435
7c84b1b8 5436BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
221f715d 5437 unsigned long int req, void *buf,
097310b5 5438 BlockCompletionFunc *cb, void *opaque)
7d780669 5439{
221f715d 5440 BlockDriver *drv = bs->drv;
7d780669 5441
221f715d
AL
5442 if (drv && drv->bdrv_aio_ioctl)
5443 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5444 return NULL;
7d780669 5445}
e268ca52 5446
/* Record the guest-visible block size (in bytes) for @bs. */
void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
{
    bs->guest_block_size = align;
}
7cd1e32a 5451
/* Allocate @size bytes aligned to the optimal memory alignment of @bs
 * (suitable for O_DIRECT I/O).  Aborts on allocation failure. */
void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}
7cd1e32a 5456
/* Like qemu_blockalign, but the returned buffer is zeroed. */
void *qemu_blockalign0(BlockDriverState *bs, size_t size)
{
    return memset(qemu_blockalign(bs, size), 0, size);
}
5461
/* Like qemu_blockalign, but returns NULL instead of aborting when the
 * allocation fails. */
void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
{
    size_t align = bdrv_opt_mem_align(bs);

    /* Ensure that NULL is never returned on success */
    assert(align > 0);
    if (size == 0) {
        /* A zero-byte request is rounded up so success yields non-NULL */
        size = align;
    }

    return qemu_try_memalign(align, size);
}
5474
9ebd8448
HR
5475void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
5476{
5477 void *mem = qemu_try_blockalign(bs, size);
5478
5479 if (mem) {
5480 memset(mem, 0, size);
5481 }
5482
5483 return mem;
5484}
5485
c53b1c51
SH
5486/*
5487 * Check if all memory in this vector is sector aligned.
5488 */
5489bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5490{
5491 int i;
339064d5 5492 size_t alignment = bdrv_opt_mem_align(bs);
c53b1c51
SH
5493
5494 for (i = 0; i < qiov->niov; i++) {
339064d5 5495 if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
c53b1c51 5496 return false;
1ff735bd 5497 }
339064d5 5498 if (qiov->iov[i].iov_len % alignment) {
1ff735bd 5499 return false;
c53b1c51
SH
5500 }
5501 }
5502
5503 return true;
5504}
5505
0db6e54a
FZ
5506BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, const char *name)
5507{
5508 BdrvDirtyBitmap *bm;
5509
5510 assert(name);
5511 QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5512 if (bm->name && !strcmp(name, bm->name)) {
5513 return bm;
5514 }
5515 }
5516 return NULL;
5517}
5518
/* Strip the name from @bitmap, turning it into an anonymous bitmap. */
void bdrv_dirty_bitmap_make_anon(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    g_free(bitmap->name);
    bitmap->name = NULL;
}
5524
/* Create a new dirty bitmap on @bs with the given byte @granularity
 * (must be a power of two and at least one sector) and optional @name
 * (must be unique per BDS; NULL creates an anonymous bitmap).  The new
 * bitmap starts enabled and is linked into bs->dirty_bitmaps.
 * Returns the bitmap, or NULL with @errp set on failure. */
BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs,
                                          uint32_t granularity,
                                          const char *name,
                                          Error **errp)
{
    int64_t bitmap_size;
    BdrvDirtyBitmap *bitmap;
    uint32_t sector_granularity;

    assert((granularity & (granularity - 1)) == 0);

    if (name && bdrv_find_dirty_bitmap(bs, name)) {
        error_setg(errp, "Bitmap already exists: %s", name);
        return NULL;
    }
    sector_granularity = granularity >> BDRV_SECTOR_BITS;
    assert(sector_granularity);
    bitmap_size = bdrv_nb_sectors(bs);
    if (bitmap_size < 0) {
        error_setg_errno(errp, -bitmap_size, "could not get length of device");
        /* NOTE(review): errno is set in addition to errp here — presumably
         * for callers that inspect errno; confirm before removing. */
        errno = -bitmap_size;
        return NULL;
    }
    bitmap = g_new0(BdrvDirtyBitmap, 1);
    /* hbitmap granularity is expressed as log2 of the sector count */
    bitmap->bitmap = hbitmap_alloc(bitmap_size, ctz32(sector_granularity));
    bitmap->name = g_strdup(name);
    bitmap->disabled = false;
    QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
    return bitmap;
}
5555
/* Return true if @bitmap currently records writes (i.e. is not disabled). */
bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap)
{
    return !bitmap->disabled;
}
5560
/* Unlink @bitmap from @bs and free it (backing hbitmap, name and struct).
 * Does nothing if the bitmap is not found on this BDS. */
void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    BdrvDirtyBitmap *bm, *next;
    QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
        if (bm == bitmap) {
            QLIST_REMOVE(bitmap, list);
            hbitmap_free(bitmap->bitmap);
            g_free(bitmap->name);
            g_free(bitmap);
            return;
        }
    }
}
5574
/* Stop @bitmap from recording new writes. */
void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap)
{
    bitmap->disabled = true;
}
5579
/* Resume recording writes into @bitmap. */
void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap)
{
    bitmap->disabled = false;
}
5584
/* Build a QAPI BlockDirtyInfo list describing every dirty bitmap attached
 * to @bs (dirty-sector count, byte granularity, optional name).  Caller
 * owns the returned list. */
BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
{
    BdrvDirtyBitmap *bm;
    BlockDirtyInfoList *list = NULL;
    /* plist always points at the next-pointer to append to */
    BlockDirtyInfoList **plist = &list;

    QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
        BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1);
        BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1);
        info->count = bdrv_get_dirty_count(bs, bm);
        info->granularity = bdrv_dirty_bitmap_granularity(bm);
        info->has_name = !!bm->name;
        info->name = g_strdup(bm->name);
        entry->value = info;
        *plist = entry;
        plist = &entry->next;
    }

    return list;
}
5605
e4654d2d 5606int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
7cd1e32a 5607{
e4654d2d
FZ
5608 if (bitmap) {
5609 return hbitmap_get(bitmap->bitmap, sector);
7cd1e32a
LS
5610 } else {
5611 return 0;
5612 }
5613}
5614
341ebc2f
JS
5615/**
5616 * Chooses a default granularity based on the existing cluster size,
5617 * but clamped between [4K, 64K]. Defaults to 64K in the case that there
5618 * is no cluster size information available.
5619 */
5620uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState *bs)
5621{
5622 BlockDriverInfo bdi;
5623 uint32_t granularity;
5624
5625 if (bdrv_get_info(bs, &bdi) >= 0 && bdi.cluster_size > 0) {
5626 granularity = MAX(4096, bdi.cluster_size);
5627 granularity = MIN(65536, granularity);
5628 } else {
5629 granularity = 65536;
5630 }
5631
5632 return granularity;
5633}
5634
/* Return the granularity of @bitmap in bytes (hbitmap stores it as a
 * power-of-two sector count). */
uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap)
{
    return BDRV_SECTOR_SIZE << hbitmap_granularity(bitmap->bitmap);
}
5639
/* Initialize @hbi to iterate over the dirty bits of @bitmap from sector 0. */
void bdrv_dirty_iter_init(BlockDriverState *bs,
                          BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
{
    hbitmap_iter_init(hbi, bitmap->bitmap, 0);
}
5645
/* Mark a sector range dirty in one specific bitmap.  The bitmap must be
 * enabled; writing to a frozen/disabled bitmap is a programming error. */
void bdrv_set_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap,
                           int64_t cur_sector, int nr_sectors)
{
    assert(bdrv_dirty_bitmap_enabled(bitmap));
    hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
}
5652
/* Clear a sector range in one specific bitmap.  The bitmap must be
 * enabled. */
void bdrv_reset_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap,
                             int64_t cur_sector, int nr_sectors)
{
    assert(bdrv_dirty_bitmap_enabled(bitmap));
    hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
}
5659
/* Mark a sector range dirty in every enabled bitmap of @bs (called on
 * guest writes); disabled bitmaps are left untouched. */
static void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                           int nr_sectors)
{
    BdrvDirtyBitmap *bitmap;
    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
        if (!bdrv_dirty_bitmap_enabled(bitmap)) {
            continue;
        }
        hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
    }
}
5671
/* Clear a sector range in every enabled bitmap of @bs (e.g. after a
 * discard); disabled bitmaps are left untouched. */
static void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                             int nr_sectors)
{
    BdrvDirtyBitmap *bitmap;
    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
        if (!bdrv_dirty_bitmap_enabled(bitmap)) {
            continue;
        }
        hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
    }
}
aaa0eb75 5683
/* Return the number of dirty sectors tracked by @bitmap. */
int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    return hbitmap_count(bitmap->bitmap);
}
f88e1a42 5688
/* Get a reference to bs */
void bdrv_ref(BlockDriverState *bs)
{
    bs->refcnt++;
}
5694
/* Release a previously grabbed reference to bs.
 * If after releasing, reference count is zero, the BlockDriverState is
 * deleted.  A NULL @bs is tolerated as a no-op. */
void bdrv_unref(BlockDriverState *bs)
{
    if (!bs) {
        return;
    }
    assert(bs->refcnt > 0);
    if (--bs->refcnt == 0) {
        bdrv_delete(bs);
    }
}
5708
/* One reason blocking one operation type on a BDS; a BDS keeps a list of
 * blockers per BlockOpType. */
struct BdrvOpBlocker {
    Error *reason;                  /* human-readable reason for the block */
    QLIST_ENTRY(BdrvOpBlocker) list;
};
5713
/* Return true if operation @op is currently blocked on @bs; if so and
 * @errp is non-NULL, report the first blocker's reason. */
bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
{
    BdrvOpBlocker *blocker;
    assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
    if (!QLIST_EMPTY(&bs->op_blockers[op])) {
        blocker = QLIST_FIRST(&bs->op_blockers[op]);
        if (errp) {
            error_setg(errp, "Node '%s' is busy: %s",
                       bdrv_get_device_or_node_name(bs),
                       error_get_pretty(blocker->reason));
        }
        return true;
    }
    return false;
}
5729
/* Block operation @op on @bs for the given @reason.  The same reason may
 * be stacked multiple times; each must be removed by bdrv_op_unblock. */
void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
{
    BdrvOpBlocker *blocker;
    assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);

    blocker = g_new0(BdrvOpBlocker, 1);
    blocker->reason = reason;
    QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
}
5739
/* Remove every blocker for @op on @bs that was installed with the given
 * @reason pointer (matched by identity, not message content). */
void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
{
    BdrvOpBlocker *blocker, *next;
    assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
    QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
        if (blocker->reason == reason) {
            QLIST_REMOVE(blocker, list);
            g_free(blocker);
        }
    }
}
5751
/* Block every operation type on @bs with the same @reason. */
void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
{
    int i;
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        bdrv_op_block(bs, i, reason);
    }
}
5759
/* Remove @reason's blockers from every operation type on @bs. */
void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
{
    int i;
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        bdrv_op_unblock(bs, i, reason);
    }
}
5767
/* Return true if no operation type on @bs has any blocker installed. */
bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
{
    int i;

    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        if (!QLIST_EMPTY(&bs->op_blockers[i])) {
            return false;
        }
    }
    return true;
}
5779
/* Enable I/O status tracking for @bs and reset it to OK. */
void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}
5785
/* The I/O status is only enabled if the drive explicitly
 * enables it _and_ the VM is configured to stop on errors */
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
{
    return (bs->iostatus_enabled &&
           (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
            bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
            bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
}
5795
/* Disable I/O status tracking for @bs. */
void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}
5800
/* Reset the I/O status of @bs (and of any attached block job) back to OK,
 * provided status tracking is enabled. */
void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (bdrv_iostatus_is_enabled(bs)) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
        if (bs->job) {
            block_job_iostatus_reset(bs->job);
        }
    }
}
5810
/* Latch an error into the I/O status of @bs: ENOSPC maps to NOSPACE, any
 * other errno to FAILED.  Only the first error is recorded; subsequent
 * errors do not overwrite it. */
void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    assert(bdrv_iostatus_is_enabled(bs));
    if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
                                         BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}
5819
/* Create a disk image.
 *
 * @filename/@fmt select the image and its format driver; @base_filename/
 * @base_fmt optionally select a backing file; @options is a "-o" style
 * option string merged over the defaults; @img_size may be overridden by a
 * size in @options or derived from the backing file.  Errors are reported
 * through @errp; @quiet suppresses the "Formatting ..." message. */
void bdrv_img_create(const char *filename, const char *fmt,
                     const char *base_filename, const char *base_fmt,
                     char *options, uint64_t img_size, int flags,
                     Error **errp, bool quiet)
{
    QemuOptsList *create_opts = NULL;
    QemuOpts *opts = NULL;
    const char *backing_fmt, *backing_file;
    int64_t size;
    BlockDriver *drv, *proto_drv;
    BlockDriver *backing_drv = NULL;
    Error *local_err = NULL;
    int ret = 0;

    /* Find driver and parse its options */
    drv = bdrv_find_format(fmt);
    if (!drv) {
        error_setg(errp, "Unknown file format '%s'", fmt);
        return;
    }

    proto_drv = bdrv_find_protocol(filename, true, errp);
    if (!proto_drv) {
        return;
    }

    if (!drv->create_opts) {
        error_setg(errp, "Format driver '%s' does not support image creation",
                   drv->format_name);
        return;
    }

    if (!proto_drv->create_opts) {
        error_setg(errp, "Protocol driver '%s' does not support image creation",
                   proto_drv->format_name);
        return;
    }

    /* Merge the option lists of format and protocol driver */
    create_opts = qemu_opts_append(create_opts, drv->create_opts);
    create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);

    /* Create parameter list with default values */
    opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
    qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size, &error_abort);

    /* Parse -o options */
    if (options) {
        qemu_opts_do_parse(opts, options, NULL, &local_err);
        if (local_err) {
            error_report_err(local_err);
            local_err = NULL;
            error_setg(errp, "Invalid options for file format '%s'", fmt);
            goto out;
        }
    }

    if (base_filename) {
        qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename, &local_err);
        if (local_err) {
            error_setg(errp, "Backing file not supported for file format '%s'",
                       fmt);
            goto out;
        }
    }

    if (base_fmt) {
        qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt, &local_err);
        if (local_err) {
            error_setg(errp, "Backing file format not supported for file "
                       "format '%s'", fmt);
            goto out;
        }
    }

    backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
    if (backing_file) {
        /* An image backed by itself would recurse forever at open time */
        if (!strcmp(filename, backing_file)) {
            error_setg(errp, "Error: Trying to create an image with the "
                       "same filename as the backing file");
            goto out;
        }
    }

    backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
    if (backing_fmt) {
        backing_drv = bdrv_find_format(backing_fmt);
        if (!backing_drv) {
            error_setg(errp, "Unknown backing file format '%s'",
                       backing_fmt);
            goto out;
        }
    }

    // The size for the image must always be specified, with one exception:
    // If we are using a backing file, we can obtain the size from there
    size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
    if (size == -1) {
        if (backing_file) {
            BlockDriverState *bs;
            char *full_backing = g_new0(char, PATH_MAX);
            /* NOTE(review): this inner 'size' shadows the outer one (caught
             * by -Wshadow); harmless here since the outer value is -1 and
             * no longer read, but worth renaming. */
            int64_t size;
            int back_flags;

            bdrv_get_full_backing_filename_from_filename(filename, backing_file,
                                                         full_backing, PATH_MAX,
                                                         &local_err);
            if (local_err) {
                g_free(full_backing);
                goto out;
            }

            /* backing files always opened read-only */
            back_flags =
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

            bs = NULL;
            ret = bdrv_open(&bs, full_backing, NULL, NULL, back_flags,
                            backing_drv, &local_err);
            g_free(full_backing);
            if (ret < 0) {
                goto out;
            }
            size = bdrv_getlength(bs);
            if (size < 0) {
                error_setg_errno(errp, -size, "Could not get size of '%s'",
                                 backing_file);
                bdrv_unref(bs);
                goto out;
            }

            qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size, &error_abort);

            bdrv_unref(bs);
        } else {
            error_setg(errp, "Image creation needs a size parameter");
            goto out;
        }
    }

    if (!quiet) {
        printf("Formatting '%s', fmt=%s", filename, fmt);
        qemu_opts_print(opts, " ");
        puts("");
    }

    ret = bdrv_create(drv, filename, opts, &local_err);

    if (ret == -EFBIG) {
        /* This is generally a better message than whatever the driver would
         * deliver (especially because of the cluster_size_hint), since that
         * is most probably not much different from "image too large". */
        const char *cluster_size_hint = "";
        if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
            cluster_size_hint = " (try using a larger cluster size)";
        }
        error_setg(errp, "The image size is too large for file format '%s'"
                   "%s", fmt, cluster_size_hint);
        error_free(local_err);
        local_err = NULL;
    }

out:
    qemu_opts_del(opts);
    qemu_opts_free(create_opts);
    if (local_err) {
        error_propagate(errp, local_err);
    }
}
85d126f3
SH
5988
5989AioContext *bdrv_get_aio_context(BlockDriverState *bs)
5990{
dcd04228
SH
5991 return bs->aio_context;
5992}
5993
/* Detach @bs (and recursively its file/backing children) from its current
 * AioContext: notify registered AioContext notifiers, detach throttling
 * timers, and give the driver a chance to clean up.  Leaves
 * bs->aio_context NULL until bdrv_attach_aio_context is called. */
void bdrv_detach_aio_context(BlockDriverState *bs)
{
    BdrvAioNotifier *baf;

    if (!bs->drv) {
        return;
    }

    /* Tell interested parties (e.g. dataplane) before tearing down */
    QLIST_FOREACH(baf, &bs->aio_notifiers, list) {
        baf->detach_aio_context(baf->opaque);
    }

    if (bs->io_limits_enabled) {
        throttle_detach_aio_context(&bs->throttle_state);
    }
    if (bs->drv->bdrv_detach_aio_context) {
        bs->drv->bdrv_detach_aio_context(bs);
    }
    if (bs->file) {
        bdrv_detach_aio_context(bs->file);
    }
    if (bs->backing_hd) {
        bdrv_detach_aio_context(bs->backing_hd);
    }

    bs->aio_context = NULL;
}
6021
6022void bdrv_attach_aio_context(BlockDriverState *bs,
6023 AioContext *new_context)
6024{
33384421
HR
6025 BdrvAioNotifier *ban;
6026
dcd04228
SH
6027 if (!bs->drv) {
6028 return;
6029 }
6030
6031 bs->aio_context = new_context;
6032
6033 if (bs->backing_hd) {
6034 bdrv_attach_aio_context(bs->backing_hd, new_context);
6035 }
6036 if (bs->file) {
6037 bdrv_attach_aio_context(bs->file, new_context);
6038 }
6039 if (bs->drv->bdrv_attach_aio_context) {
6040 bs->drv->bdrv_attach_aio_context(bs, new_context);
6041 }
13af91eb
SH
6042 if (bs->io_limits_enabled) {
6043 throttle_attach_aio_context(&bs->throttle_state, new_context);
6044 }
33384421
HR
6045
6046 QLIST_FOREACH(ban, &bs->aio_notifiers, list) {
6047 ban->attached_aio_context(new_context, ban->opaque);
6048 }
dcd04228
SH
6049}
6050
/* Move @bs from its current AioContext to @new_context.
 * Must run in the old context's thread; all requests are drained first so
 * nothing is in flight while the node is rebound. */
void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
{
    bdrv_drain_all(); /* ensure there are no in-flight requests */

    bdrv_detach_aio_context(bs);

    /* This function executes in the old AioContext so acquire the new one in
     * case it runs in a different thread.
     */
    aio_context_acquire(new_context);
    bdrv_attach_aio_context(bs, new_context);
    aio_context_release(new_context);
}
d616b224 6064
33384421
HR
6065void bdrv_add_aio_context_notifier(BlockDriverState *bs,
6066 void (*attached_aio_context)(AioContext *new_context, void *opaque),
6067 void (*detach_aio_context)(void *opaque), void *opaque)
6068{
6069 BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
6070 *ban = (BdrvAioNotifier){
6071 .attached_aio_context = attached_aio_context,
6072 .detach_aio_context = detach_aio_context,
6073 .opaque = opaque
6074 };
6075
6076 QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
6077}
6078
6079void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
6080 void (*attached_aio_context)(AioContext *,
6081 void *),
6082 void (*detach_aio_context)(void *),
6083 void *opaque)
6084{
6085 BdrvAioNotifier *ban, *ban_next;
6086
6087 QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
6088 if (ban->attached_aio_context == attached_aio_context &&
6089 ban->detach_aio_context == detach_aio_context &&
6090 ban->opaque == opaque)
6091 {
6092 QLIST_REMOVE(ban, list);
6093 g_free(ban);
6094
6095 return;
6096 }
6097 }
6098
6099 abort();
6100}
6101
d616b224
SH
/* Register a notifier that is invoked before each write to @bs; as a
 * NotifierWithReturn it may fail the request. */
void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}
6f176b48 6107
77485434
HR
6108int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts,
6109 BlockDriverAmendStatusCB *status_cb)
6f176b48 6110{
c282e1fd 6111 if (!bs->drv->bdrv_amend_options) {
6f176b48
HR
6112 return -ENOTSUP;
6113 }
77485434 6114 return bs->drv->bdrv_amend_options(bs, opts, status_cb);
6f176b48 6115}
f6186f49 6116
b5042a36
BC
6117/* This function will be called by the bdrv_recurse_is_first_non_filter method
6118 * of block filter and by bdrv_is_first_non_filter.
6119 * It is used to test if the given bs is the candidate or recurse more in the
6120 * node graph.
212a5a8f 6121 */
b5042a36 6122bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
212a5a8f 6123 BlockDriverState *candidate)
f6186f49 6124{
b5042a36
BC
6125 /* return false if basic checks fails */
6126 if (!bs || !bs->drv) {
212a5a8f 6127 return false;
f6186f49
BC
6128 }
6129
b5042a36
BC
6130 /* the code reached a non block filter driver -> check if the bs is
6131 * the same as the candidate. It's the recursion termination condition.
6132 */
6133 if (!bs->drv->is_filter) {
6134 return bs == candidate;
212a5a8f 6135 }
b5042a36 6136 /* Down this path the driver is a block filter driver */
212a5a8f 6137
b5042a36
BC
6138 /* If the block filter recursion method is defined use it to recurse down
6139 * the node graph.
6140 */
6141 if (bs->drv->bdrv_recurse_is_first_non_filter) {
212a5a8f 6142 return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
f6186f49
BC
6143 }
6144
b5042a36
BC
6145 /* the driver is a block filter but don't allow to recurse -> return false
6146 */
6147 return false;
f6186f49
BC
6148}
6149
212a5a8f
BC
6150/* This function checks if the candidate is the first non filter bs down it's
6151 * bs chain. Since we don't have pointers to parents it explore all bs chains
6152 * from the top. Some filters can choose not to pass down the recursion.
6153 */
6154bool bdrv_is_first_non_filter(BlockDriverState *candidate)
f6186f49 6155{
212a5a8f
BC
6156 BlockDriverState *bs;
6157
6158 /* walk down the bs forest recursively */
6159 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
6160 bool perm;
6161
b5042a36 6162 /* try to recurse in this top level bs */
e6dc8a1f 6163 perm = bdrv_recurse_is_first_non_filter(bs, candidate);
212a5a8f
BC
6164
6165 /* candidate is the first non filter */
6166 if (perm) {
6167 return true;
6168 }
6169 }
6170
6171 return false;
f6186f49 6172}
09158f00
BC
6173
6174BlockDriverState *check_to_replace_node(const char *node_name, Error **errp)
6175{
6176 BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
5a7e7a0b
SH
6177 AioContext *aio_context;
6178
09158f00
BC
6179 if (!to_replace_bs) {
6180 error_setg(errp, "Node name '%s' not found", node_name);
6181 return NULL;
6182 }
6183
5a7e7a0b
SH
6184 aio_context = bdrv_get_aio_context(to_replace_bs);
6185 aio_context_acquire(aio_context);
6186
09158f00 6187 if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
5a7e7a0b
SH
6188 to_replace_bs = NULL;
6189 goto out;
09158f00
BC
6190 }
6191
6192 /* We don't want arbitrary node of the BDS chain to be replaced only the top
6193 * most non filter in order to prevent data corruption.
6194 * Another benefit is that this tests exclude backing files which are
6195 * blocked by the backing blockers.
6196 */
6197 if (!bdrv_is_first_non_filter(to_replace_bs)) {
6198 error_setg(errp, "Only top most non filter can be replaced");
5a7e7a0b
SH
6199 to_replace_bs = NULL;
6200 goto out;
09158f00
BC
6201 }
6202
5a7e7a0b
SH
6203out:
6204 aio_context_release(aio_context);
09158f00
BC
6205 return to_replace_bs;
6206}
448ad91d
ML
6207
6208void bdrv_io_plug(BlockDriverState *bs)
6209{
6210 BlockDriver *drv = bs->drv;
6211 if (drv && drv->bdrv_io_plug) {
6212 drv->bdrv_io_plug(bs);
6213 } else if (bs->file) {
6214 bdrv_io_plug(bs->file);
6215 }
6216}
6217
6218void bdrv_io_unplug(BlockDriverState *bs)
6219{
6220 BlockDriver *drv = bs->drv;
6221 if (drv && drv->bdrv_io_unplug) {
6222 drv->bdrv_io_unplug(bs);
6223 } else if (bs->file) {
6224 bdrv_io_unplug(bs->file);
6225 }
6226}
6227
6228void bdrv_flush_io_queue(BlockDriverState *bs)
6229{
6230 BlockDriver *drv = bs->drv;
6231 if (drv && drv->bdrv_flush_io_queue) {
6232 drv->bdrv_flush_io_queue(bs);
6233 } else if (bs->file) {
6234 bdrv_flush_io_queue(bs->file);
6235 }
6236}
91af7014
HR
6237
6238static bool append_open_options(QDict *d, BlockDriverState *bs)
6239{
6240 const QDictEntry *entry;
6241 bool found_any = false;
6242
6243 for (entry = qdict_first(bs->options); entry;
6244 entry = qdict_next(bs->options, entry))
6245 {
6246 /* Only take options for this level and exclude all non-driver-specific
6247 * options */
6248 if (!strchr(qdict_entry_key(entry), '.') &&
6249 strcmp(qdict_entry_key(entry), "node-name"))
6250 {
6251 qobject_incref(qdict_entry_value(entry));
6252 qdict_put_obj(d, qdict_entry_key(entry), qdict_entry_value(entry));
6253 found_any = true;
6254 }
6255 }
6256
6257 return found_any;
6258}
6259
/* Updates the following BDS fields:
 *  - exact_filename: A filename which may be used for opening a block device
 *                    which (mostly) equals the given BDS (even without any
 *                    other options; so reading and writing must return the same
 *                    results, but caching etc. may be different)
 *  - full_open_options: Options which, when given when opening a block device
 *                       (without a filename), result in a BDS (mostly)
 *                       equalling the given one
 *  - filename: If exact_filename is set, it is copied here. Otherwise,
 *              full_open_options is converted to a JSON object, prefixed with
 *              "json:" (for use through the JSON pseudo protocol) and put here.
 */
void bdrv_refresh_filename(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    QDict *opts;

    if (!drv) {
        return;
    }

    /* This BDS's file name will most probably depend on its file's name, so
     * refresh that first */
    if (bs->file) {
        bdrv_refresh_filename(bs->file);
    }

    if (drv->bdrv_refresh_filename) {
        /* Obsolete information is of no use here, so drop the old file name
         * information before refreshing it */
        bs->exact_filename[0] = '\0';
        if (bs->full_open_options) {
            QDECREF(bs->full_open_options);
            bs->full_open_options = NULL;
        }

        /* The driver knows best how to describe itself */
        drv->bdrv_refresh_filename(bs);
    } else if (bs->file) {
        /* Try to reconstruct valid information from the underlying file */
        bool has_open_options;

        bs->exact_filename[0] = '\0';
        if (bs->full_open_options) {
            QDECREF(bs->full_open_options);
            bs->full_open_options = NULL;
        }

        opts = qdict_new();
        has_open_options = append_open_options(opts, bs);

        /* If no specific options have been given for this BDS, the filename of
         * the underlying file should suffice for this one as well */
        if (bs->file->exact_filename[0] && !has_open_options) {
            strcpy(bs->exact_filename, bs->file->exact_filename);
        }
        /* Reconstructing the full options QDict is simple for most format block
         * drivers, as long as the full options are known for the underlying
         * file BDS. The full options QDict of that file BDS should somehow
         * contain a representation of the filename, therefore the following
         * suffices without querying the (exact_)filename of this BDS. */
        if (bs->file->full_open_options) {
            qdict_put_obj(opts, "driver",
                          QOBJECT(qstring_from_str(drv->format_name)));
            QINCREF(bs->file->full_open_options);
            qdict_put_obj(opts, "file", QOBJECT(bs->file->full_open_options));

            /* opts ownership is transferred to the BDS */
            bs->full_open_options = opts;
        } else {
            QDECREF(opts);
        }
    } else if (!bs->full_open_options && qdict_size(bs->options)) {
        /* There is no underlying file BDS (at least referenced by BDS.file),
         * so the full options QDict should be equal to the options given
         * specifically for this block device when it was opened (plus the
         * driver specification).
         * Because those options don't change, there is no need to update
         * full_open_options when it's already set. */

        opts = qdict_new();
        append_open_options(opts, bs);
        qdict_put_obj(opts, "driver",
                      QOBJECT(qstring_from_str(drv->format_name)));

        if (bs->exact_filename[0]) {
            /* This may not work for all block protocol drivers (some may
             * require this filename to be parsed), but we have to find some
             * default solution here, so just include it. If some block driver
             * does not support pure options without any filename at all or
             * needs some special format of the options QDict, it needs to
             * implement the driver-specific bdrv_refresh_filename() function.
             */
            qdict_put_obj(opts, "filename",
                          QOBJECT(qstring_from_str(bs->exact_filename)));
        }

        bs->full_open_options = opts;
    }

    /* Finally derive the user-visible filename from the data gathered above */
    if (bs->exact_filename[0]) {
        pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
    } else if (bs->full_open_options) {
        QString *json = qobject_to_json(QOBJECT(bs->full_open_options));
        snprintf(bs->filename, sizeof(bs->filename), "json:%s",
                 qstring_get_str(json));
        QDECREF(json);
    }
}
5366d0c8
BC
6367
/* Accessor that lets device models reach the BlockAcctStats structure
 * embedded in a BlockDriverState without being aware of the
 * BlockDriverState structure layout.
 * It will go away when the BlockAcctStats structure is moved inside the
 * device models. */
BlockAcctStats *bdrv_get_stats(BlockDriverState *bs)
{
    return &bs->stats;
}