/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/block-backend.h"
#include "sysemu/sysemu.h"
#include "sysemu/qtest.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "block/qapi.h"
#include "qmp-commands.h"
#include "qemu/timer.h"
#include "qapi-event.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

struct BdrvDirtyBitmap {
    HBitmap *bitmap;
    char *name;
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

static void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                           int nr_sectors);
static void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                             int nr_sectors);
/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}
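
/*
 * Illustrative usage sketch (not part of the original file): capping a
 * device at roughly 10 MB/s average total throughput. The ThrottleConfig
 * layout (LeakyBucket .avg, indexed by THROTTLE_BPS_TOTAL) is assumed from
 * util/throttle.c of the same era; treat the field names as an assumption.
 */
static void example_limit_to_10mbps(BlockDriverState *bs)
{
    ThrottleConfig cfg;

    memset(&cfg, 0, sizeof(cfg));
    cfg.buckets[THROTTLE_BPS_TOTAL].avg = 10 * 1024 * 1024; /* bytes/sec */

    /* per the comment above bdrv_io_limits_enable(), enable first */
    bdrv_io_limits_enable(bs);
    bdrv_set_io_limits(bs, &cfg);
}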

/* this function drains all the throttled I/Os */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}

static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    int clock_type = QEMU_CLOCK_REALTIME;

    if (qtest_enabled()) {
        /* For testing block IO throttling only */
        clock_type = QEMU_CLOCK_VIRTUAL;
    }
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  bdrv_get_aio_context(bs),
                  clock_type,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}

/* This function makes an I/O request wait if needed
 *
 * @bytes:    the number of bytes of the I/O
 * @is_write: whether the I/O is a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     unsigned int bytes,
                                     bool is_write)
{
    /* does this I/O need to wait? */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or any request of this type is already queued,
     * queue this I/O */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the I/O will be executed, do the accounting */
    throttle_account(&bs->throttle_state, is_write, bytes);

    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}

size_t bdrv_opt_mem_align(BlockDriverState *bs)
{
    if (!bs || !bs->drv) {
        /* 4k should be on the safe side */
        return 4096;
    }

    return bs->bl.opt_mem_alignment;
}
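
/*
 * Illustrative sketch (assumption, not in the original file): allocating
 * an I/O buffer that honors the device's optimal memory alignment.
 */
static void *example_alloc_io_buf(BlockDriverState *bs, size_t len)
{
    /* qemu_memalign() aborts on allocation failure, so no NULL check */
    return qemu_memalign(bdrv_opt_mem_align(bs), len);
}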

/* check if the path starts with "<protocol>:" */
int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}
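
/*
 * Example behavior (illustrative): the protocol prefix must appear before
 * any path separator, so URLs match but plain paths do not.
 *
 *     path_has_protocol("nbd://localhost:10809")  -> non-zero
 *     path_has_protocol("/images/disk.qcow2")     -> 0
 *     path_has_protocol("a/b:c")                  -> 0 ('/' precedes ':')
 */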

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
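
/*
 * Example behavior (illustrative): a relative name is resolved against the
 * directory of the base image, while absolute names are copied unchanged.
 *
 *     char buf[PATH_MAX];
 *     path_combine(buf, sizeof(buf), "/vm/base.qcow2", "backing.qcow2");
 *     // buf == "/vm/backing.qcow2"
 *     path_combine(buf, sizeof(buf), "/vm/base.qcow2", "/abs/other.raw");
 *     // buf == "/abs/other.raw"
 */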

void bdrv_get_full_backing_filename_from_filename(const char *backed,
                                                  const char *backing,
                                                  char *dest, size_t sz,
                                                  Error **errp)
{
    if (backing[0] == '\0' || path_has_protocol(backing) ||
        path_is_absolute(backing))
    {
        pstrcpy(dest, sz, backing);
    } else if (backed[0] == '\0' || strstart(backed, "json:", NULL)) {
        error_setg(errp, "Cannot use relative backing file names for '%s'",
                   backed);
    } else {
        path_combine(dest, sz, backed, backing);
    }
}

void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz,
                                    Error **errp)
{
    char *backed = bs->exact_filename[0] ? bs->exact_filename : bs->filename;

    bdrv_get_full_backing_filename_from_filename(backed, bs->backing_file,
                                                 dest, sz, errp);
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}
b338082b 363
7f06d47e 364BlockDriverState *bdrv_new_root(void)
b338082b 365{
7f06d47e 366 BlockDriverState *bs = bdrv_new();
e4e9986b 367
e4e9986b 368 QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
e4e9986b
MA
369 return bs;
370}
371
372BlockDriverState *bdrv_new(void)
373{
374 BlockDriverState *bs;
375 int i;
376
5839e53b 377 bs = g_new0(BlockDriverState, 1);
e4654d2d 378 QLIST_INIT(&bs->dirty_bitmaps);
fbe40ff7
FZ
379 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
380 QLIST_INIT(&bs->op_blockers[i]);
381 }
28a7282a 382 bdrv_iostatus_disable(bs);
d7d512f6 383 notifier_list_init(&bs->close_notifiers);
d616b224 384 notifier_with_return_list_init(&bs->before_write_notifiers);
cc0681c4
BC
385 qemu_co_queue_init(&bs->throttled_reqs[0]);
386 qemu_co_queue_init(&bs->throttled_reqs[1]);
9fcb0251 387 bs->refcnt = 1;
dcd04228 388 bs->aio_context = qemu_get_aio_context();
d7d512f6 389
b338082b
FB
390 return bs;
391}
392
d7d512f6
PB
393void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
394{
395 notifier_list_add(&bs->close_notifiers, notify);
396}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}
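
/*
 * Illustrative sketch (assumption): probing whether a format driver was
 * registered at all, e.g. before offering "qcow2" as a choice.
 */
static bool example_have_format(const char *name)
{
    return bdrv_find_format(name) != NULL;
}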

static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1; /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QemuOpts *opts;
    int ret;
    Error *err;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err);
    if (local_err) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}

int bdrv_create(BlockDriver *drv, const char* filename,
                QemuOpts *opts, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .opts = opts,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation",
                   drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            aio_poll(qemu_get_aio_context(), true);
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (cco.err) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}

int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true, errp);
    if (drv == NULL) {
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, opts, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}
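
/*
 * Illustrative sketch (assumption; mirrors the temporary-overlay code in
 * bdrv_append_temp_snapshot() below): creating a 1 GiB image using the
 * driver's own creation options.
 */
static int example_create_image(BlockDriver *drv, const char *filename,
                                Error **errp)
{
    QemuOpts *opts;
    int ret;

    opts = qemu_opts_create(drv->create_opts, NULL, 0, &error_abort);
    qemu_opt_set_number(opts, BLOCK_OPT_SIZE, 1024 * 1024 * 1024,
                        &error_abort);
    ret = bdrv_create(drv, filename, opts, errp);
    qemu_opts_del(opts);
    return ret;
}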

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    } else {
        bs->bl.opt_mem_alignment = 512;
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing_hd->bl.opt_transfer_length);
        bs->bl.max_transfer_length =
            MIN_NON_ZERO(bs->bl.max_transfer_length,
                         bs->backing_hd->bl.max_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing_hd->bl.opt_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}

/**
 * Try to get @bs's logical and physical block size.
 * On success, store them in @bsz struct and return 0.
 * On failure return -errno.
 * @bs must not be empty.
 */
int bdrv_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_probe_blocksizes) {
        return drv->bdrv_probe_blocksizes(bs, bsz);
    }

    return -ENOTSUP;
}

/**
 * Try to get @bs's geometry (cyls, heads, sectors).
 * On success, store them in @geo struct and return 0.
 * On failure return -errno.
 * @bs must not be empty.
 */
int bdrv_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_probe_geometry) {
        return drv->bdrv_probe_geometry(bs, geo);
    }

    return -ENOTSUP;
}

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater. */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}
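
/*
 * Usage note (illustrative): callers pass a buffer of at least PATH_MAX + 1
 * bytes, as the snapshot code later in this file does:
 *
 *     char *tmp_filename = g_malloc0(PATH_MAX + 1);
 *     ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
 *     if (ret < 0) {
 *         // negative errno, e.g. -EOVERFLOW if $TMPDIR is too long
 *     }
 */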

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix,
                                Error **errp)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return &bdrv_file;
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }

    error_setg(errp, "Unknown protocol '%s'", protocol);
    return NULL;
}

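/*
 * Example behavior (illustrative), with allow_protocol_prefix = true:
 *
 *     bdrv_find_protocol("/dev/cdrom", true, NULL)       -> host device driver
 *     bdrv_find_protocol("nbd://host:10809", true, NULL) -> NBD driver
 *     bdrv_find_protocol("disk.img", true, NULL)         -> &bdrv_file
 */
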
/*
 * Guess image format by probing its contents.
 * This is not a good idea when your image is raw (CVE-2008-2004), but
 * we do it anyway for backward compatibility.
 *
 * @buf contains the image's first @buf_size bytes.
 * @buf_size is the buffer size in bytes (generally BLOCK_PROBE_BUF_SIZE,
 * but can be smaller if the image file is smaller)
 * @filename is its filename.
 *
 * For all block drivers, call the bdrv_probe() method to get its
 * probing score.
 * Return the first block driver with the highest probing score.
 */
BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
                            const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe) {
            score = d->bdrv_probe(buf, buf_size, filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    BlockDriver *drv;
    uint8_t buf[BLOCK_PROBE_BUF_SIZE];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        *pdrv = &bdrv_raw;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    drv = bdrv_probe_all(buf, ret, filename);
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 * Return 0 on success, -errno on error.
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}

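/*
 * Illustrative sketch (assumption): translating a user-supplied discard
 * mode into BDRV_O_* flags, as a -drive option parser might.
 */
static int example_apply_discard_mode(const char *mode, int *flags)
{
    if (bdrv_parse_discard_flags(mode, flags) != 0) {
        error_report("invalid discard option value '%s'", mode);
        return -EINVAL;
    }
    return 0;
}
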
/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}

851
53fec9d3
SH
852/**
853 * The copy-on-read flag is actually a reference count so multiple users may
854 * use the feature without worrying about clobbering its previous state.
855 * Copy-on-read stays enabled until all users have called to disable it.
856 */
857void bdrv_enable_copy_on_read(BlockDriverState *bs)
858{
859 bs->copy_on_read++;
860}
861
862void bdrv_disable_copy_on_read(BlockDriverState *bs)
863{
864 assert(bs->copy_on_read > 0);
865 bs->copy_on_read--;
866}
867
b1e6fc08
KW
868/*
869 * Returns the flags that a temporary snapshot should get, based on the
870 * originally requested flags (the originally requested image will have flags
871 * like a backing file)
872 */
873static int bdrv_temp_snapshot_flags(int flags)
874{
875 return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
876}
877
/*
 * Returns the flags that bs->file should get, based on the given flags for
 * the parent BDS
 */
static int bdrv_inherited_flags(int flags)
{
    /* Enable protocol handling, disable format probing for bs->file */
    flags |= BDRV_O_PROTOCOL;

    /* Our block drivers take care to send flushes and respect unmap policy,
     * so we can enable both unconditionally on lower layers. */
    flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;

    /* Clear flags that only apply to the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);

    return flags;
}

/*
 * Returns the flags that bs->backing_hd should get, based on the given flags
 * for the parent BDS
 */
static int bdrv_backing_flags(int flags)
{
    /* backing files always opened read-only */
    flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);

    /* snapshot=on is handled on the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);

    return flags;
}

static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);

    /*
     * Snapshots should be writable.
     */
    if (flags & BDRV_O_TEMPORARY) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}

931
636ea370
KW
932static void bdrv_assign_node_name(BlockDriverState *bs,
933 const char *node_name,
934 Error **errp)
6913c0c2
BC
935{
936 if (!node_name) {
636ea370 937 return;
6913c0c2
BC
938 }
939
9aebf3b8 940 /* Check for empty string or invalid characters */
f5bebbbb 941 if (!id_wellformed(node_name)) {
9aebf3b8 942 error_setg(errp, "Invalid node name");
636ea370 943 return;
6913c0c2
BC
944 }
945
0c5e94ee 946 /* takes care of avoiding namespaces collisions */
7f06d47e 947 if (blk_by_name(node_name)) {
0c5e94ee
BC
948 error_setg(errp, "node-name=%s is conflicting with a device id",
949 node_name);
636ea370 950 return;
0c5e94ee
BC
951 }
952
6913c0c2
BC
953 /* takes care of avoiding duplicates node names */
954 if (bdrv_find_node(node_name)) {
955 error_setg(errp, "Duplicate node name");
636ea370 956 return;
6913c0c2
BC
957 }
958
959 /* copy node name into the bs and insert it into the graph list */
960 pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
961 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
6913c0c2
BC
962}
963
/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        return -EINVAL;
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    node_name = qdict_get_try_str(options, "node-name");
    bdrv_assign_node_name(bs, node_name, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }
    qdict_del(options, "node-name");

    /* bdrv_open() was called directly with a protocol driver as drv. This
     * layer is already opened, so assign it to bs (while file becomes a
     * closed BlockDriverState) and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                   ? "Driver '%s' can only be used for read-only devices"
                   : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }
    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    if (bs->encrypted) {
        error_report("Encrypted images are deprecated");
        error_printf("Support for them will be removed in a future release.\n"
                     "You can use 'qemu-img convert' to convert your image"
                     " to an unencrypted one.\n");
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto free_and_fail;
    }

    assert(bdrv_opt_mem_align(bs) != 0);
    assert((bs->request_alignment != 0) || bs->sg);
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

static QDict *parse_json_filename(const char *filename, Error **errp)
{
    QObject *options_obj;
    QDict *options;
    int ret;

    ret = strstart(filename, "json:", &filename);
    assert(ret);

    options_obj = qobject_from_json(filename);
    if (!options_obj) {
        error_setg(errp, "Could not parse the JSON options");
        return NULL;
    }

    if (qobject_type(options_obj) != QTYPE_QDICT) {
        qobject_decref(options_obj);
        error_setg(errp, "Invalid JSON object given");
        return NULL;
    }

    options = qobject_to_qdict(options_obj);
    qdict_flatten(options);

    return options;
}

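/*
 * Example (illustrative): the "json:" pseudo-protocol embeds the options
 * QDict directly in the filename; after qdict_flatten() the keys use
 * dotted notation:
 *
 *     json:{"driver": "qcow2", "file": {"filename": "/vm/disk.qcow2"}}
 *
 * flattens to {"driver": "qcow2", "file.filename": "/vm/disk.qcow2"}.
 */
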
/*
 * Fills in default options for opening images and converts the legacy
 * filename/flags pair to option QDict entries.
 */
static int bdrv_fill_options(QDict **options, const char **pfilename, int flags,
                             BlockDriver *drv, Error **errp)
{
    const char *filename = *pfilename;
    const char *drvname;
    bool protocol = flags & BDRV_O_PROTOCOL;
    bool parse_filename = false;
    Error *local_err = NULL;

    /* Parse json: pseudo-protocol */
    if (filename && g_str_has_prefix(filename, "json:")) {
        QDict *json_options = parse_json_filename(filename, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        /* Options given in the filename have lower priority than options
         * specified directly */
        qdict_join(*options, json_options, false);
        QDECREF(json_options);
        *pfilename = filename = NULL;
    }

    /* Fetch the file name from the options QDict if necessary */
    if (protocol && filename) {
        if (!qdict_haskey(*options, "filename")) {
            qdict_put(*options, "filename", qstring_from_str(filename));
            parse_filename = true;
        } else {
            error_setg(errp, "Can't specify 'file' and 'filename' options at "
                       "the same time");
            return -EINVAL;
        }
    }

    /* Find the right block driver */
    filename = qdict_get_try_str(*options, "filename");
    drvname = qdict_get_try_str(*options, "driver");

    if (drv) {
        if (drvname) {
            error_setg(errp, "Driver specified twice");
            return -EINVAL;
        }
        drvname = drv->format_name;
        qdict_put(*options, "driver", qstring_from_str(drvname));
    } else {
        if (!drvname && protocol) {
            if (filename) {
                drv = bdrv_find_protocol(filename, parse_filename, errp);
                if (!drv) {
                    return -EINVAL;
                }

                drvname = drv->format_name;
                qdict_put(*options, "driver", qstring_from_str(drvname));
            } else {
                error_setg(errp, "Must specify either driver or file");
                return -EINVAL;
            }
        } else if (drvname) {
            drv = bdrv_find_format(drvname);
            if (!drv) {
                error_setg(errp, "Unknown driver '%s'", drvname);
                return -ENOENT;
            }
        }
    }

    assert(drv || !protocol);

    /* Driver-specific filename parsing */
    if (drv && drv->bdrv_parse_filename && parse_filename) {
        drv->bdrv_parse_filename(filename, *options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        if (!drv->bdrv_needs_filename) {
            qdict_del(*options, "filename");
        }
    }

    return 0;
}

void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
{
    if (bs->backing_hd) {
        assert(bs->backing_blocker);
        bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
    } else if (backing_hd) {
        error_setg(&bs->backing_blocker,
                   "node is used as backing hd of '%s'",
                   bdrv_get_device_or_node_name(bs));
    }

    bs->backing_hd = backing_hd;
    if (!backing_hd) {
        error_free(bs->backing_blocker);
        bs->backing_blocker = NULL;
        goto out;
    }
    bs->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
    pstrcpy(bs->backing_format, sizeof(bs->backing_format),
            backing_hd->drv ? backing_hd->drv->format_name : "");

    bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
    /* Otherwise we won't be able to commit due to check in bdrv_commit */
    bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT_TARGET,
                    bs->backing_blocker);
out:
    bdrv_refresh_limits(bs, NULL);
}

/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the dictionary,
 * it needs to use QINCREF() before calling bdrv_file_open.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char *backing_filename = g_malloc0(PATH_MAX);
    int ret = 0;
    BlockDriverState *backing_hd;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        goto free_exit;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        goto free_exit;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX,
                                       &local_err);
        if (local_err) {
            ret = -EINVAL;
            error_propagate(errp, local_err);
            QDECREF(options);
            goto free_exit;
        }
    }

    if (!bs->drv || !bs->drv->supports_backing) {
        ret = -EINVAL;
        error_setg(errp, "Driver doesn't support backing files");
        QDECREF(options);
        goto free_exit;
    }

    backing_hd = bdrv_new();

    if (bs->backing_format[0] != '\0' && !qdict_haskey(options, "driver")) {
        qdict_put(options, "driver", qstring_from_str(bs->backing_format));
    }

    assert(bs->backing_hd == NULL);
    ret = bdrv_open(&backing_hd,
                    *backing_filename ? backing_filename : NULL, NULL, options,
                    bdrv_backing_flags(bs->open_flags), NULL, &local_err);
    if (ret < 0) {
        bdrv_unref(backing_hd);
        backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        goto free_exit;
    }
    bdrv_set_backing_hd(bs, backing_hd);

free_exit:
    g_free(backing_filename);
    return ret;
}

/*
 * Opens a disk image whose options are given as BlockdevRef in another block
 * device's options.
 *
 * If allow_none is true, no image will be opened if filename is NULL and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
 * itself, all options starting with "${bdref_key}." are considered part of the
 * BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 *
 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    assert(pbs);
    assert(*pbs == NULL);

    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        QDECREF(image_options);
        goto done;
    }

    ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);

done:
    qdict_del(options, bdref_key);
    return ret;
}

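/*
 * Example (illustrative): with a flattened options QDict such as
 *
 *     {"file.driver": "file", "file.filename": "/vm/disk.raw"}
 *
 * the child node under the "file" key can be opened the same way
 * bdrv_open() itself does below:
 *
 *     BlockDriverState *file = NULL;
 *     ret = bdrv_open_image(&file, filename, options, "file",
 *                           bdrv_inherited_flags(flags), true, &local_err);
 */
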
int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
{
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char *tmp_filename = g_malloc0(PATH_MAX + 1);
    int64_t total_size;
    QemuOpts *opts = NULL;
    QDict *snapshot_options;
    BlockDriverState *bs_snapshot;
    Error *local_err;
    int ret;

    /* if snapshot, we create a temporary backing file and open it
       instead of opening 'filename' directly */

    /* Get the required size from the image */
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        ret = total_size;
        error_setg_errno(errp, -total_size, "Could not get image size");
        goto out;
    }

    /* Create the temporary image */
    ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not get temporary filename");
        goto out;
    }

    opts = qemu_opts_create(bdrv_qcow2.create_opts, NULL, 0,
                            &error_abort);
    qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size, &error_abort);
    ret = bdrv_create(&bdrv_qcow2, tmp_filename, opts, &local_err);
    qemu_opts_del(opts);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not create temporary overlay "
                         "'%s': %s", tmp_filename,
                         error_get_pretty(local_err));
        error_free(local_err);
        goto out;
    }

    /* Prepare a new options QDict for the temporary file */
    snapshot_options = qdict_new();
    qdict_put(snapshot_options, "file.driver",
              qstring_from_str("file"));
    qdict_put(snapshot_options, "file.filename",
              qstring_from_str(tmp_filename));

    bs_snapshot = bdrv_new();

    ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
                    flags, &bdrv_qcow2, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto out;
    }

    bdrv_append(bs_snapshot, bs);

out:
    g_free(tmp_filename);
    return ret;
}

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 *
 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
 * If it is not NULL, the referenced BDS will be reused.
 *
 * The reference parameter may be used to specify an existing block device which
 * should be opened. If specified, neither options nor a filename may be given,
 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
 */
int bdrv_open(BlockDriverState **pbs, const char *filename,
              const char *reference, QDict *options, int flags,
              BlockDriver *drv, Error **errp)
{
    int ret;
    BlockDriverState *file = NULL, *bs;
    const char *drvname;
    Error *local_err = NULL;
    int snapshot_flags = 0;

    assert(pbs);

    if (reference) {
        bool options_non_empty = options ? qdict_size(options) : false;
        QDECREF(options);

        if (*pbs) {
            error_setg(errp, "Cannot reuse an existing BDS when referencing "
                       "another block device");
            return -EINVAL;
        }

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    if (*pbs) {
        bs = *pbs;
    } else {
        bs = bdrv_new();
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    ret = bdrv_fill_options(&options, &filename, flags, drv, &local_err);
    if (local_err) {
        goto fail;
    }

    /* Find the right image format driver */
    drv = NULL;
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Unknown driver: '%s'", drvname);
            ret = -EINVAL;
            goto fail;
        }
    }

    assert(drvname || !(flags & BDRV_O_PROTOCOL));
    if (drv && !drv->bdrv_file_open) {
        /* If the user explicitly wants a format driver here, we'll need to add
         * another layer for the protocol in bs->file */
        flags &= ~BDRV_O_PROTOCOL;
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Open image file without format layer */
    if ((flags & BDRV_O_PROTOCOL) == 0) {
        if (flags & BDRV_O_RDWR) {
            flags |= BDRV_O_ALLOW_RDWR;
        }
        if (flags & BDRV_O_SNAPSHOT) {
            snapshot_flags = bdrv_temp_snapshot_flags(flags);
            flags = bdrv_backing_flags(flags);
        }

        assert(file == NULL);
        ret = bdrv_open_image(&file, filename, options, "file",
                              bdrv_inherited_flags(flags),
                              true, &local_err);
        if (ret < 0) {
            goto fail;
        }
    }

    /* Image format probing */
    bs->probed = !drv;
    if (!drv && file) {
        ret = find_image_format(file, filename, &drv, &local_err);
        if (ret < 0) {
            goto fail;
        }
    } else if (!drv) {
        error_setg(errp, "Must specify either driver or file");
        ret = -EINVAL;
        goto fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto fail;
    }

    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    bdrv_refresh_filename(bs);

    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
     * temporary snapshot afterwards. */
    if (snapshot_flags) {
        ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
        if (local_err) {
            goto close_and_fail;
        }
    }

    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        if (flags & BDRV_O_PROTOCOL) {
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
            error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                       "support the option '%s'", drv->format_name,
                       bdrv_get_device_name(bs), entry->key);
        }

        ret = -EINVAL;
        goto close_and_fail;
    }

    if (!bdrv_key_required(bs)) {
        if (bs->blk) {
            blk_dev_change_media_cb(bs->blk, true);
        }
    } else if (!runstate_check(RUN_STATE_PRELAUNCH)
               && !runstate_check(RUN_STATE_INMIGRATE)
               && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
        error_setg(errp,
                   "Guest must be stopped for opening of encrypted image");
        ret = -EBUSY;
        goto close_and_fail;
    }

    QDECREF(options);
    *pbs = bs;
    return 0;

fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (!*pbs) {
        /* If *pbs is NULL, a new BDS has been created in this function and
           needs to be freed now. Otherwise, it does not need to be closed,
           since it has not really been opened yet. */
        bdrv_unref(bs);
    }
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    /* See fail path, but now the BDS has to be always closed */
    if (*pbs) {
        bdrv_close(bs);
    } else {
        bdrv_unref(bs);
    }
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}

typedef struct BlockReopenQueueEntry {
    bool prepared;
    BDRVReopenState state;
    QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLE_INIT
 * already performed, or alternatively may be NULL, in which case a new
 * BlockReopenQueue will be created and initialized. This newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    /* bdrv_open() masks this flag out */
    flags &= ~BDRV_O_PROTOCOL;

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}

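/*
 * Illustrative sketch (assumption): queueing two devices and reopening
 * them as one atomic set; bdrv_reopen() below is the single-device case.
 */
static int example_reopen_pair(BlockDriverState *a, BlockDriverState *b,
                               int flags, Error **errp)
{
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, a, flags);

    queue = bdrv_reopen_queue(queue, b, flags);
    /* bdrv_reopen_multiple() consumes and frees the queue */
    return bdrv_reopen_multiple(queue, errp);
}
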
1722/*
1723 * Reopen multiple BlockDriverStates atomically & transactionally.
1724 *
1725 * The queue passed in (bs_queue) must have been built up previous
1726 * via bdrv_reopen_queue().
1727 *
1728 * Reopens all BDS specified in the queue, with the appropriate
1729 * flags. All devices are prepared for reopen, and failure of any
1730 * device will cause all device changes to be abandonded, and intermediate
1731 * data cleaned up.
1732 *
1733 * If all devices prepare successfully, then the changes are committed
1734 * to all devices.
1735 *
1736 */
1737int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
1738{
1739 int ret = -1;
1740 BlockReopenQueueEntry *bs_entry, *next;
1741 Error *local_err = NULL;
1742
1743 assert(bs_queue != NULL);
1744
1745 bdrv_drain_all();
1746
1747 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1748 if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
1749 error_propagate(errp, local_err);
1750 goto cleanup;
1751 }
1752 bs_entry->prepared = true;
1753 }
1754
1755 /* If we reach this point, we have success and just need to apply the
1756 * changes
1757 */
1758 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1759 bdrv_reopen_commit(&bs_entry->state);
1760 }
1761
1762 ret = 0;
1763
1764cleanup:
1765 QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
1766 if (ret && bs_entry->prepared) {
1767 bdrv_reopen_abort(&bs_entry->state);
1768 }
1769 g_free(bs_entry);
1770 }
1771 g_free(bs_queue);
1772 return ret;
1773}
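/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the intended calling pattern is to queue every BDS belonging to one
 * atomic 'set' and then apply the whole set in a single call. The names
 * below are hypothetical; bs_a and bs_b are assumed to be open.
 */
static int example_reopen_pair(BlockDriverState *bs_a, BlockDriverState *bs_b,
                               int new_flags, Error **errp)
{
    BlockReopenQueue *queue;

    queue = bdrv_reopen_queue(NULL, bs_a, new_flags);
    queue = bdrv_reopen_queue(queue, bs_b, new_flags);

    /* Prepares all entries, then commits them all or aborts them all;
     * bdrv_reopen_multiple() frees the queue in either case. */
    return bdrv_reopen_multiple(queue, errp);
}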
1774
1775
1776/* Reopen a single BlockDriverState with the specified flags. */
1777int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1778{
1779 int ret = -1;
1780 Error *local_err = NULL;
1781 BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1782
1783 ret = bdrv_reopen_multiple(queue, &local_err);
1784 if (local_err != NULL) {
1785 error_propagate(errp, local_err);
1786 }
1787 return ret;
1788}
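/*
 * Illustrative sketch (editor's addition): bdrv_commit() below uses this
 * exact pattern to temporarily upgrade a read-only backing file to r/w.
 * Whether the upgrade is allowed is decided by the BDRV_O_ALLOW_RDWR
 * check in bdrv_reopen_prepare().
 */
static int example_make_writable(BlockDriverState *bs, Error **errp)
{
    return bdrv_reopen(bs, bs->open_flags | BDRV_O_RDWR, errp);
}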
1789
1790
1791/*
1792 * Prepares a BlockDriverState for reopen. All changes are staged in the
1793 * 'opaque' field of the BDRVReopenState, which is used and allocated by
1794 * the block driver's .bdrv_reopen_prepare() callback
1795 *
1796 * bs is the BlockDriverState to reopen
1797 * flags are the new open flags
1798 * queue is the reopen queue
1799 *
1800 * Returns 0 on success, non-zero on error. On error errp will be set
1801 * as well.
1802 *
1803 * On failure, bdrv_reopen_abort() will be called to clean up any data.
1804 * It is then the responsibility of the caller to call bdrv_reopen_abort() or
1805 * bdrv_reopen_commit() for any other BDS that has been left in a prepared state
1806 *
1807 */
1808int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1809 Error **errp)
1810{
1811 int ret = -1;
1812 Error *local_err = NULL;
1813 BlockDriver *drv;
1814
1815 assert(reopen_state != NULL);
1816 assert(reopen_state->bs->drv != NULL);
1817 drv = reopen_state->bs->drv;
1818
1819 /* if we are to stay read-only, do not allow permission change
1820 * to r/w */
1821 if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1822 reopen_state->flags & BDRV_O_RDWR) {
81e5f78a
AG
1823 error_setg(errp, "Node '%s' is read only",
1824 bdrv_get_device_or_node_name(reopen_state->bs));
e971aa12
JC
1825 goto error;
1826 }
1827
1828
1829 ret = bdrv_flush(reopen_state->bs);
1830 if (ret) {
1831 error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1832 strerror(-ret));
1833 goto error;
1834 }
1835
1836 if (drv->bdrv_reopen_prepare) {
1837 ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1838 if (ret) {
1839 if (local_err != NULL) {
1840 error_propagate(errp, local_err);
1841 } else {
d8b6895f
LC
1842 error_setg(errp, "failed while preparing to reopen image '%s'",
1843 reopen_state->bs->filename);
e971aa12
JC
1844 }
1845 goto error;
1846 }
1847 } else {
1848 /* It is currently mandatory to have a bdrv_reopen_prepare()
1849 * handler for each supported drv. */
81e5f78a
AG
1850 error_setg(errp, "Block format '%s' used by node '%s' "
1851 "does not support reopening files", drv->format_name,
1852 bdrv_get_device_or_node_name(reopen_state->bs));
e971aa12
JC
1853 ret = -1;
1854 goto error;
1855 }
1856
1857 ret = 0;
1858
1859error:
1860 return ret;
1861}
1862
1863/*
1864 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1865 * makes them final by swapping the staging BlockDriverState contents into
1866 * the active BlockDriverState contents.
1867 */
1868void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1869{
1870 BlockDriver *drv;
1871
1872 assert(reopen_state != NULL);
1873 drv = reopen_state->bs->drv;
1874 assert(drv != NULL);
1875
1876 /* If there are any driver level actions to take */
1877 if (drv->bdrv_reopen_commit) {
1878 drv->bdrv_reopen_commit(reopen_state);
1879 }
1880
1881 /* set BDS specific flags now */
1882 reopen_state->bs->open_flags = reopen_state->flags;
1883 reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1884 BDRV_O_CACHE_WB);
1885 reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
355ef4ac 1886
3baca891 1887 bdrv_refresh_limits(reopen_state->bs, NULL);
e971aa12
JC
1888}
1889
1890/*
1891 * Abort the reopen, and delete and free the staged changes in
1892 * reopen_state
1893 */
1894void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1895{
1896 BlockDriver *drv;
1897
1898 assert(reopen_state != NULL);
1899 drv = reopen_state->bs->drv;
1900 assert(drv != NULL);
1901
1902 if (drv->bdrv_reopen_abort) {
1903 drv->bdrv_reopen_abort(reopen_state);
1904 }
1905}
1906
1907
fc01f7e7
FB
1908void bdrv_close(BlockDriverState *bs)
1909{
33384421
HR
1910 BdrvAioNotifier *ban, *ban_next;
1911
3cbc002c
PB
1912 if (bs->job) {
1913 block_job_cancel_sync(bs->job);
1914 }
58fda173
SH
1915 bdrv_drain_all(); /* complete I/O */
1916 bdrv_flush(bs);
1917 bdrv_drain_all(); /* in case flush left pending I/O */
d7d512f6 1918 notifier_list_notify(&bs->close_notifiers, bs);
7094f12f 1919
3cbc002c 1920 if (bs->drv) {
557df6ac 1921 if (bs->backing_hd) {
826b6ca0
FZ
1922 BlockDriverState *backing_hd = bs->backing_hd;
1923 bdrv_set_backing_hd(bs, NULL);
1924 bdrv_unref(backing_hd);
557df6ac 1925 }
ea2384d3 1926 bs->drv->bdrv_close(bs);
7267c094 1927 g_free(bs->opaque);
ea2384d3
FB
1928 bs->opaque = NULL;
1929 bs->drv = NULL;
53fec9d3 1930 bs->copy_on_read = 0;
a275fa42
PB
1931 bs->backing_file[0] = '\0';
1932 bs->backing_format[0] = '\0';
6405875c
PB
1933 bs->total_sectors = 0;
1934 bs->encrypted = 0;
1935 bs->valid_key = 0;
1936 bs->sg = 0;
0d51b4de 1937 bs->zero_beyond_eof = false;
de9c0cec
KW
1938 QDECREF(bs->options);
1939 bs->options = NULL;
91af7014
HR
1940 QDECREF(bs->full_open_options);
1941 bs->full_open_options = NULL;
b338082b 1942
66f82cee 1943 if (bs->file != NULL) {
4f6fd349 1944 bdrv_unref(bs->file);
0ac9377d 1945 bs->file = NULL;
66f82cee 1946 }
b338082b 1947 }
98f90dba 1948
a7f53e26
MA
1949 if (bs->blk) {
1950 blk_dev_change_media_cb(bs->blk, false);
1951 }
9ca11154 1952
98f90dba
ZYW
1953 /*throttling disk I/O limits*/
1954 if (bs->io_limits_enabled) {
1955 bdrv_io_limits_disable(bs);
1956 }
33384421
HR
1957
1958 QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
1959 g_free(ban);
1960 }
1961 QLIST_INIT(&bs->aio_notifiers);
b338082b
FB
1962}
1963
2bc93fed
MK
1964void bdrv_close_all(void)
1965{
1966 BlockDriverState *bs;
1967
dc364f4c 1968 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
ed78cda3
SH
1969 AioContext *aio_context = bdrv_get_aio_context(bs);
1970
1971 aio_context_acquire(aio_context);
2bc93fed 1972 bdrv_close(bs);
ed78cda3 1973 aio_context_release(aio_context);
2bc93fed
MK
1974 }
1975}
1976
88266f5a
SH
1977/* Check if any requests are in-flight (including throttled requests) */
1978static bool bdrv_requests_pending(BlockDriverState *bs)
1979{
1980 if (!QLIST_EMPTY(&bs->tracked_requests)) {
1981 return true;
1982 }
cc0681c4
BC
1983 if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1984 return true;
1985 }
1986 if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
88266f5a
SH
1987 return true;
1988 }
1989 if (bs->file && bdrv_requests_pending(bs->file)) {
1990 return true;
1991 }
1992 if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1993 return true;
1994 }
1995 return false;
1996}
1997
5b98db0a
SH
1998static bool bdrv_drain_one(BlockDriverState *bs)
1999{
2000 bool bs_busy;
2001
2002 bdrv_flush_io_queue(bs);
2003 bdrv_start_throttled_reqs(bs);
2004 bs_busy = bdrv_requests_pending(bs);
2005 bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy);
2006 return bs_busy;
2007}
2008
2009/*
2010 * Wait for pending requests to complete on a single BlockDriverState subtree
2011 *
2012 * See the warning in bdrv_drain_all(). This function can only be called if
2013 * you are sure nothing can generate I/O because you have op blockers
2014 * installed.
2015 *
2016 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
2017 * AioContext.
2018 */
2019void bdrv_drain(BlockDriverState *bs)
2020{
2021 while (bdrv_drain_one(bs)) {
2022 /* Keep iterating */
2023 }
2024}
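/*
 * Illustrative sketch (editor's addition): per the comment above, a caller
 * must hold the BDS's AioContext while draining a single subtree. Names
 * are hypothetical.
 */
static void example_drain_one_subtree(BlockDriverState *bs)
{
    AioContext *aio_context = bdrv_get_aio_context(bs);

    aio_context_acquire(aio_context);
    bdrv_drain(bs); /* loops until bdrv_drain_one() reports no activity */
    aio_context_release(aio_context);
}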
2025
922453bc
SH
2026/*
2027 * Wait for pending requests to complete across all BlockDriverStates
2028 *
2029 * This function does not flush data to disk, use bdrv_flush_all() for that
2030 * after calling this function.
4c355d53
ZYW
2031 *
2032 * Note that completion of an asynchronous I/O operation can trigger any
2033 * number of other I/O operations on other devices---for example a coroutine
2034 * can be arbitrarily complex and a constant flow of I/O can come until the
2035 * coroutine is complete. Because of this, draining one device's I/O queue
2036 * in isolation is not sufficient; all devices must be drained together.
922453bc
SH
2037 */
2038void bdrv_drain_all(void)
2039{
88266f5a
SH
2040 /* Always run first iteration so any pending completion BHs run */
2041 bool busy = true;
922453bc
SH
2042 BlockDriverState *bs;
2043
69da3b0b
FZ
2044 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2045 AioContext *aio_context = bdrv_get_aio_context(bs);
2046
2047 aio_context_acquire(aio_context);
2048 if (bs->job) {
2049 block_job_pause(bs->job);
2050 }
2051 aio_context_release(aio_context);
2052 }
2053
88266f5a 2054 while (busy) {
9b536adc
SH
2055 busy = false;
2056
dc364f4c 2057 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
9b536adc 2058 AioContext *aio_context = bdrv_get_aio_context(bs);
9b536adc
SH
2059
2060 aio_context_acquire(aio_context);
5b98db0a 2061 busy |= bdrv_drain_one(bs);
9b536adc 2062 aio_context_release(aio_context);
9b536adc 2063 }
922453bc 2064 }
69da3b0b
FZ
2065
2066 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2067 AioContext *aio_context = bdrv_get_aio_context(bs);
2068
2069 aio_context_acquire(aio_context);
2070 if (bs->job) {
2071 block_job_resume(bs->job);
2072 }
2073 aio_context_release(aio_context);
2074 }
922453bc
SH
2075}
2076
dc364f4c
BC
2077/* make a BlockDriverState anonymous by removing it from the bdrv_states and
2078 * graph_bdrv_states lists.
d22b2f41
RH
2079 Also, NULL terminate the device_name to prevent double remove */
2080void bdrv_make_anon(BlockDriverState *bs)
2081{
bfb197e0
MA
2082 /*
2083 * Take care to remove bs from bdrv_states only when it's actually
2084 * in it. Note that bs->device_list.tqe_prev is initially null,
2085 * and gets set to non-null by QTAILQ_INSERT_TAIL(). Establish
2086 * the useful invariant "bs in bdrv_states iff bs->tqe_prev" by
2087 * resetting it to null on remove.
2088 */
2089 if (bs->device_list.tqe_prev) {
dc364f4c 2090 QTAILQ_REMOVE(&bdrv_states, bs, device_list);
bfb197e0 2091 bs->device_list.tqe_prev = NULL;
d22b2f41 2092 }
dc364f4c
BC
2093 if (bs->node_name[0] != '\0') {
2094 QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
2095 }
2096 bs->node_name[0] = '\0';
d22b2f41
RH
2097}
2098
e023b2e2
PB
2099static void bdrv_rebind(BlockDriverState *bs)
2100{
2101 if (bs->drv && bs->drv->bdrv_rebind) {
2102 bs->drv->bdrv_rebind(bs);
2103 }
2104}
2105
4ddc07ca
PB
2106static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
2107 BlockDriverState *bs_src)
8802d1fd 2108{
4ddc07ca 2109 /* move some fields that need to stay attached to the device */
8802d1fd
JC
2110
2111 /* dev info */
1b7fd729 2112 bs_dest->guest_block_size = bs_src->guest_block_size;
4ddc07ca 2113 bs_dest->copy_on_read = bs_src->copy_on_read;
8802d1fd 2114
4ddc07ca 2115 bs_dest->enable_write_cache = bs_src->enable_write_cache;
c4a248a1 2116
cc0681c4
BC
2117 /* i/o throttled req */
2118 memcpy(&bs_dest->throttle_state,
2119 &bs_src->throttle_state,
2120 sizeof(ThrottleState));
2121 bs_dest->throttled_reqs[0] = bs_src->throttled_reqs[0];
2122 bs_dest->throttled_reqs[1] = bs_src->throttled_reqs[1];
4ddc07ca 2123 bs_dest->io_limits_enabled = bs_src->io_limits_enabled;
8802d1fd 2124
8802d1fd 2125 /* r/w error */
4ddc07ca
PB
2126 bs_dest->on_read_error = bs_src->on_read_error;
2127 bs_dest->on_write_error = bs_src->on_write_error;
8802d1fd
JC
2128
2129 /* i/o status */
4ddc07ca
PB
2130 bs_dest->iostatus_enabled = bs_src->iostatus_enabled;
2131 bs_dest->iostatus = bs_src->iostatus;
8802d1fd 2132
a9fc4408 2133 /* dirty bitmap */
e4654d2d 2134 bs_dest->dirty_bitmaps = bs_src->dirty_bitmaps;
a9fc4408 2135
9fcb0251
FZ
2136 /* reference count */
2137 bs_dest->refcnt = bs_src->refcnt;
2138
a9fc4408 2139 /* job */
4ddc07ca 2140 bs_dest->job = bs_src->job;
a9fc4408 2141
8802d1fd 2142 /* keep the same entry in bdrv_states */
dc364f4c 2143 bs_dest->device_list = bs_src->device_list;
7e7d56d9
MA
2144 bs_dest->blk = bs_src->blk;
2145
fbe40ff7
FZ
2146 memcpy(bs_dest->op_blockers, bs_src->op_blockers,
2147 sizeof(bs_dest->op_blockers));
4ddc07ca 2148}
8802d1fd 2149
4ddc07ca
PB
2150/*
2151 * Swap bs contents for two image chains while they are live,
2152 * while keeping required fields on the BlockDriverState that is
2153 * actually attached to a device.
2154 *
2155 * This will modify the BlockDriverState fields, and swap contents
2156 * between bs_new and bs_old. Both bs_new and bs_old are modified.
2157 *
bfb197e0 2158 * bs_new must not be attached to a BlockBackend.
4ddc07ca
PB
2159 *
2160 * This function does not create any image files.
2161 */
2162void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
2163{
2164 BlockDriverState tmp;
f6801b83 2165
90ce8a06
BC
2166 /* The code needs to swap the node_name but simply swapping node_list won't
2167 * work, so first remove the nodes from the graph list, do the swap, then
2168 * insert them back if needed.
2169 */
2170 if (bs_new->node_name[0] != '\0') {
2171 QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
2172 }
2173 if (bs_old->node_name[0] != '\0') {
2174 QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
2175 }
2176
bfb197e0 2177 /* bs_new must be unattached and shouldn't have anything fancy enabled */
7e7d56d9 2178 assert(!bs_new->blk);
e4654d2d 2179 assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
4ddc07ca 2180 assert(bs_new->job == NULL);
4ddc07ca 2181 assert(bs_new->io_limits_enabled == false);
cc0681c4 2182 assert(!throttle_have_timer(&bs_new->throttle_state));
8802d1fd 2183
4ddc07ca
PB
2184 tmp = *bs_new;
2185 *bs_new = *bs_old;
2186 *bs_old = tmp;
a9fc4408 2187
4ddc07ca
PB
2188 /* there are some fields that should not be swapped, move them back */
2189 bdrv_move_feature_fields(&tmp, bs_old);
2190 bdrv_move_feature_fields(bs_old, bs_new);
2191 bdrv_move_feature_fields(bs_new, &tmp);
8802d1fd 2192
bfb197e0 2193 /* bs_new must remain unattached */
7e7d56d9 2194 assert(!bs_new->blk);
4ddc07ca
PB
2195
2196 /* Check a few fields that should remain attached to the device */
4ddc07ca 2197 assert(bs_new->job == NULL);
4ddc07ca 2198 assert(bs_new->io_limits_enabled == false);
cc0681c4 2199 assert(!throttle_have_timer(&bs_new->throttle_state));
e023b2e2 2200
90ce8a06
BC
2201 /* insert the nodes back into the graph node list if needed */
2202 if (bs_new->node_name[0] != '\0') {
2203 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
2204 }
2205 if (bs_old->node_name[0] != '\0') {
2206 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
2207 }
2208
e023b2e2 2209 bdrv_rebind(bs_new);
4ddc07ca
PB
2210 bdrv_rebind(bs_old);
2211}
2212
2213/*
2214 * Add new bs contents at the top of an image chain while the chain is
2215 * live, while keeping required fields on the top layer.
2216 *
2217 * This will modify the BlockDriverState fields, and swap contents
2218 * between bs_new and bs_top. Both bs_new and bs_top are modified.
2219 *
bfb197e0 2220 * bs_new must not be attached to a BlockBackend.
4ddc07ca
PB
2221 *
2222 * This function does not create any image files.
2223 */
2224void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
2225{
2226 bdrv_swap(bs_new, bs_top);
2227
2228 /* After the swap, bs_new holds the old bs_top contents; install it
2229 * as the backing file of the new top. */
8d24cce1 2230 bdrv_set_backing_hd(bs_top, bs_new);
8802d1fd
JC
2231}
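/*
 * Illustrative sketch (editor's addition): bdrv_append() is the primitive
 * behind installing a live snapshot overlay. 'overlay' is assumed to be a
 * freshly opened, unattached BDS created with 'active' as its backing file.
 */
static void example_install_overlay(BlockDriverState *active,
                                    BlockDriverState *overlay)
{
    /* After this call the pointer 'active' refers to the overlay's
     * contents, and the old active contents sit underneath it as the
     * backing file. The device-bound fields stay with 'active', as
     * bdrv_move_feature_fields() arranges. */
    bdrv_append(overlay, active);
}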
2232
4f6fd349 2233static void bdrv_delete(BlockDriverState *bs)
b338082b 2234{
3e914655 2235 assert(!bs->job);
3718d8ab 2236 assert(bdrv_op_blocker_is_empty(bs));
4f6fd349 2237 assert(!bs->refcnt);
e4654d2d 2238 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
18846dee 2239
e1b5c52e
SH
2240 bdrv_close(bs);
2241
1b7bdbc1 2242 /* remove from list, if necessary */
d22b2f41 2243 bdrv_make_anon(bs);
34c6f050 2244
7267c094 2245 g_free(bs);
fc01f7e7
FB
2246}
2247
e97fc193
AL
2248/*
2249 * Run consistency checks on an image
2250 *
e076f338 2251 * Returns 0 if the check could be completed (it doesn't mean that the image is
a1c7273b 2252 * free of errors) or -errno when an internal error occurred. The results of the
e076f338 2253 * check are stored in res.
e97fc193 2254 */
4534ff54 2255int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
e97fc193 2256{
908bcd54
HR
2257 if (bs->drv == NULL) {
2258 return -ENOMEDIUM;
2259 }
e97fc193
AL
2260 if (bs->drv->bdrv_check == NULL) {
2261 return -ENOTSUP;
2262 }
2263
e076f338 2264 memset(res, 0, sizeof(*res));
4534ff54 2265 return bs->drv->bdrv_check(bs, res, fix);
e97fc193
AL
2266}
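/*
 * Illustrative sketch (editor's addition): bdrv_check() zeroes 'res' itself,
 * so callers only inspect the result counters afterwards. BDRV_FIX_LEAKS and
 * the 'corruptions' field follow the BdrvCheckMode/BdrvCheckResult
 * definitions in block.h; treat their use here as an assumption.
 */
static int example_check_image(BlockDriverState *bs)
{
    BdrvCheckResult res;
    int ret = bdrv_check(bs, &res, BDRV_FIX_LEAKS); /* repair leaks only */

    if (ret < 0) {
        return ret; /* -ENOMEDIUM, -ENOTSUP, or an internal error */
    }
    return res.corruptions ? -EIO : 0;
}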
2267
8a426614
KW
2268#define COMMIT_BUF_SECTORS 2048
2269
33e3963e
FB
2270/* commit COW file into the raw image */
2271int bdrv_commit(BlockDriverState *bs)
2272{
19cb3738 2273 BlockDriver *drv = bs->drv;
72706ea4 2274 int64_t sector, total_sectors, length, backing_length;
8a426614 2275 int n, ro, open_flags;
0bce597d 2276 int ret = 0;
72706ea4 2277 uint8_t *buf = NULL;
33e3963e 2278
19cb3738
FB
2279 if (!drv)
2280 return -ENOMEDIUM;
6bb45158 2281
4dca4b63
NS
2282 if (!bs->backing_hd) {
2283 return -ENOTSUP;
33e3963e
FB
2284 }
2285
bb00021d
FZ
2286 if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT_SOURCE, NULL) ||
2287 bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT_TARGET, NULL)) {
2d3735d3
SH
2288 return -EBUSY;
2289 }
2290
4dca4b63 2291 ro = bs->backing_hd->read_only;
4dca4b63
NS
2292 open_flags = bs->backing_hd->open_flags;
2293
2294 if (ro) {
0bce597d
JC
2295 if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2296 return -EACCES;
4dca4b63 2297 }
ea2384d3 2298 }
33e3963e 2299
72706ea4
JC
2300 length = bdrv_getlength(bs);
2301 if (length < 0) {
2302 ret = length;
2303 goto ro_cleanup;
2304 }
2305
2306 backing_length = bdrv_getlength(bs->backing_hd);
2307 if (backing_length < 0) {
2308 ret = backing_length;
2309 goto ro_cleanup;
2310 }
2311
2312 /* If our top snapshot is larger than the backing file image,
2313 * grow the backing file image if possible. If not possible,
2314 * we must return an error */
2315 if (length > backing_length) {
2316 ret = bdrv_truncate(bs->backing_hd, length);
2317 if (ret < 0) {
2318 goto ro_cleanup;
2319 }
2320 }
2321
2322 total_sectors = length >> BDRV_SECTOR_BITS;
857d4f46
KW
2323
2324 /* qemu_try_blockalign() for bs will choose an alignment that works for
2325 * bs->backing_hd as well, so no need to compare the alignment manually. */
2326 buf = qemu_try_blockalign(bs, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2327 if (buf == NULL) {
2328 ret = -ENOMEM;
2329 goto ro_cleanup;
2330 }
8a426614
KW
2331
2332 for (sector = 0; sector < total_sectors; sector += n) {
d663640c
PB
2333 ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2334 if (ret < 0) {
2335 goto ro_cleanup;
2336 }
2337 if (ret) {
dabfa6cc
KW
2338 ret = bdrv_read(bs, sector, buf, n);
2339 if (ret < 0) {
8a426614
KW
2340 goto ro_cleanup;
2341 }
2342
dabfa6cc
KW
2343 ret = bdrv_write(bs->backing_hd, sector, buf, n);
2344 if (ret < 0) {
8a426614
KW
2345 goto ro_cleanup;
2346 }
ea2384d3 2347 }
33e3963e 2348 }
95389c86 2349
1d44952f
CH
2350 if (drv->bdrv_make_empty) {
2351 ret = drv->bdrv_make_empty(bs);
dabfa6cc
KW
2352 if (ret < 0) {
2353 goto ro_cleanup;
2354 }
1d44952f
CH
2355 bdrv_flush(bs);
2356 }
95389c86 2357
3f5075ae
CH
2358 /*
2359 * Make sure all data we wrote to the backing device is actually
2360 * stable on disk.
2361 */
dabfa6cc 2362 if (bs->backing_hd) {
3f5075ae 2363 bdrv_flush(bs->backing_hd);
dabfa6cc 2364 }
4dca4b63 2365
dabfa6cc 2366 ret = 0;
4dca4b63 2367ro_cleanup:
857d4f46 2368 qemu_vfree(buf);
4dca4b63
NS
2369
2370 if (ro) {
0bce597d
JC
2371 /* ignoring error return here */
2372 bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
4dca4b63
NS
2373 }
2374
1d44952f 2375 return ret;
33e3963e
FB
2376}
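/*
 * Illustrative sketch (editor's addition): committing an overlay merges its
 * allocated sectors into the backing file and then empties the overlay, so
 * subsequent reads fall through. The op blocker checks at the top of
 * bdrv_commit() ensure no block job owns the chain meanwhile.
 */
static int example_commit_overlay(BlockDriverState *overlay)
{
    int ret = bdrv_commit(overlay);

    if (ret < 0) {
        error_report("commit failed: %s", strerror(-ret));
    }
    return ret;
}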
2377
e8877497 2378int bdrv_commit_all(void)
6ab4b5ab
MA
2379{
2380 BlockDriverState *bs;
2381
dc364f4c 2382 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
ed78cda3
SH
2383 AioContext *aio_context = bdrv_get_aio_context(bs);
2384
2385 aio_context_acquire(aio_context);
272d2d8e
JC
2386 if (bs->drv && bs->backing_hd) {
2387 int ret = bdrv_commit(bs);
2388 if (ret < 0) {
ed78cda3 2389 aio_context_release(aio_context);
272d2d8e
JC
2390 return ret;
2391 }
e8877497 2392 }
ed78cda3 2393 aio_context_release(aio_context);
6ab4b5ab 2394 }
e8877497 2395 return 0;
6ab4b5ab
MA
2396}
2397
dbffbdcf
SH
2398/**
2399 * Remove an active request from the tracked requests list
2400 *
2401 * This function should be called when a tracked request is completing.
2402 */
2403static void tracked_request_end(BdrvTrackedRequest *req)
2404{
2dbafdc0
KW
2405 if (req->serialising) {
2406 req->bs->serialising_in_flight--;
2407 }
2408
dbffbdcf 2409 QLIST_REMOVE(req, list);
f4658285 2410 qemu_co_queue_restart_all(&req->wait_queue);
dbffbdcf
SH
2411}
2412
2413/**
2414 * Add an active request to the tracked requests list
2415 */
2416static void tracked_request_begin(BdrvTrackedRequest *req,
2417 BlockDriverState *bs,
793ed47a
KW
2418 int64_t offset,
2419 unsigned int bytes, bool is_write)
dbffbdcf
SH
2420{
2421 *req = (BdrvTrackedRequest){
2422 .bs = bs,
2dbafdc0
KW
2423 .offset = offset,
2424 .bytes = bytes,
2425 .is_write = is_write,
2426 .co = qemu_coroutine_self(),
2427 .serialising = false,
7327145f
KW
2428 .overlap_offset = offset,
2429 .overlap_bytes = bytes,
dbffbdcf
SH
2430 };
2431
f4658285
SH
2432 qemu_co_queue_init(&req->wait_queue);
2433
dbffbdcf
SH
2434 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2435}
2436
e96126ff 2437static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2dbafdc0 2438{
7327145f 2439 int64_t overlap_offset = req->offset & ~(align - 1);
e96126ff
KW
2440 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2441 - overlap_offset;
7327145f 2442
2dbafdc0
KW
2443 if (!req->serialising) {
2444 req->bs->serialising_in_flight++;
2445 req->serialising = true;
2446 }
7327145f
KW
2447
2448 req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2449 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
2dbafdc0
KW
2450}
2451
d83947ac
SH
2452/**
2453 * Round a region to cluster boundaries
2454 */
343bded4
PB
2455void bdrv_round_to_clusters(BlockDriverState *bs,
2456 int64_t sector_num, int nb_sectors,
2457 int64_t *cluster_sector_num,
2458 int *cluster_nb_sectors)
d83947ac
SH
2459{
2460 BlockDriverInfo bdi;
2461
2462 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2463 *cluster_sector_num = sector_num;
2464 *cluster_nb_sectors = nb_sectors;
2465 } else {
2466 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2467 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2468 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2469 nb_sectors, c);
2470 }
2471}
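/*
 * Illustrative worked example (editor's addition): with a 64 KiB cluster
 * size, c = 65536 / 512 = 128 sectors, so a request for sectors [100, 200)
 * widens to [0, 256): QEMU_ALIGN_DOWN(100, 128) == 0 and
 * QEMU_ALIGN_UP(100 - 0 + 100, 128) == 256.
 */
static void example_round_request(BlockDriverState *bs)
{
    int64_t cluster_sector_num;
    int cluster_nb_sectors;

    bdrv_round_to_clusters(bs, 100, 100,
                           &cluster_sector_num, &cluster_nb_sectors);
    /* For the 64 KiB case above: cluster_sector_num == 0,
     * cluster_nb_sectors == 256. */
}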
2472
7327145f 2473static int bdrv_get_cluster_size(BlockDriverState *bs)
793ed47a
KW
2474{
2475 BlockDriverInfo bdi;
7327145f 2476 int ret;
793ed47a 2477
7327145f
KW
2478 ret = bdrv_get_info(bs, &bdi);
2479 if (ret < 0 || bdi.cluster_size == 0) {
2480 return bs->request_alignment;
793ed47a 2481 } else {
7327145f 2482 return bdi.cluster_size;
793ed47a
KW
2483 }
2484}
2485
f4658285 2486static bool tracked_request_overlaps(BdrvTrackedRequest *req,
793ed47a
KW
2487 int64_t offset, unsigned int bytes)
2488{
d83947ac 2489 /* aaaa bbbb */
7327145f 2490 if (offset >= req->overlap_offset + req->overlap_bytes) {
d83947ac
SH
2491 return false;
2492 }
2493 /* bbbb aaaa */
7327145f 2494 if (req->overlap_offset >= offset + bytes) {
d83947ac
SH
2495 return false;
2496 }
2497 return true;
f4658285
SH
2498}
2499
28de2dcd 2500static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
f4658285 2501{
2dbafdc0 2502 BlockDriverState *bs = self->bs;
f4658285
SH
2503 BdrvTrackedRequest *req;
2504 bool retry;
28de2dcd 2505 bool waited = false;
f4658285 2506
2dbafdc0 2507 if (!bs->serialising_in_flight) {
28de2dcd 2508 return false;
2dbafdc0
KW
2509 }
2510
f4658285
SH
2511 do {
2512 retry = false;
2513 QLIST_FOREACH(req, &bs->tracked_requests, list) {
2dbafdc0 2514 if (req == self || (!req->serialising && !self->serialising)) {
65afd211
KW
2515 continue;
2516 }
7327145f
KW
2517 if (tracked_request_overlaps(req, self->overlap_offset,
2518 self->overlap_bytes))
2519 {
5f8b6491
SH
2520 /* Hitting this means there was a reentrant request, for
2521 * example, a block driver issuing nested requests. This must
2522 * never happen since it means deadlock.
2523 */
2524 assert(qemu_coroutine_self() != req->co);
2525
6460440f
KW
2526 /* If the request is already (indirectly) waiting for us, or
2527 * will wait for us as soon as it wakes up, then just go on
2528 * (instead of producing a deadlock in the former case). */
2529 if (!req->waiting_for) {
2530 self->waiting_for = req;
2531 qemu_co_queue_wait(&req->wait_queue);
2532 self->waiting_for = NULL;
2533 retry = true;
28de2dcd 2534 waited = true;
6460440f
KW
2535 break;
2536 }
f4658285
SH
2537 }
2538 }
2539 } while (retry);
28de2dcd
KW
2540
2541 return waited;
f4658285
SH
2542}
2543
756e6736
KW
2544/*
2545 * Return values:
2546 * 0 - success
2547 * -EINVAL - backing format specified, but no file
2548 * -ENOSPC - can't update the backing file because no space is left in the
2549 * image file header
2550 * -ENOTSUP - format driver doesn't support changing the backing file
2551 */
2552int bdrv_change_backing_file(BlockDriverState *bs,
2553 const char *backing_file, const char *backing_fmt)
2554{
2555 BlockDriver *drv = bs->drv;
469ef350 2556 int ret;
756e6736 2557
5f377794
PB
2558 /* Backing file format doesn't make sense without a backing file */
2559 if (backing_fmt && !backing_file) {
2560 return -EINVAL;
2561 }
2562
756e6736 2563 if (drv->bdrv_change_backing_file != NULL) {
469ef350 2564 ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
756e6736 2565 } else {
469ef350 2566 ret = -ENOTSUP;
756e6736 2567 }
469ef350
PB
2568
2569 if (ret == 0) {
2570 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2571 pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2572 }
2573 return ret;
756e6736
KW
2574}
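/*
 * Illustrative sketch (editor's addition): callers distinguish "the image
 * header cannot hold the new string" (-ENOSPC) from "the format cannot do
 * this at all" (-ENOTSUP), matching the return codes documented above.
 * The "qcow2" backing format is just an example value.
 */
static int example_retarget_backing(BlockDriverState *bs, const char *file)
{
    int ret = bdrv_change_backing_file(bs, file, "qcow2");

    if (ret == -ENOSPC) {
        error_report("backing file name does not fit in the image header");
    } else if (ret == -ENOTSUP) {
        error_report("format driver cannot change the backing file");
    }
    return ret;
}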
2575
6ebdcee2
JC
2576/*
2577 * Finds the image layer in the chain that has 'bs' as its backing file.
2578 *
2579 * active is the current topmost image.
2580 *
2581 * Returns NULL if bs is not found in active's image chain,
2582 * or if active == bs.
4caf0fcd
JC
2583 *
2584 * Returns the bottommost base image if bs == NULL.
6ebdcee2
JC
2585 */
2586BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2587 BlockDriverState *bs)
2588{
4caf0fcd
JC
2589 while (active && bs != active->backing_hd) {
2590 active = active->backing_hd;
6ebdcee2
JC
2591 }
2592
4caf0fcd
JC
2593 return active;
2594}
6ebdcee2 2595
4caf0fcd
JC
2596/* Given a BDS, searches for the base layer. */
2597BlockDriverState *bdrv_find_base(BlockDriverState *bs)
2598{
2599 return bdrv_find_overlay(bs, NULL);
6ebdcee2
JC
2600}
2601
2602typedef struct BlkIntermediateStates {
2603 BlockDriverState *bs;
2604 QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2605} BlkIntermediateStates;
2606
2607
2608/*
2609 * Drops images above 'base' up to and including 'top', and sets the image
2610 * above 'top' to have base as its backing file.
2611 *
2612 * Requires that the overlay to 'top' is opened r/w, so that the backing file
2613 * information in 'bs' can be properly updated.
2614 *
2615 * E.g., this will convert the following chain:
2616 * bottom <- base <- intermediate <- top <- active
2617 *
2618 * to
2619 *
2620 * bottom <- base <- active
2621 *
2622 * It is allowed for bottom==base, in which case it converts:
2623 *
2624 * base <- intermediate <- top <- active
2625 *
2626 * to
2627 *
2628 * base <- active
2629 *
54e26900
JC
2630 * If backing_file_str is non-NULL, it will be used when modifying top's
2631 * overlay image metadata.
2632 *
6ebdcee2
JC
2633 * Error conditions:
2634 * if active == top, that is considered an error
2635 *
2636 */
2637int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
54e26900 2638 BlockDriverState *base, const char *backing_file_str)
6ebdcee2
JC
2639{
2640 BlockDriverState *intermediate;
2641 BlockDriverState *base_bs = NULL;
2642 BlockDriverState *new_top_bs = NULL;
2643 BlkIntermediateStates *intermediate_state, *next;
2644 int ret = -EIO;
2645
2646 QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2647 QSIMPLEQ_INIT(&states_to_delete);
2648
2649 if (!top->drv || !base->drv) {
2650 goto exit;
2651 }
2652
2653 new_top_bs = bdrv_find_overlay(active, top);
2654
2655 if (new_top_bs == NULL) {
2656 /* we could not find the image above 'top', this is an error */
2657 goto exit;
2658 }
2659
2660 /* special case of new_top_bs->backing_hd already pointing to base - nothing
2661 * to do, no intermediate images */
2662 if (new_top_bs->backing_hd == base) {
2663 ret = 0;
2664 goto exit;
2665 }
2666
2667 intermediate = top;
2668
2669 /* now we will go down through the list, and add each BDS we find
2670 * into our deletion queue, until we hit the 'base'
2671 */
2672 while (intermediate) {
5839e53b 2673 intermediate_state = g_new0(BlkIntermediateStates, 1);
6ebdcee2
JC
2674 intermediate_state->bs = intermediate;
2675 QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2676
2677 if (intermediate->backing_hd == base) {
2678 base_bs = intermediate->backing_hd;
2679 break;
2680 }
2681 intermediate = intermediate->backing_hd;
2682 }
2683 if (base_bs == NULL) {
2684 /* something went wrong, we did not end at the base. safely
2685 * unravel everything, and exit with error */
2686 goto exit;
2687 }
2688
2689 /* success - we can delete the intermediate states, and link top->base */
54e26900
JC
2690 backing_file_str = backing_file_str ? backing_file_str : base_bs->filename;
2691 ret = bdrv_change_backing_file(new_top_bs, backing_file_str,
6ebdcee2
JC
2692 base_bs->drv ? base_bs->drv->format_name : "");
2693 if (ret) {
2694 goto exit;
2695 }
920beae1 2696 bdrv_set_backing_hd(new_top_bs, base_bs);
6ebdcee2
JC
2697
2698 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2699 /* so that bdrv_close() does not recursively close the chain */
920beae1 2700 bdrv_set_backing_hd(intermediate_state->bs, NULL);
4f6fd349 2701 bdrv_unref(intermediate_state->bs);
6ebdcee2
JC
2702 }
2703 ret = 0;
2704
2705exit:
2706 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2707 g_free(intermediate_state);
2708 }
2709 return ret;
2710}
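/*
 * Illustrative sketch (editor's addition): collapsing the example chain from
 * the comment above. 'active', 'top' and 'base' are assumed to belong to one
 * backing chain; passing NULL as backing_file_str keeps base's own filename
 * in the overlay metadata.
 */
static int example_collapse_chain(BlockDriverState *active,
                                  BlockDriverState *top,
                                  BlockDriverState *base)
{
    /* bottom <- base <- intermediate <- top <- active
     * becomes: bottom <- base <- active */
    return bdrv_drop_intermediate(active, top, base, NULL);
}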
2711
2712
71d0770c
AL
2713static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2714 size_t size)
2715{
75af1f34 2716 if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
1dd3a447
KW
2717 return -EIO;
2718 }
2719
c0191e76 2720 if (!bdrv_is_inserted(bs)) {
71d0770c 2721 return -ENOMEDIUM;
c0191e76 2722 }
71d0770c 2723
c0191e76 2724 if (offset < 0) {
71d0770c 2725 return -EIO;
c0191e76 2726 }
71d0770c
AL
2727
2728 return 0;
2729}
2730
2731static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2732 int nb_sectors)
2733{
75af1f34 2734 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
8f4754ed
KW
2735 return -EIO;
2736 }
2737
eb5a3165
JS
2738 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2739 nb_sectors * BDRV_SECTOR_SIZE);
71d0770c
AL
2740}
2741
1c9805a3
SH
2742typedef struct RwCo {
2743 BlockDriverState *bs;
775aa8b6 2744 int64_t offset;
1c9805a3
SH
2745 QEMUIOVector *qiov;
2746 bool is_write;
2747 int ret;
4105eaaa 2748 BdrvRequestFlags flags;
1c9805a3
SH
2749} RwCo;
2750
2751static void coroutine_fn bdrv_rw_co_entry(void *opaque)
fc01f7e7 2752{
1c9805a3 2753 RwCo *rwco = opaque;
ea2384d3 2754
1c9805a3 2755 if (!rwco->is_write) {
775aa8b6
KW
2756 rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2757 rwco->qiov->size, rwco->qiov,
4105eaaa 2758 rwco->flags);
775aa8b6
KW
2759 } else {
2760 rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2761 rwco->qiov->size, rwco->qiov,
2762 rwco->flags);
1c9805a3
SH
2763 }
2764}
e7a8a783 2765
1c9805a3 2766/*
8d3b1a2d 2767 * Process a vectored synchronous request using coroutines
1c9805a3 2768 */
775aa8b6
KW
2769static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2770 QEMUIOVector *qiov, bool is_write,
2771 BdrvRequestFlags flags)
1c9805a3 2772{
1c9805a3
SH
2773 Coroutine *co;
2774 RwCo rwco = {
2775 .bs = bs,
775aa8b6 2776 .offset = offset,
8d3b1a2d 2777 .qiov = qiov,
1c9805a3
SH
2778 .is_write = is_write,
2779 .ret = NOT_DONE,
4105eaaa 2780 .flags = flags,
1c9805a3 2781 };
e7a8a783 2782
498e386c
ZYW
2783 /**
2784 * In sync call context, when the vcpu is blocked, this throttling timer
2785 * will not fire; so the I/O throttling function has to be disabled here
2786 * if it has been enabled.
2787 */
2788 if (bs->io_limits_enabled) {
2789 fprintf(stderr, "Disabling I/O throttling on '%s' due "
2790 "to synchronous I/O.\n", bdrv_get_device_name(bs));
2791 bdrv_io_limits_disable(bs);
2792 }
2793
1c9805a3
SH
2794 if (qemu_in_coroutine()) {
2795 /* Fast-path if already in coroutine context */
2796 bdrv_rw_co_entry(&rwco);
2797 } else {
2572b37a
SH
2798 AioContext *aio_context = bdrv_get_aio_context(bs);
2799
1c9805a3
SH
2800 co = qemu_coroutine_create(bdrv_rw_co_entry);
2801 qemu_coroutine_enter(co, &rwco);
2802 while (rwco.ret == NOT_DONE) {
2572b37a 2803 aio_poll(aio_context, true);
1c9805a3
SH
2804 }
2805 }
2806 return rwco.ret;
2807}
b338082b 2808
8d3b1a2d
KW
2809/*
2810 * Process a synchronous request using coroutines
2811 */
2812static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
4105eaaa 2813 int nb_sectors, bool is_write, BdrvRequestFlags flags)
8d3b1a2d
KW
2814{
2815 QEMUIOVector qiov;
2816 struct iovec iov = {
2817 .iov_base = (void *)buf,
2818 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2819 };
2820
75af1f34 2821 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
da15ee51
KW
2822 return -EINVAL;
2823 }
2824
8d3b1a2d 2825 qemu_iovec_init_external(&qiov, &iov, 1);
775aa8b6
KW
2826 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2827 &qiov, is_write, flags);
8d3b1a2d
KW
2828}
2829
1c9805a3
SH
2830/* return < 0 if error. See bdrv_write() for the return codes */
2831int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2832 uint8_t *buf, int nb_sectors)
2833{
4105eaaa 2834 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
fc01f7e7
FB
2835}
2836
07d27a44
MA
2837/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2838int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2839 uint8_t *buf, int nb_sectors)
2840{
2841 bool enabled;
2842 int ret;
2843
2844 enabled = bs->io_limits_enabled;
2845 bs->io_limits_enabled = false;
4e7395e8 2846 ret = bdrv_read(bs, sector_num, buf, nb_sectors);
07d27a44
MA
2847 bs->io_limits_enabled = enabled;
2848 return ret;
2849}
2850
5fafdf24 2851/* Return < 0 if error. Important errors are:
19cb3738
FB
2852 -EIO generic I/O error (may happen for all errors)
2853 -ENOMEDIUM No media inserted.
2854 -EINVAL Invalid sector number or nb_sectors
2855 -EACCES Trying to write a read-only device
2856*/
5fafdf24 2857int bdrv_write(BlockDriverState *bs, int64_t sector_num,
fc01f7e7
FB
2858 const uint8_t *buf, int nb_sectors)
2859{
4105eaaa 2860 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
83f64091
FB
2861}
2862
aa7bfbff
PL
2863int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2864 int nb_sectors, BdrvRequestFlags flags)
4105eaaa
PL
2865{
2866 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
aa7bfbff 2867 BDRV_REQ_ZERO_WRITE | flags);
8d3b1a2d
KW
2868}
2869
d75cbb5e
PL
2870/*
2871 * Completely zero out a block device with the help of bdrv_write_zeroes.
2872 * The operation is sped up by checking the block status and only writing
2873 * zeroes to ranges that do not already read back as zeroes. Optional
2874 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2875 *
2876 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2877 */
2878int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2879{
d32f7c10 2880 int64_t target_sectors, ret, nb_sectors, sector_num = 0;
d75cbb5e
PL
2881 int n;
2882
d32f7c10
MA
2883 target_sectors = bdrv_nb_sectors(bs);
2884 if (target_sectors < 0) {
2885 return target_sectors;
9ce10c0b 2886 }
9ce10c0b 2887
d75cbb5e 2888 for (;;) {
75af1f34 2889 nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS);
d75cbb5e
PL
2890 if (nb_sectors <= 0) {
2891 return 0;
2892 }
d75cbb5e 2893 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
3d94ce60
PL
2894 if (ret < 0) {
2895 error_report("error getting block status at sector %" PRId64 ": %s",
2896 sector_num, strerror(-ret));
2897 return ret;
2898 }
d75cbb5e
PL
2899 if (ret & BDRV_BLOCK_ZERO) {
2900 sector_num += n;
2901 continue;
2902 }
2903 ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2904 if (ret < 0) {
2905 error_report("error writing zeroes at sector %" PRId64 ": %s",
2906 sector_num, strerror(-ret));
2907 return ret;
2908 }
2909 sector_num += n;
2910 }
2911}
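/*
 * Illustrative sketch (editor's addition): zeroing a whole device while
 * letting the driver unmap/discard where possible, using the
 * BDRV_REQ_MAY_UNMAP flag mentioned in the comment above.
 */
static int example_wipe_device(BlockDriverState *bs)
{
    return bdrv_make_zero(bs, BDRV_REQ_MAY_UNMAP);
}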
2912
a3ef6571 2913int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
83f64091 2914{
a3ef6571
KW
2915 QEMUIOVector qiov;
2916 struct iovec iov = {
2917 .iov_base = (void *)buf,
2918 .iov_len = bytes,
2919 };
9a8c4cce 2920 int ret;
83f64091 2921
a3ef6571
KW
2922 if (bytes < 0) {
2923 return -EINVAL;
83f64091
FB
2924 }
2925
a3ef6571
KW
2926 qemu_iovec_init_external(&qiov, &iov, 1);
2927 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2928 if (ret < 0) {
2929 return ret;
83f64091 2930 }
a3ef6571
KW
2931
2932 return bytes;
83f64091
FB
2933}
2934
8d3b1a2d 2935int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
83f64091 2936{
9a8c4cce 2937 int ret;
83f64091 2938
8407d5d7
KW
2939 ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2940 if (ret < 0) {
2941 return ret;
83f64091
FB
2942 }
2943
8d3b1a2d
KW
2944 return qiov->size;
2945}
2946
2947int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
8407d5d7 2948 const void *buf, int bytes)
8d3b1a2d
KW
2949{
2950 QEMUIOVector qiov;
2951 struct iovec iov = {
2952 .iov_base = (void *) buf,
8407d5d7 2953 .iov_len = bytes,
8d3b1a2d
KW
2954 };
2955
8407d5d7
KW
2956 if (bytes < 0) {
2957 return -EINVAL;
2958 }
2959
8d3b1a2d
KW
2960 qemu_iovec_init_external(&qiov, &iov, 1);
2961 return bdrv_pwritev(bs, offset, &qiov);
83f64091 2962}
83f64091 2963
f08145fe
KW
2964/*
2965 * Writes to the file and ensures that no writes are reordered across this
2966 * request (acts as a barrier)
2967 *
2968 * Returns 0 on success, -errno in error cases.
2969 */
2970int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2971 const void *buf, int count)
2972{
2973 int ret;
2974
2975 ret = bdrv_pwrite(bs, offset, buf, count);
2976 if (ret < 0) {
2977 return ret;
2978 }
2979
f05fa4ad
PB
2980 /* No flush needed for cache modes that already do it */
2981 if (bs->enable_write_cache) {
f08145fe
KW
2982 bdrv_flush(bs);
2983 }
2984
2985 return 0;
2986}
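/*
 * Illustrative sketch (editor's addition): updates that must not be
 * reordered with later writes - say, a hypothetical big-endian header
 * field at offset 0 - go through bdrv_pwrite_sync() so the barrier
 * semantics documented above apply.
 */
static int example_update_header_field(BlockDriverState *bs, uint64_t value)
{
    uint64_t buf = cpu_to_be64(value); /* image formats store fields big-endian */

    return bdrv_pwrite_sync(bs, 0, &buf, sizeof(buf));
}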
2987
470c0504 2988static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
ab185921
SH
2989 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2990{
2991 /* Perform I/O through a temporary buffer so that users who scribble over
2992 * their read buffer while the operation is in progress do not end up
2993 * modifying the image file. This is critical for zero-copy guest I/O
2994 * where anything might happen inside guest memory.
2995 */
2996 void *bounce_buffer;
2997
79c053bd 2998 BlockDriver *drv = bs->drv;
ab185921
SH
2999 struct iovec iov;
3000 QEMUIOVector bounce_qiov;
3001 int64_t cluster_sector_num;
3002 int cluster_nb_sectors;
3003 size_t skip_bytes;
3004 int ret;
3005
3006 /* Cover the entire cluster so no additional backing file I/O is required
3007 * when allocating a cluster in the image file.
3008 */
343bded4
PB
3009 bdrv_round_to_clusters(bs, sector_num, nb_sectors,
3010 &cluster_sector_num, &cluster_nb_sectors);
ab185921 3011
470c0504
SH
3012 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
3013 cluster_sector_num, cluster_nb_sectors);
ab185921
SH
3014
3015 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
857d4f46
KW
3016 iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
3017 if (bounce_buffer == NULL) {
3018 ret = -ENOMEM;
3019 goto err;
3020 }
3021
ab185921
SH
3022 qemu_iovec_init_external(&bounce_qiov, &iov, 1);
3023
79c053bd
SH
3024 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
3025 &bounce_qiov);
ab185921
SH
3026 if (ret < 0) {
3027 goto err;
3028 }
3029
79c053bd
SH
3030 if (drv->bdrv_co_write_zeroes &&
3031 buffer_is_zero(bounce_buffer, iov.iov_len)) {
621f0589 3032 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
aa7bfbff 3033 cluster_nb_sectors, 0);
79c053bd 3034 } else {
f05fa4ad
PB
3035 /* This does not change the data on the disk, it is not necessary
3036 * to flush even in cache=writethrough mode.
3037 */
79c053bd 3038 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
ab185921 3039 &bounce_qiov);
79c053bd
SH
3040 }
3041
ab185921
SH
3042 if (ret < 0) {
3043 /* It might be okay to ignore write errors for guest requests. If this
3044 * is a deliberate copy-on-read then we don't want to ignore the error.
3045 * Simply report it in all cases.
3046 */
3047 goto err;
3048 }
3049
3050 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
03396148
MT
3051 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
3052 nb_sectors * BDRV_SECTOR_SIZE);
ab185921
SH
3053
3054err:
3055 qemu_vfree(bounce_buffer);
3056 return ret;
3057}
3058
c5fbe571 3059/*
d0c7f642
KW
3060 * Forwards an already correctly aligned request to the BlockDriver. This
3061 * handles copy on read and zeroing after EOF; any other features must be
3062 * implemented by the caller.
c5fbe571 3063 */
d0c7f642 3064static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
65afd211 3065 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
ec746e10 3066 int64_t align, QEMUIOVector *qiov, int flags)
da1fa91d
KW
3067{
3068 BlockDriver *drv = bs->drv;
dbffbdcf 3069 int ret;
da1fa91d 3070
d0c7f642
KW
3071 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3072 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
da1fa91d 3073
d0c7f642
KW
3074 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3075 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
8eb029c2 3076 assert(!qiov || bytes == qiov->size);
d0c7f642
KW
3077
3078 /* Handle Copy on Read and associated serialisation */
470c0504 3079 if (flags & BDRV_REQ_COPY_ON_READ) {
7327145f
KW
3080 /* If we touch the same cluster it counts as an overlap. This
3081 * guarantees that allocating writes will be serialized and not race
3082 * with each other for the same cluster. For example, in copy-on-read
3083 * it ensures that the CoR read and write operations are atomic and
3084 * guest writes cannot interleave between them. */
3085 mark_request_serialising(req, bdrv_get_cluster_size(bs));
470c0504
SH
3086 }
3087
2dbafdc0 3088 wait_serialising_requests(req);
f4658285 3089
470c0504 3090 if (flags & BDRV_REQ_COPY_ON_READ) {
ab185921
SH
3091 int pnum;
3092
bdad13b9 3093 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
ab185921
SH
3094 if (ret < 0) {
3095 goto out;
3096 }
3097
3098 if (!ret || pnum != nb_sectors) {
470c0504 3099 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
ab185921
SH
3100 goto out;
3101 }
3102 }
3103
d0c7f642 3104 /* Forward the request to the BlockDriver */
c0191e76 3105 if (!bs->zero_beyond_eof) {
893a8f62
MK
3106 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3107 } else {
c0191e76 3108 /* Read zeros after EOF */
4049082c 3109 int64_t total_sectors, max_nb_sectors;
893a8f62 3110
4049082c
MA
3111 total_sectors = bdrv_nb_sectors(bs);
3112 if (total_sectors < 0) {
3113 ret = total_sectors;
893a8f62
MK
3114 goto out;
3115 }
3116
5f5bcd80
KW
3117 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
3118 align >> BDRV_SECTOR_BITS);
e012b78c
PB
3119 if (nb_sectors < max_nb_sectors) {
3120 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3121 } else if (max_nb_sectors > 0) {
33f461e0 3122 QEMUIOVector local_qiov;
33f461e0
KW
3123
3124 qemu_iovec_init(&local_qiov, qiov->niov);
3125 qemu_iovec_concat(&local_qiov, qiov, 0,
e012b78c 3126 max_nb_sectors * BDRV_SECTOR_SIZE);
33f461e0 3127
e012b78c 3128 ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors,
33f461e0
KW
3129 &local_qiov);
3130
3131 qemu_iovec_destroy(&local_qiov);
893a8f62
MK
3132 } else {
3133 ret = 0;
3134 }
3135
3136 /* Reading beyond end of file is supposed to produce zeroes */
3137 if (ret == 0 && total_sectors < sector_num + nb_sectors) {
3138 uint64_t offset = MAX(0, total_sectors - sector_num);
3139 uint64_t bytes = (sector_num + nb_sectors - offset) *
3140 BDRV_SECTOR_SIZE;
3141 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
3142 }
3143 }
ab185921
SH
3144
3145out:
dbffbdcf 3146 return ret;
da1fa91d
KW
3147}
3148
fc3959e4
FZ
3149static inline uint64_t bdrv_get_align(BlockDriverState *bs)
3150{
3151 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3152 return MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3153}
3154
3155static inline bool bdrv_req_is_aligned(BlockDriverState *bs,
3156 int64_t offset, size_t bytes)
3157{
3158 int64_t align = bdrv_get_align(bs);
3159 return !(offset & (align - 1) || (bytes & (align - 1)));
3160}
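/*
 * Illustrative worked example (editor's addition): with request_alignment
 * 512 (so align == 512), an offset/bytes pair of 4096/4096 is aligned while
 * 4096/100 is not, so bdrv_co_write_zeroes() below takes the bounce-buffer
 * path for the latter.
 */
static void example_alignment_check(BlockDriverState *bs)
{
    bool aligned = bdrv_req_is_aligned(bs, 4096, 4096);  /* true */
    bool unaligned = bdrv_req_is_aligned(bs, 4096, 100); /* false: 100 % 512 */

    (void)aligned;
    (void)unaligned;
}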
3161
d0c7f642
KW
3162/*
3163 * Handle a read request in coroutine context
3164 */
1b0288ae
KW
3165static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
3166 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
d0c7f642
KW
3167 BdrvRequestFlags flags)
3168{
3169 BlockDriver *drv = bs->drv;
65afd211
KW
3170 BdrvTrackedRequest req;
3171
fc3959e4 3172 uint64_t align = bdrv_get_align(bs);
1b0288ae
KW
3173 uint8_t *head_buf = NULL;
3174 uint8_t *tail_buf = NULL;
3175 QEMUIOVector local_qiov;
3176 bool use_local_qiov = false;
d0c7f642
KW
3177 int ret;
3178
3179 if (!drv) {
3180 return -ENOMEDIUM;
3181 }
b9c64947
HR
3182
3183 ret = bdrv_check_byte_request(bs, offset, bytes);
3184 if (ret < 0) {
3185 return ret;
d0c7f642
KW
3186 }
3187
3188 if (bs->copy_on_read) {
3189 flags |= BDRV_REQ_COPY_ON_READ;
3190 }
3191
3192 /* throttling disk I/O */
3193 if (bs->io_limits_enabled) {
d5103588 3194 bdrv_io_limits_intercept(bs, bytes, false);
1b0288ae
KW
3195 }
3196
3197 /* Align read if necessary by padding qiov */
3198 if (offset & (align - 1)) {
3199 head_buf = qemu_blockalign(bs, align);
3200 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3201 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3202 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3203 use_local_qiov = true;
3204
3205 bytes += offset & (align - 1);
3206 offset = offset & ~(align - 1);
3207 }
3208
3209 if ((offset + bytes) & (align - 1)) {
3210 if (!use_local_qiov) {
3211 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3212 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3213 use_local_qiov = true;
3214 }
3215 tail_buf = qemu_blockalign(bs, align);
3216 qemu_iovec_add(&local_qiov, tail_buf,
3217 align - ((offset + bytes) & (align - 1)));
3218
3219 bytes = ROUND_UP(bytes, align);
3220 }
3221
65afd211 3222 tracked_request_begin(&req, bs, offset, bytes, false);
ec746e10 3223 ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
1b0288ae
KW
3224 use_local_qiov ? &local_qiov : qiov,
3225 flags);
65afd211 3226 tracked_request_end(&req);
1b0288ae
KW
3227
3228 if (use_local_qiov) {
3229 qemu_iovec_destroy(&local_qiov);
3230 qemu_vfree(head_buf);
3231 qemu_vfree(tail_buf);
d0c7f642
KW
3232 }
3233
d0c7f642
KW
3234 return ret;
3235}
3236
1b0288ae
KW
3237static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3238 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3239 BdrvRequestFlags flags)
3240{
75af1f34 3241 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
1b0288ae
KW
3242 return -EINVAL;
3243 }
3244
3245 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3246 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3247}
3248
c5fbe571 3249int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
da1fa91d
KW
3250 int nb_sectors, QEMUIOVector *qiov)
3251{
c5fbe571 3252 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
da1fa91d 3253
470c0504
SH
3254 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3255}
3256
3257int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3258 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3259{
3260 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3261
3262 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3263 BDRV_REQ_COPY_ON_READ);
c5fbe571
SH
3264}
3265
98764152 3266#define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768
c31cb707 3267
f08f2dda 3268static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
aa7bfbff 3269 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
f08f2dda
SH
3270{
3271 BlockDriver *drv = bs->drv;
3272 QEMUIOVector qiov;
c31cb707
PL
3273 struct iovec iov = {0};
3274 int ret = 0;
f08f2dda 3275
75af1f34
PL
3276 int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes,
3277 BDRV_REQUEST_MAX_SECTORS);
621f0589 3278
c31cb707
PL
3279 while (nb_sectors > 0 && !ret) {
3280 int num = nb_sectors;
3281
b8d71c09
PB
3282 /* Align request. Block drivers can expect the "bulk" of the request
3283 * to be aligned.
3284 */
3285 if (bs->bl.write_zeroes_alignment
3286 && num > bs->bl.write_zeroes_alignment) {
3287 if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3288 /* Make a small request up to the first aligned sector. */
c31cb707 3289 num = bs->bl.write_zeroes_alignment;
b8d71c09
PB
3290 num -= sector_num % bs->bl.write_zeroes_alignment;
3291 } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3292 /* Shorten the request to the last aligned sector. num cannot
3293 * underflow because num > bs->bl.write_zeroes_alignment.
3294 */
3295 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
c31cb707 3296 }
621f0589 3297 }
f08f2dda 3298
c31cb707
PL
3299 /* limit request size */
3300 if (num > max_write_zeroes) {
3301 num = max_write_zeroes;
3302 }
3303
3304 ret = -ENOTSUP;
3305 /* First try the efficient write zeroes operation */
3306 if (drv->bdrv_co_write_zeroes) {
3307 ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3308 }
3309
3310 if (ret == -ENOTSUP) {
3311 /* Fall back to bounce buffer if write zeroes is unsupported */
095e4fa4 3312 int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length,
98764152 3313 MAX_WRITE_ZEROES_BOUNCE_BUFFER);
095e4fa4 3314 num = MIN(num, max_xfer_len);
c31cb707
PL
3315 iov.iov_len = num * BDRV_SECTOR_SIZE;
3316 if (iov.iov_base == NULL) {
857d4f46
KW
3317 iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
3318 if (iov.iov_base == NULL) {
3319 ret = -ENOMEM;
3320 goto fail;
3321 }
b8d71c09 3322 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
c31cb707
PL
3323 }
3324 qemu_iovec_init_external(&qiov, &iov, 1);
f08f2dda 3325
c31cb707 3326 ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
b8d71c09
PB
3327
3328 /* Keep the bounce buffer around if it is big enough for all
3329 * future requests.
3330 */
095e4fa4 3331 if (num < max_xfer_len) {
b8d71c09
PB
3332 qemu_vfree(iov.iov_base);
3333 iov.iov_base = NULL;
3334 }
c31cb707
PL
3335 }
3336
3337 sector_num += num;
3338 nb_sectors -= num;
3339 }
f08f2dda 3340
857d4f46 3341fail:
f08f2dda
SH
3342 qemu_vfree(iov.iov_base);
3343 return ret;
3344}
3345
c5fbe571 3346/*
b404f720 3347 * Forwards an already correctly aligned write request to the BlockDriver.
c5fbe571 3348 */
b404f720 3349static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
65afd211
KW
3350 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3351 QEMUIOVector *qiov, int flags)
c5fbe571
SH
3352{
3353 BlockDriver *drv = bs->drv;
28de2dcd 3354 bool waited;
6b7cb247 3355 int ret;
da1fa91d 3356
b404f720
KW
3357 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3358 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
f4658285 3359
b404f720
KW
3360 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3361 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
8eb029c2 3362 assert(!qiov || bytes == qiov->size);
cc0681c4 3363
28de2dcd
KW
3364 waited = wait_serialising_requests(req);
3365 assert(!waited || !req->serialising);
af91f9a7
KW
3366 assert(req->overlap_offset <= offset);
3367 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
244eadef 3368
65afd211 3369 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
d616b224 3370
465bee1d
PL
3371 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
3372 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
3373 qemu_iovec_is_zero(qiov)) {
3374 flags |= BDRV_REQ_ZERO_WRITE;
3375 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
3376 flags |= BDRV_REQ_MAY_UNMAP;
3377 }
3378 }
3379
d616b224
SH
3380 if (ret < 0) {
3381 /* Do nothing, write notifier decided to fail this request */
3382 } else if (flags & BDRV_REQ_ZERO_WRITE) {
9e1cb96d 3383 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
aa7bfbff 3384 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
f08f2dda 3385 } else {
9e1cb96d 3386 BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
f08f2dda
SH
3387 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3388 }
9e1cb96d 3389 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
6b7cb247 3390
f05fa4ad
PB
3391 if (ret == 0 && !bs->enable_write_cache) {
3392 ret = bdrv_co_flush(bs);
3393 }
3394
e4654d2d 3395 bdrv_set_dirty(bs, sector_num, nb_sectors);
da1fa91d 3396
5366d0c8 3397 block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);
5e5a94b6 3398
c0191e76 3399 if (ret >= 0) {
df2a6f29
PB
3400 bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3401 }
da1fa91d 3402
6b7cb247 3403 return ret;
da1fa91d
KW
3404}
3405
b404f720
KW
3406/*
3407 * Handle a write request in coroutine context
3408 */
6601553e
KW
3409static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3410 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
b404f720
KW
3411 BdrvRequestFlags flags)
3412{
65afd211 3413 BdrvTrackedRequest req;
fc3959e4 3414 uint64_t align = bdrv_get_align(bs);
3b8242e0
KW
3415 uint8_t *head_buf = NULL;
3416 uint8_t *tail_buf = NULL;
3417 QEMUIOVector local_qiov;
3418 bool use_local_qiov = false;
b404f720
KW
3419 int ret;
3420
3421 if (!bs->drv) {
3422 return -ENOMEDIUM;
3423 }
3424 if (bs->read_only) {
3425 return -EACCES;
3426 }
b9c64947
HR
3427
3428 ret = bdrv_check_byte_request(bs, offset, bytes);
3429 if (ret < 0) {
3430 return ret;
b404f720
KW
3431 }
3432
b404f720
KW
3433 /* throttling disk I/O */
3434 if (bs->io_limits_enabled) {
d5103588 3435 bdrv_io_limits_intercept(bs, bytes, true);
b404f720
KW
3436 }
3437
3b8242e0
KW
3438 /*
3439 * Align write if necessary by performing a read-modify-write cycle.
3440 * Pad qiov with the read parts and be sure to have a tracked request not
3441 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3442 */
65afd211 3443 tracked_request_begin(&req, bs, offset, bytes, true);
3b8242e0
KW
3444
3445 if (offset & (align - 1)) {
3446 QEMUIOVector head_qiov;
3447 struct iovec head_iov;
3448
3449 mark_request_serialising(&req, align);
3450 wait_serialising_requests(&req);
3451
3452 head_buf = qemu_blockalign(bs, align);
3453 head_iov = (struct iovec) {
3454 .iov_base = head_buf,
3455 .iov_len = align,
3456 };
3457 qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3458
9e1cb96d 3459 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3b8242e0
KW
3460 ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3461 align, &head_qiov, 0);
3462 if (ret < 0) {
3463 goto fail;
3464 }
9e1cb96d 3465 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3b8242e0
KW
3466
3467 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3468 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3469 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3470 use_local_qiov = true;
3471
3472 bytes += offset & (align - 1);
3473 offset = offset & ~(align - 1);
3474 }
3475
3476 if ((offset + bytes) & (align - 1)) {
3477 QEMUIOVector tail_qiov;
3478 struct iovec tail_iov;
3479 size_t tail_bytes;
28de2dcd 3480 bool waited;
3b8242e0
KW
3481
3482 mark_request_serialising(&req, align);
28de2dcd
KW
3483 waited = wait_serialising_requests(&req);
3484 assert(!waited || !use_local_qiov);
3b8242e0
KW
3485
3486 tail_buf = qemu_blockalign(bs, align);
3487 tail_iov = (struct iovec) {
3488 .iov_base = tail_buf,
3489 .iov_len = align,
3490 };
3491 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3492
9e1cb96d 3493 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3b8242e0
KW
3494 ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3495 align, &tail_qiov, 0);
3496 if (ret < 0) {
3497 goto fail;
3498 }
9e1cb96d 3499 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3b8242e0
KW
3500
3501 if (!use_local_qiov) {
3502 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3503 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3504 use_local_qiov = true;
3505 }
3506
3507 tail_bytes = (offset + bytes) & (align - 1);
3508 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3509
3510 bytes = ROUND_UP(bytes, align);
3511 }
3512
fc3959e4
FZ
3513 if (use_local_qiov) {
3514 /* Local buffer may have non-zero data. */
3515 flags &= ~BDRV_REQ_ZERO_WRITE;
3516 }
3b8242e0
KW
3517 ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3518 use_local_qiov ? &local_qiov : qiov,
3519 flags);
3520
3521fail:
65afd211 3522 tracked_request_end(&req);
b404f720 3523
3b8242e0
KW
3524 if (use_local_qiov) {
3525 qemu_iovec_destroy(&local_qiov);
3b8242e0 3526 }
99c4a85c
KW
3527 qemu_vfree(head_buf);
3528 qemu_vfree(tail_buf);
3b8242e0 3529
b404f720
KW
3530 return ret;
3531}
3532
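/*
 * A minimal sketch of the alignment arithmetic behind the RMW padding
 * above, assuming 'align' is a power of two (the helper itself is
 * illustrative and not part of the original source):
 */
static inline void example_rmw_bounds(int64_t offset, unsigned int bytes,
                                      uint64_t align,
                                      int64_t *aligned_offset,
                                      unsigned int *padded_bytes)
{
    unsigned int head = offset & (align - 1);      /* unaligned head length */

    *aligned_offset = offset & ~(align - 1);       /* rounded-down start */
    *padded_bytes = ROUND_UP(bytes + head, align); /* head and tail padding */
}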
6601553e
KW
3533static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3534 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3535 BdrvRequestFlags flags)
3536{
75af1f34 3537 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
6601553e
KW
3538 return -EINVAL;
3539 }
3540
3541 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3542 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3543}
3544
c5fbe571
SH
3545int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3546 int nb_sectors, QEMUIOVector *qiov)
3547{
3548 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3549
f08f2dda
SH
3550 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3551}
3552
3553int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
aa7bfbff
PL
3554 int64_t sector_num, int nb_sectors,
3555 BdrvRequestFlags flags)
f08f2dda 3556{
fc3959e4
FZ
3557 int ret;
3558
94d6ff21 3559 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
f08f2dda 3560
d32f35cb
PL
3561 if (!(bs->open_flags & BDRV_O_UNMAP)) {
3562 flags &= ~BDRV_REQ_MAY_UNMAP;
3563 }
fc3959e4
FZ
3564 if (bdrv_req_is_aligned(bs, sector_num << BDRV_SECTOR_BITS,
3565 nb_sectors << BDRV_SECTOR_BITS)) {
3566 ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3567 BDRV_REQ_ZERO_WRITE | flags);
3568 } else {
3569 uint8_t *buf;
3570 QEMUIOVector local_qiov;
3571 size_t bytes = nb_sectors << BDRV_SECTOR_BITS;
3572
3573 buf = qemu_memalign(bdrv_opt_mem_align(bs), bytes);
3574 memset(buf, 0, bytes);
3575 qemu_iovec_init(&local_qiov, 1);
3576 qemu_iovec_add(&local_qiov, buf, bytes);
d32f35cb 3577
fc3959e4
FZ
3578 ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, &local_qiov,
3579 BDRV_REQ_ZERO_WRITE | flags);
3580 qemu_vfree(buf);
3581 }
3582 return ret;
c5fbe571
SH
3583}
3584
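/*
 * Hedged usage sketch (not from the original source): zeroing the first
 * megabyte of an image from coroutine context, letting the driver unmap
 * the range when the image was opened with BDRV_O_UNMAP:
 *
 *     ret = bdrv_co_write_zeroes(bs, 0, (1 << 20) >> BDRV_SECTOR_BITS,
 *                                BDRV_REQ_MAY_UNMAP);
 */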
83f64091
FB
3585/**
3586 * Truncate file to 'offset' bytes (needed only for file protocols)
3587 */
3588int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3589{
3590 BlockDriver *drv = bs->drv;
51762288 3591 int ret;
83f64091 3592 if (!drv)
19cb3738 3593 return -ENOMEDIUM;
83f64091
FB
3594 if (!drv->bdrv_truncate)
3595 return -ENOTSUP;
59f2689d
NS
3596 if (bs->read_only)
3597 return -EACCES;
9c75e168 3598
51762288
SH
3599 ret = drv->bdrv_truncate(bs, offset);
3600 if (ret == 0) {
3601 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
a7f53e26
MA
3602 if (bs->blk) {
3603 blk_dev_resize_cb(bs->blk);
3604 }
51762288
SH
3605 }
3606 return ret;
83f64091
FB
3607}
3608
4a1d5e1f
FZ
3609/**
3610 * Length of an allocated file in bytes. Sparse files are counted by actual
3611 * allocated space. Returns < 0 on error or if unknown.
3612 */
3613int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3614{
3615 BlockDriver *drv = bs->drv;
3616 if (!drv) {
3617 return -ENOMEDIUM;
3618 }
3619 if (drv->bdrv_get_allocated_file_size) {
3620 return drv->bdrv_get_allocated_file_size(bs);
3621 }
3622 if (bs->file) {
3623 return bdrv_get_allocated_file_size(bs->file);
3624 }
3625 return -ENOTSUP;
3626}
3627
83f64091 3628/**
65a9bb25 3629 * Return number of sectors on success, -errno on error.
83f64091 3630 */
65a9bb25 3631int64_t bdrv_nb_sectors(BlockDriverState *bs)
83f64091
FB
3632{
3633 BlockDriver *drv = bs->drv;
65a9bb25 3634
83f64091 3635 if (!drv)
19cb3738 3636 return -ENOMEDIUM;
51762288 3637
b94a2610
KW
3638 if (drv->has_variable_length) {
3639 int ret = refresh_total_sectors(bs, bs->total_sectors);
3640 if (ret < 0) {
3641 return ret;
46a4e4e6 3642 }
83f64091 3643 }
65a9bb25
MA
3644 return bs->total_sectors;
3645}
3646
3647/**
3648 * Return length in bytes on success, -errno on error.
3649 * The length is always a multiple of BDRV_SECTOR_SIZE.
3650 */
3651int64_t bdrv_getlength(BlockDriverState *bs)
3652{
3653 int64_t ret = bdrv_nb_sectors(bs);
3654
3655 return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE;
fc01f7e7
FB
3656}
3657
19cb3738 3658/* return 0 as the number of sectors if no device is present or on error */
96b8f136 3659void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
fc01f7e7 3660{
65a9bb25
MA
3661 int64_t nb_sectors = bdrv_nb_sectors(bs);
3662
3663 *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
fc01f7e7 3664}
cf98951b 3665
ff06f5f3
PB
3666void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3667 BlockdevOnError on_write_error)
abd7f68d
MA
3668{
3669 bs->on_read_error = on_read_error;
3670 bs->on_write_error = on_write_error;
3671}
3672
1ceee0d5 3673BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
abd7f68d
MA
3674{
3675 return is_read ? bs->on_read_error : bs->on_write_error;
3676}
3677
3e1caa5f
PB
3678BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3679{
3680 BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3681
3682 switch (on_err) {
3683 case BLOCKDEV_ON_ERROR_ENOSPC:
a589569f
WX
3684 return (error == ENOSPC) ?
3685 BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
3e1caa5f 3686 case BLOCKDEV_ON_ERROR_STOP:
a589569f 3687 return BLOCK_ERROR_ACTION_STOP;
3e1caa5f 3688 case BLOCKDEV_ON_ERROR_REPORT:
a589569f 3689 return BLOCK_ERROR_ACTION_REPORT;
3e1caa5f 3690 case BLOCKDEV_ON_ERROR_IGNORE:
a589569f 3691 return BLOCK_ERROR_ACTION_IGNORE;
3e1caa5f
PB
3692 default:
3693 abort();
3694 }
3695}
3696
c7c2ff0c
LC
3697static void send_qmp_error_event(BlockDriverState *bs,
3698 BlockErrorAction action,
3699 bool is_read, int error)
3700{
573742a5 3701 IoOperationType optype;
c7c2ff0c 3702
573742a5
PM
3703 optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
3704 qapi_event_send_block_io_error(bdrv_get_device_name(bs), optype, action,
c7c2ff0c 3705 bdrv_iostatus_is_enabled(bs),
624ff573
LC
3706 error == ENOSPC, strerror(error),
3707 &error_abort);
c7c2ff0c
LC
3708}
3709
3e1caa5f
PB
3710/* This is done by device models because, while the block layer knows
3711 * about the error, it does not know whether an operation comes from
3712 * the device or the block layer (from a job, for example).
3713 */
3714void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3715 bool is_read, int error)
3716{
3717 assert(error >= 0);
2bd3bce8 3718
a589569f 3719 if (action == BLOCK_ERROR_ACTION_STOP) {
2bd3bce8
PB
3720 /* First set the iostatus, so that "info block" returns an iostatus
3721 * that matches the events raised so far (an additional error iostatus
3722 * is fine, but not a lost one).
3723 */
3e1caa5f 3724 bdrv_iostatus_set_err(bs, error);
2bd3bce8
PB
3725
3726 /* Then raise the request to stop the VM and the event.
3727 * qemu_system_vmstop_request_prepare has two effects. First,
3728 * it ensures that the STOP event always comes after the
3729 * BLOCK_IO_ERROR event. Second, it ensures that even if management
3730 * can observe the STOP event and do a "cont" before the STOP
3731 * event is issued, the VM will not stop. In this case, vm_start()
3732 * also ensures that the STOP/RESUME pair of events is emitted.
3733 */
3734 qemu_system_vmstop_request_prepare();
c7c2ff0c 3735 send_qmp_error_event(bs, action, is_read, error);
2bd3bce8
PB
3736 qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
3737 } else {
c7c2ff0c 3738 send_qmp_error_event(bs, action, is_read, error);
3e1caa5f
PB
3739 }
3740}
3741
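/*
 * Hedged usage sketch (an assumption, not part of the original file): a
 * device model's completion path typically combines the two helpers above,
 * first mapping the errno to the configured policy, then acting on it:
 */
static void example_handle_write_error(BlockDriverState *bs, int error)
{
    BlockErrorAction action = bdrv_get_error_action(bs, false, error);

    /* emits the QMP BLOCK_IO_ERROR event and stops the VM if requested;
     * BLOCK_ERROR_ACTION_IGNORE is left for the caller to retry/ignore */
    bdrv_error_action(bs, action, false, error);
}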
b338082b
FB
3742int bdrv_is_read_only(BlockDriverState *bs)
3743{
3744 return bs->read_only;
3745}
3746
985a03b0
TS
3747int bdrv_is_sg(BlockDriverState *bs)
3748{
3749 return bs->sg;
3750}
3751
e900a7b7
CH
3752int bdrv_enable_write_cache(BlockDriverState *bs)
3753{
3754 return bs->enable_write_cache;
3755}
3756
425b0148
PB
3757void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3758{
3759 bs->enable_write_cache = wce;
55b110f2
JC
3760
3761 /* so a reopen() will preserve wce */
3762 if (wce) {
3763 bs->open_flags |= BDRV_O_CACHE_WB;
3764 } else {
3765 bs->open_flags &= ~BDRV_O_CACHE_WB;
3766 }
425b0148
PB
3767}
3768
ea2384d3
FB
3769int bdrv_is_encrypted(BlockDriverState *bs)
3770{
3771 if (bs->backing_hd && bs->backing_hd->encrypted)
3772 return 1;
3773 return bs->encrypted;
3774}
3775
c0f4ce77
AL
3776int bdrv_key_required(BlockDriverState *bs)
3777{
3778 BlockDriverState *backing_hd = bs->backing_hd;
3779
3780 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3781 return 1;
3782 return (bs->encrypted && !bs->valid_key);
3783}
3784
ea2384d3
FB
3785int bdrv_set_key(BlockDriverState *bs, const char *key)
3786{
3787 int ret;
3788 if (bs->backing_hd && bs->backing_hd->encrypted) {
3789 ret = bdrv_set_key(bs->backing_hd, key);
3790 if (ret < 0)
3791 return ret;
3792 if (!bs->encrypted)
3793 return 0;
3794 }
fd04a2ae
SH
3795 if (!bs->encrypted) {
3796 return -EINVAL;
3797 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3798 return -ENOMEDIUM;
3799 }
c0f4ce77 3800 ret = bs->drv->bdrv_set_key(bs, key);
bb5fc20f
AL
3801 if (ret < 0) {
3802 bs->valid_key = 0;
3803 } else if (!bs->valid_key) {
3804 bs->valid_key = 1;
a7f53e26
MA
3805 if (bs->blk) {
3806 /* call the change callback now, we skipped it on open */
3807 blk_dev_change_media_cb(bs->blk, true);
3808 }
bb5fc20f 3809 }
c0f4ce77 3810 return ret;
ea2384d3
FB
3811}
3812
4d2855a3
MA
3813/*
3814 * Provide an encryption key for @bs.
3815 * If @key is non-null:
3816 * If @bs is not encrypted, fail.
3817 * Else if the key is invalid, fail.
3818 * Else set @bs's key to @key, replacing the existing key, if any.
3819 * If @key is null:
3820 * If @bs is encrypted and still lacks a key, fail.
3821 * Else do nothing.
3822 * On failure, store an error object through @errp if non-null.
3823 */
3824void bdrv_add_key(BlockDriverState *bs, const char *key, Error **errp)
3825{
3826 if (key) {
3827 if (!bdrv_is_encrypted(bs)) {
81e5f78a
AG
3828 error_setg(errp, "Node '%s' is not encrypted",
3829 bdrv_get_device_or_node_name(bs));
4d2855a3
MA
3830 } else if (bdrv_set_key(bs, key) < 0) {
3831 error_set(errp, QERR_INVALID_PASSWORD);
3832 }
3833 } else {
3834 if (bdrv_key_required(bs)) {
b1ca6391
MA
3835 error_set(errp, ERROR_CLASS_DEVICE_ENCRYPTED,
3836 "'%s' (%s) is encrypted",
81e5f78a 3837 bdrv_get_device_or_node_name(bs),
4d2855a3
MA
3838 bdrv_get_encrypted_filename(bs));
3839 }
3840 }
3841}
3842
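/*
 * Hedged usage sketch (illustrative, not from the original source): trying
 * to unlock an encrypted image and reporting whether it became usable:
 */
static bool example_unlock_image(BlockDriverState *bs, const char *password)
{
    Error *err = NULL;

    bdrv_add_key(bs, password, &err);
    if (err) {
        error_free(err);
        return false;
    }
    return !bdrv_key_required(bs);
}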
f8d6bba1 3843const char *bdrv_get_format_name(BlockDriverState *bs)
ea2384d3 3844{
f8d6bba1 3845 return bs->drv ? bs->drv->format_name : NULL;
ea2384d3
FB
3846}
3847
ada42401
SH
3848static int qsort_strcmp(const void *a, const void *b)
3849{
3850 return strcmp(a, b);
3851}
3852
5fafdf24 3853void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
ea2384d3
FB
3854 void *opaque)
3855{
3856 BlockDriver *drv;
e855e4fb 3857 int count = 0;
ada42401 3858 int i;
e855e4fb 3859 const char **formats = NULL;
ea2384d3 3860
8a22f02a 3861 QLIST_FOREACH(drv, &bdrv_drivers, list) {
e855e4fb
JC
3862 if (drv->format_name) {
3863 bool found = false;
3864 int i = count;
3865 while (formats && i && !found) {
3866 found = !strcmp(formats[--i], drv->format_name);
3867 }
3868
3869 if (!found) {
5839e53b 3870 formats = g_renew(const char *, formats, count + 1);
e855e4fb 3871 formats[count++] = drv->format_name;
e855e4fb
JC
3872 }
3873 }
ea2384d3 3874 }
ada42401
SH
3875
3876 qsort(formats, count, sizeof(formats[0]), qsort_strcmp);
3877
3878 for (i = 0; i < count; i++) {
3879 it(opaque, formats[i]);
3880 }
3881
e855e4fb 3882 g_free(formats);
ea2384d3
FB
3883}
3884
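/*
 * Illustrative caller (not in the original source): the iterator above
 * hands each unique, sorted format name to a callback, e.g. for a
 * "format=?" style listing. 'opaque' is assumed to carry a FILE *.
 */
static void example_print_format(void *opaque, const char *name)
{
    fprintf((FILE *)opaque, " %s", name);
}

/* usage: bdrv_iterate_format(example_print_format, stdout); */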
dc364f4c
BC
3885/* This function is to find a node in the bs graph */
3886BlockDriverState *bdrv_find_node(const char *node_name)
3887{
3888 BlockDriverState *bs;
3889
3890 assert(node_name);
3891
3892 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3893 if (!strcmp(node_name, bs->node_name)) {
3894 return bs;
3895 }
3896 }
3897 return NULL;
3898}
3899
c13163fb 3900/* Put this QMP function here so it can access the static graph_bdrv_states. */
d5a8ee60 3901BlockDeviceInfoList *bdrv_named_nodes_list(Error **errp)
c13163fb
BC
3902{
3903 BlockDeviceInfoList *list, *entry;
3904 BlockDriverState *bs;
3905
3906 list = NULL;
3907 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
d5a8ee60
AG
3908 BlockDeviceInfo *info = bdrv_block_device_info(bs, errp);
3909 if (!info) {
3910 qapi_free_BlockDeviceInfoList(list);
3911 return NULL;
3912 }
c13163fb 3913 entry = g_malloc0(sizeof(*entry));
d5a8ee60 3914 entry->value = info;
c13163fb
BC
3915 entry->next = list;
3916 list = entry;
3917 }
3918
3919 return list;
3920}
3921
12d3ba82
BC
3922BlockDriverState *bdrv_lookup_bs(const char *device,
3923 const char *node_name,
3924 Error **errp)
3925{
7f06d47e
MA
3926 BlockBackend *blk;
3927 BlockDriverState *bs;
12d3ba82 3928
12d3ba82 3929 if (device) {
7f06d47e 3930 blk = blk_by_name(device);
12d3ba82 3931
7f06d47e
MA
3932 if (blk) {
3933 return blk_bs(blk);
12d3ba82 3934 }
12d3ba82
BC
3935 }
3936
dd67fa50
BC
3937 if (node_name) {
3938 bs = bdrv_find_node(node_name);
12d3ba82 3939
dd67fa50
BC
3940 if (bs) {
3941 return bs;
3942 }
12d3ba82
BC
3943 }
3944
dd67fa50
BC
3945 error_setg(errp, "Cannot find device=%s nor node_name=%s",
3946 device ? device : "",
3947 node_name ? node_name : "");
3948 return NULL;
12d3ba82
BC
3949}
3950
5a6684d2
JC
3951/* If 'base' is in the same chain as 'top', return true. Otherwise,
3952 * return false. If either argument is NULL, return false. */
3953bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
3954{
3955 while (top && top != base) {
3956 top = top->backing_hd;
3957 }
3958
3959 return top != NULL;
3960}
3961
04df765a
FZ
3962BlockDriverState *bdrv_next_node(BlockDriverState *bs)
3963{
3964 if (!bs) {
3965 return QTAILQ_FIRST(&graph_bdrv_states);
3966 }
3967 return QTAILQ_NEXT(bs, node_list);
3968}
3969
2f399b0a
MA
3970BlockDriverState *bdrv_next(BlockDriverState *bs)
3971{
3972 if (!bs) {
3973 return QTAILQ_FIRST(&bdrv_states);
3974 }
dc364f4c 3975 return QTAILQ_NEXT(bs, device_list);
2f399b0a
MA
3976}
3977
20a9e77d
FZ
3978const char *bdrv_get_node_name(const BlockDriverState *bs)
3979{
3980 return bs->node_name;
3981}
3982
7f06d47e 3983/* TODO check what callers really want: bs->node_name or blk_name() */
bfb197e0 3984const char *bdrv_get_device_name(const BlockDriverState *bs)
ea2384d3 3985{
bfb197e0 3986 return bs->blk ? blk_name(bs->blk) : "";
ea2384d3
FB
3987}
3988
9b2aa84f
AG
3989/* This can be used to identify nodes that might not have a device
3990 * name associated. Since node and device names live in the same
3991 * namespace, the result is unambiguous. The exception is if both are
3992 * absent, then this returns an empty (non-null) string. */
3993const char *bdrv_get_device_or_node_name(const BlockDriverState *bs)
3994{
3995 return bs->blk ? blk_name(bs->blk) : bs->node_name;
3996}
3997
c8433287
MA
3998int bdrv_get_flags(BlockDriverState *bs)
3999{
4000 return bs->open_flags;
4001}
4002
f0f0fdfe 4003int bdrv_flush_all(void)
c6ca28d6
AL
4004{
4005 BlockDriverState *bs;
f0f0fdfe 4006 int result = 0;
c6ca28d6 4007
dc364f4c 4008 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
ed78cda3
SH
4009 AioContext *aio_context = bdrv_get_aio_context(bs);
4010 int ret;
4011
4012 aio_context_acquire(aio_context);
4013 ret = bdrv_flush(bs);
f0f0fdfe
KW
4014 if (ret < 0 && !result) {
4015 result = ret;
4016 }
ed78cda3 4017 aio_context_release(aio_context);
1b7bdbc1 4018 }
f0f0fdfe
KW
4019
4020 return result;
c6ca28d6
AL
4021}
4022
3ac21627
PL
4023int bdrv_has_zero_init_1(BlockDriverState *bs)
4024{
4025 return 1;
4026}
4027
f2feebbd
KW
4028int bdrv_has_zero_init(BlockDriverState *bs)
4029{
4030 assert(bs->drv);
4031
11212d8f
PB
4032 /* If BS is a copy on write image, it is initialized to
4033 the contents of the base image, which may not be zeroes. */
4034 if (bs->backing_hd) {
4035 return 0;
4036 }
336c1c12
KW
4037 if (bs->drv->bdrv_has_zero_init) {
4038 return bs->drv->bdrv_has_zero_init(bs);
f2feebbd
KW
4039 }
4040
3ac21627
PL
4041 /* safe default */
4042 return 0;
f2feebbd
KW
4043}
4044
4ce78691
PL
4045bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
4046{
4047 BlockDriverInfo bdi;
4048
4049 if (bs->backing_hd) {
4050 return false;
4051 }
4052
4053 if (bdrv_get_info(bs, &bdi) == 0) {
4054 return bdi.unallocated_blocks_are_zero;
4055 }
4056
4057 return false;
4058}
4059
4060bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
4061{
4062 BlockDriverInfo bdi;
4063
4064 if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
4065 return false;
4066 }
4067
4068 if (bdrv_get_info(bs, &bdi) == 0) {
4069 return bdi.can_write_zeroes_with_unmap;
4070 }
4071
4072 return false;
4073}
4074
b6b8a333 4075typedef struct BdrvCoGetBlockStatusData {
376ae3f1 4076 BlockDriverState *bs;
b35b2bba 4077 BlockDriverState *base;
376ae3f1
SH
4078 int64_t sector_num;
4079 int nb_sectors;
4080 int *pnum;
b6b8a333 4081 int64_t ret;
376ae3f1 4082 bool done;
b6b8a333 4083} BdrvCoGetBlockStatusData;
376ae3f1 4084
f58c7b35 4085/*
705be728
FZ
4086 * Returns the allocation status of the specified sectors.
4087 * Drivers not implementing the functionality are assumed to not support
4088 * backing files, hence all their sectors are reported as allocated.
f58c7b35 4089 *
bd9533e3
SH
4090 * If 'sector_num' is beyond the end of the disk image the return value is 0
4091 * and 'pnum' is set to 0.
4092 *
f58c7b35
TS
4093 * 'pnum' is set to the number of sectors (including and immediately following
4094 * the specified sector) that are known to be in the same
4095 * allocated/unallocated state.
4096 *
bd9533e3
SH
4097 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
4098 * beyond the end of the disk image it will be clamped.
f58c7b35 4099 */
b6b8a333
PB
4100static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
4101 int64_t sector_num,
4102 int nb_sectors, int *pnum)
f58c7b35 4103{
30a7f2fc 4104 int64_t total_sectors;
bd9533e3 4105 int64_t n;
5daa74a6 4106 int64_t ret, ret2;
bd9533e3 4107
30a7f2fc
MA
4108 total_sectors = bdrv_nb_sectors(bs);
4109 if (total_sectors < 0) {
4110 return total_sectors;
617ccb46
PB
4111 }
4112
30a7f2fc 4113 if (sector_num >= total_sectors) {
bd9533e3
SH
4114 *pnum = 0;
4115 return 0;
4116 }
4117
30a7f2fc 4118 n = total_sectors - sector_num;
bd9533e3
SH
4119 if (n < nb_sectors) {
4120 nb_sectors = n;
4121 }
4122
b6b8a333 4123 if (!bs->drv->bdrv_co_get_block_status) {
bd9533e3 4124 *pnum = nb_sectors;
e88ae226 4125 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
918e92d7
PB
4126 if (bs->drv->protocol_name) {
4127 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
4128 }
4129 return ret;
f58c7b35 4130 }
6aebab14 4131
415b5b01
PB
4132 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
4133 if (ret < 0) {
3e0a233d 4134 *pnum = 0;
415b5b01
PB
4135 return ret;
4136 }
4137
92bc50a5
PL
4138 if (ret & BDRV_BLOCK_RAW) {
4139 assert(ret & BDRV_BLOCK_OFFSET_VALID);
4140 return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
4141 *pnum, pnum);
4142 }
4143
e88ae226
KW
4144 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
4145 ret |= BDRV_BLOCK_ALLOCATED;
4146 }
4147
c3d86884
PL
4148 if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
4149 if (bdrv_unallocated_blocks_are_zero(bs)) {
f0ad5712 4150 ret |= BDRV_BLOCK_ZERO;
1f9db224 4151 } else if (bs->backing_hd) {
f0ad5712 4152 BlockDriverState *bs2 = bs->backing_hd;
30a7f2fc
MA
4153 int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
4154 if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
f0ad5712
PB
4155 ret |= BDRV_BLOCK_ZERO;
4156 }
4157 }
415b5b01 4158 }
5daa74a6
PB
4159
4160 if (bs->file &&
4161 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
4162 (ret & BDRV_BLOCK_OFFSET_VALID)) {
59c9a95f
HR
4163 int file_pnum;
4164
5daa74a6 4165 ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
59c9a95f 4166 *pnum, &file_pnum);
5daa74a6
PB
4167 if (ret2 >= 0) {
4168 /* Ignore errors. This is just providing extra information, it
4169 * is useful but not necessary.
4170 */
59c9a95f
HR
4171 if (!file_pnum) {
4172 /* !file_pnum indicates an offset at or beyond the EOF; it is
4173 * perfectly valid for the format block driver to point to such
4174 * offsets, so catch it and mark everything as zero */
4175 ret |= BDRV_BLOCK_ZERO;
4176 } else {
4177 /* Limit request to the range reported by the protocol driver */
4178 *pnum = file_pnum;
4179 ret |= (ret2 & BDRV_BLOCK_ZERO);
4180 }
5daa74a6
PB
4181 }
4182 }
4183
415b5b01 4184 return ret;
060f51c9
SH
4185}
4186
b6b8a333
PB
4187/* Coroutine wrapper for bdrv_get_block_status() */
4188static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
060f51c9 4189{
b6b8a333 4190 BdrvCoGetBlockStatusData *data = opaque;
060f51c9
SH
4191 BlockDriverState *bs = data->bs;
4192
b6b8a333
PB
4193 data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
4194 data->pnum);
060f51c9
SH
4195 data->done = true;
4196}
4197
4198/*
b6b8a333 4199 * Synchronous wrapper around bdrv_co_get_block_status().
060f51c9 4200 *
b6b8a333 4201 * See bdrv_co_get_block_status() for details.
060f51c9 4202 */
b6b8a333
PB
4203int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
4204 int nb_sectors, int *pnum)
060f51c9 4205{
6aebab14 4206 Coroutine *co;
b6b8a333 4207 BdrvCoGetBlockStatusData data = {
6aebab14
SH
4208 .bs = bs,
4209 .sector_num = sector_num,
4210 .nb_sectors = nb_sectors,
4211 .pnum = pnum,
4212 .done = false,
4213 };
4214
bdad13b9
PB
4215 if (qemu_in_coroutine()) {
4216 /* Fast-path if already in coroutine context */
b6b8a333 4217 bdrv_get_block_status_co_entry(&data);
bdad13b9 4218 } else {
2572b37a
SH
4219 AioContext *aio_context = bdrv_get_aio_context(bs);
4220
b6b8a333 4221 co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
bdad13b9
PB
4222 qemu_coroutine_enter(co, &data);
4223 while (!data.done) {
2572b37a 4224 aio_poll(aio_context, true);
bdad13b9 4225 }
6aebab14
SH
4226 }
4227 return data.ret;
f58c7b35
TS
4228}
4229
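/*
 * A hedged sketch (assumption, not part of the original file) of how
 * "qemu-img map" style tooling consumes the synchronous wrapper above:
 * advance through the image in extents of identical status.
 */
static void example_map_image(BlockDriverState *bs)
{
    int64_t total = bdrv_nb_sectors(bs);
    int64_t sector_num = 0;

    while (total > 0 && sector_num < total) {
        int pnum;
        int64_t ret = bdrv_get_block_status(bs, sector_num,
                                            MIN(total - sector_num,
                                                BDRV_REQUEST_MAX_SECTORS),
                                            &pnum);
        if (ret < 0 || pnum == 0) {
            break;                  /* error, or clamped past end of image */
        }
        printf("%" PRId64 "..%" PRId64 ":%s%s\n",
               sector_num, sector_num + pnum,
               (ret & BDRV_BLOCK_DATA) ? " data" : "",
               (ret & BDRV_BLOCK_ZERO) ? " zero" : "");
        sector_num += pnum;
    }
}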
b6b8a333
PB
4230int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
4231 int nb_sectors, int *pnum)
4232{
4333bb71
PB
4233 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
4234 if (ret < 0) {
4235 return ret;
4236 }
01fb2705 4237 return !!(ret & BDRV_BLOCK_ALLOCATED);
b6b8a333
PB
4238}
4239
188a7bbf
PB
4240/*
4241 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
4242 *
4243 * Return true if the given sector is allocated in any image between
4244 * BASE and TOP (inclusive). BASE can be NULL to check if the given
4245 * sector is allocated in any image of the chain. Return false otherwise.
4246 *
4247 * 'pnum' is set to the number of sectors (including and immediately following
4248 * the specified sector) that are known to be in the same
4249 * allocated/unallocated state.
4250 *
4251 */
4f578637
PB
4252int bdrv_is_allocated_above(BlockDriverState *top,
4253 BlockDriverState *base,
4254 int64_t sector_num,
4255 int nb_sectors, int *pnum)
188a7bbf
PB
4256{
4257 BlockDriverState *intermediate;
4258 int ret, n = nb_sectors;
4259
4260 intermediate = top;
4261 while (intermediate && intermediate != base) {
4262 int pnum_inter;
bdad13b9
PB
4263 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
4264 &pnum_inter);
188a7bbf
PB
4265 if (ret < 0) {
4266 return ret;
4267 } else if (ret) {
4268 *pnum = pnum_inter;
4269 return 1;
4270 }
4271
4272 /*
4273 * [sector_num, nb_sectors] is unallocated on top but intermediate
4274 * might have
4275 *
4276 * [sector_num+x, nr_sectors] allocated.
4277 */
63ba17d3
VI
4278 if (n > pnum_inter &&
4279 (intermediate == top ||
4280 sector_num + pnum_inter < intermediate->total_sectors)) {
188a7bbf
PB
4281 n = pnum_inter;
4282 }
4283
4284 intermediate = intermediate->backing_hd;
4285 }
4286
4287 *pnum = n;
4288 return 0;
4289}
4290
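/*
 * Hedged usage sketch (not from the original source): stream/commit style
 * jobs use the helper above to find work, i.e. the first range that is
 * backed somewhere in the sub-chain between base and top:
 */
static int64_t example_first_allocated(BlockDriverState *top,
                                       BlockDriverState *base,
                                       int64_t sector_num, int64_t end)
{
    while (sector_num < end) {
        int pnum;
        int ret = bdrv_is_allocated_above(top, base, sector_num,
                                          MIN(end - sector_num,
                                              BDRV_REQUEST_MAX_SECTORS),
                                          &pnum);
        if (ret < 0) {
            return ret;             /* propagate -errno */
        }
        if (ret) {
            return sector_num;      /* start of first allocated range */
        }
        if (pnum == 0) {
            break;                  /* defensive: avoid spinning */
        }
        sector_num += pnum;
    }
    return end;                     /* nothing allocated in the window */
}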
045df330
AL
4291const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4292{
4293 if (bs->backing_hd && bs->backing_hd->encrypted)
4294 return bs->backing_file;
4295 else if (bs->encrypted)
4296 return bs->filename;
4297 else
4298 return NULL;
4299}
4300
5fafdf24 4301void bdrv_get_backing_filename(BlockDriverState *bs,
83f64091
FB
4302 char *filename, int filename_size)
4303{
3574c608 4304 pstrcpy(filename, filename_size, bs->backing_file);
83f64091
FB
4305}
4306
5fafdf24 4307int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
faea38e7
FB
4308 const uint8_t *buf, int nb_sectors)
4309{
4310 BlockDriver *drv = bs->drv;
b9c64947
HR
4311 int ret;
4312
4313 if (!drv) {
19cb3738 4314 return -ENOMEDIUM;
b9c64947
HR
4315 }
4316 if (!drv->bdrv_write_compressed) {
faea38e7 4317 return -ENOTSUP;
b9c64947
HR
4318 }
4319 ret = bdrv_check_request(bs, sector_num, nb_sectors);
4320 if (ret < 0) {
4321 return ret;
4322 }
a55eb92c 4323
e4654d2d 4324 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
a55eb92c 4325
faea38e7
FB
4326 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4327}
3b46e624 4328
faea38e7
FB
4329int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4330{
4331 BlockDriver *drv = bs->drv;
4332 if (!drv)
19cb3738 4333 return -ENOMEDIUM;
faea38e7
FB
4334 if (!drv->bdrv_get_info)
4335 return -ENOTSUP;
4336 memset(bdi, 0, sizeof(*bdi));
4337 return drv->bdrv_get_info(bs, bdi);
4338}
4339
eae041fe
HR
4340ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4341{
4342 BlockDriver *drv = bs->drv;
4343 if (drv && drv->bdrv_get_specific_info) {
4344 return drv->bdrv_get_specific_info(bs);
4345 }
4346 return NULL;
4347}
4348
45566e9c
CH
4349int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4350 int64_t pos, int size)
cf8074b3
KW
4351{
4352 QEMUIOVector qiov;
4353 struct iovec iov = {
4354 .iov_base = (void *) buf,
4355 .iov_len = size,
4356 };
4357
4358 qemu_iovec_init_external(&qiov, &iov, 1);
4359 return bdrv_writev_vmstate(bs, &qiov, pos);
4360}
4361
4362int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
178e08a5
AL
4363{
4364 BlockDriver *drv = bs->drv;
cf8074b3
KW
4365
4366 if (!drv) {
178e08a5 4367 return -ENOMEDIUM;
cf8074b3
KW
4368 } else if (drv->bdrv_save_vmstate) {
4369 return drv->bdrv_save_vmstate(bs, qiov, pos);
4370 } else if (bs->file) {
4371 return bdrv_writev_vmstate(bs->file, qiov, pos);
4372 }
4373
7cdb1f6d 4374 return -ENOTSUP;
178e08a5
AL
4375}
4376
45566e9c
CH
4377int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4378 int64_t pos, int size)
178e08a5
AL
4379{
4380 BlockDriver *drv = bs->drv;
4381 if (!drv)
4382 return -ENOMEDIUM;
7cdb1f6d
MK
4383 if (drv->bdrv_load_vmstate)
4384 return drv->bdrv_load_vmstate(bs, buf, pos, size);
4385 if (bs->file)
4386 return bdrv_load_vmstate(bs->file, buf, pos, size);
4387 return -ENOTSUP;
178e08a5
AL
4388}
4389
8b9b0cc2
KW
4390void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4391{
bf736fe3 4392 if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
8b9b0cc2
KW
4393 return;
4394 }
4395
bf736fe3 4396 bs->drv->bdrv_debug_event(bs, event);
41c695c7
KW
4397}
4398
4399int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4400 const char *tag)
4401{
4402 while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4403 bs = bs->file;
4404 }
4405
4406 if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4407 return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4408 }
4409
4410 return -ENOTSUP;
4411}
4412
4cc70e93
FZ
4413int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4414{
4415 while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4416 bs = bs->file;
4417 }
4418
4419 if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4420 return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4421 }
4422
4423 return -ENOTSUP;
4424}
4425
41c695c7
KW
4426int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4427{
938789ea 4428 while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
41c695c7
KW
4429 bs = bs->file;
4430 }
8b9b0cc2 4431
41c695c7
KW
4432 if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4433 return bs->drv->bdrv_debug_resume(bs, tag);
4434 }
4435
4436 return -ENOTSUP;
4437}
4438
4439bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4440{
4441 while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4442 bs = bs->file;
4443 }
4444
4445 if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4446 return bs->drv->bdrv_debug_is_suspended(bs, tag);
4447 }
4448
4449 return false;
8b9b0cc2
KW
4450}
4451
199630b6
BS
4452int bdrv_is_snapshot(BlockDriverState *bs)
4453{
4454 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4455}
4456
b1b1d783
JC
4457/* backing_file can be relative, absolute, or a protocol. If it is
4458 * relative, it must be relative to the chain. So, passing in bs->filename
4459 * from a BDS as backing_file should not be done, as that may be relative to
4460 * the CWD rather than the chain. */
e8a6bb9c
MT
4461BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4462 const char *backing_file)
4463{
b1b1d783
JC
4464 char *filename_full = NULL;
4465 char *backing_file_full = NULL;
4466 char *filename_tmp = NULL;
4467 int is_protocol = 0;
4468 BlockDriverState *curr_bs = NULL;
4469 BlockDriverState *retval = NULL;
4470
4471 if (!bs || !bs->drv || !backing_file) {
e8a6bb9c
MT
4472 return NULL;
4473 }
4474
b1b1d783
JC
4475 filename_full = g_malloc(PATH_MAX);
4476 backing_file_full = g_malloc(PATH_MAX);
4477 filename_tmp = g_malloc(PATH_MAX);
4478
4479 is_protocol = path_has_protocol(backing_file);
4480
4481 for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4482
4483 /* If either of the filename paths is actually a protocol, then
4484 * compare unmodified paths; otherwise make paths relative */
4485 if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4486 if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4487 retval = curr_bs->backing_hd;
4488 break;
4489 }
e8a6bb9c 4490 } else {
b1b1d783
JC
4491 /* If not an absolute filename path, make it relative to the current
4492 * image's filename path */
4493 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4494 backing_file);
4495
4496 /* We are going to compare absolute pathnames */
4497 if (!realpath(filename_tmp, filename_full)) {
4498 continue;
4499 }
4500
4501 /* We need to make sure the backing filename we are comparing against
4502 * is relative to the current image filename (or absolute) */
4503 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4504 curr_bs->backing_file);
4505
4506 if (!realpath(filename_tmp, backing_file_full)) {
4507 continue;
4508 }
4509
4510 if (strcmp(backing_file_full, filename_full) == 0) {
4511 retval = curr_bs->backing_hd;
4512 break;
4513 }
e8a6bb9c
MT
4514 }
4515 }
4516
b1b1d783
JC
4517 g_free(filename_full);
4518 g_free(backing_file_full);
4519 g_free(filename_tmp);
4520 return retval;
e8a6bb9c
MT
4521}
4522
f198fd1c
BC
4523int bdrv_get_backing_file_depth(BlockDriverState *bs)
4524{
4525 if (!bs->drv) {
4526 return 0;
4527 }
4528
4529 if (!bs->backing_hd) {
4530 return 0;
4531 }
4532
4533 return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4534}
4535
ea2384d3 4536/**************************************************************/
83f64091 4537/* async I/Os */
ea2384d3 4538
7c84b1b8
MA
4539BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4540 QEMUIOVector *qiov, int nb_sectors,
097310b5 4541 BlockCompletionFunc *cb, void *opaque)
83f64091 4542{
bbf0a440
SH
4543 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4544
d20d9b7c 4545 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
8c5873d6 4546 cb, opaque, false);
ea2384d3
FB
4547}
4548
7c84b1b8
MA
4549BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4550 QEMUIOVector *qiov, int nb_sectors,
097310b5 4551 BlockCompletionFunc *cb, void *opaque)
ea2384d3 4552{
bbf0a440
SH
4553 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4554
d20d9b7c 4555 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
8c5873d6 4556 cb, opaque, true);
83f64091
FB
4557}
4558
7c84b1b8 4559BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
d5ef94d4 4560 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
097310b5 4561 BlockCompletionFunc *cb, void *opaque)
d5ef94d4
PB
4562{
4563 trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4564
4565 return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4566 BDRV_REQ_ZERO_WRITE | flags,
4567 cb, opaque, true);
4568}
4569
40b4f539
KW
4570
4571typedef struct MultiwriteCB {
4572 int error;
4573 int num_requests;
4574 int num_callbacks;
4575 struct {
097310b5 4576 BlockCompletionFunc *cb;
40b4f539
KW
4577 void *opaque;
4578 QEMUIOVector *free_qiov;
40b4f539
KW
4579 } callbacks[];
4580} MultiwriteCB;
4581
4582static void multiwrite_user_cb(MultiwriteCB *mcb)
4583{
4584 int i;
4585
4586 for (i = 0; i < mcb->num_callbacks; i++) {
4587 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
1e1ea48d
SH
4588 if (mcb->callbacks[i].free_qiov) {
4589 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4590 }
7267c094 4591 g_free(mcb->callbacks[i].free_qiov);
40b4f539
KW
4592 }
4593}
4594
4595static void multiwrite_cb(void *opaque, int ret)
4596{
4597 MultiwriteCB *mcb = opaque;
4598
6d519a5f
SH
4599 trace_multiwrite_cb(mcb, ret);
4600
cb6d3ca0 4601 if (ret < 0 && !mcb->error) {
40b4f539 4602 mcb->error = ret;
40b4f539
KW
4603 }
4604
4605 mcb->num_requests--;
4606 if (mcb->num_requests == 0) {
de189a1b 4607 multiwrite_user_cb(mcb);
7267c094 4608 g_free(mcb);
40b4f539
KW
4609 }
4610}
4611
4612static int multiwrite_req_compare(const void *a, const void *b)
4613{
77be4366
CH
4614 const BlockRequest *req1 = a, *req2 = b;
4615
4616 /*
4617 * Note that we can't simply subtract req2->sector from req1->sector
4618 * here as that could overflow the return value.
4619 */
4620 if (req1->sector > req2->sector) {
4621 return 1;
4622 } else if (req1->sector < req2->sector) {
4623 return -1;
4624 } else {
4625 return 0;
4626 }
40b4f539
KW
4627}
4628
4629/*
4630 * Takes a bunch of requests and tries to merge them. Returns the number of
4631 * requests that remain after merging.
4632 */
4633static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4634 int num_reqs, MultiwriteCB *mcb)
4635{
4636 int i, outidx;
4637
4638 // Sort requests by start sector
4639 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4640
4641 // Check if adjacent requests touch the same clusters. If so, combine them,
4642 // filling up gaps with zero sectors.
4643 outidx = 0;
4644 for (i = 1; i < num_reqs; i++) {
4645 int merge = 0;
4646 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4647
b6a127a1 4648 // Handle exactly sequential writes and overlapping writes.
40b4f539
KW
4649 if (reqs[i].sector <= oldreq_last) {
4650 merge = 1;
4651 }
4652
e2a305fb
CH
4653 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4654 merge = 0;
4655 }
4656
6c5a42ac
PL
4657 if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
4658 reqs[i].nb_sectors > bs->bl.max_transfer_length) {
4659 merge = 0;
4660 }
4661
40b4f539
KW
4662 if (merge) {
4663 size_t size;
7267c094 4664 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
40b4f539
KW
4665 qemu_iovec_init(qiov,
4666 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4667
4668 // Add the first request to the merged one. If the requests are
4669 // overlapping, drop the last sectors of the first request.
4670 size = (reqs[i].sector - reqs[outidx].sector) << 9;
1b093c48 4671 qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
40b4f539 4672
b6a127a1
PB
4673 // We should need to add any zeros between the two requests
4674 assert (reqs[i].sector <= oldreq_last);
40b4f539
KW
4675
4676 // Add the second request
1b093c48 4677 qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
40b4f539 4678
391827eb
SH
4679 // Add tail of first request, if necessary
4680 if (qiov->size < reqs[outidx].qiov->size) {
4681 qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
4682 reqs[outidx].qiov->size - qiov->size);
4683 }
4684
cbf1dff2 4685 reqs[outidx].nb_sectors = qiov->size >> 9;
40b4f539
KW
4686 reqs[outidx].qiov = qiov;
4687
4688 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4689 } else {
4690 outidx++;
4691 reqs[outidx].sector = reqs[i].sector;
4692 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4693 reqs[outidx].qiov = reqs[i].qiov;
4694 }
4695 }
4696
f4564d53
PL
4697 block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1);
4698
40b4f539
KW
4699 return outidx + 1;
4700}
4701
4702/*
4703 * Submit multiple AIO write requests at once.
4704 *
4705 * On success, the function returns 0 and all requests in the reqs array have
4706 * been submitted. On error, this function returns -1, and any of the
4707 * requests may or may not be submitted yet. In particular, this means that the
4708 * callback will be called for some of the requests, for others it won't. The
4709 * caller must check the error field of the BlockRequest to wait for the right
4710 * callbacks (if error != 0, no callback will be called).
4711 *
4712 * The implementation may modify the contents of the reqs array, e.g. to merge
4713 * requests. However, the fields opaque and error are left unmodified as they
4714 * are used to signal failure for a single request to the caller.
4715 */
4716int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4717{
40b4f539
KW
4718 MultiwriteCB *mcb;
4719 int i;
4720
301db7c2
RH
4721 /* don't submit writes if we don't have a medium */
4722 if (bs->drv == NULL) {
4723 for (i = 0; i < num_reqs; i++) {
4724 reqs[i].error = -ENOMEDIUM;
4725 }
4726 return -1;
4727 }
4728
40b4f539
KW
4729 if (num_reqs == 0) {
4730 return 0;
4731 }
4732
4733 // Create MultiwriteCB structure
7267c094 4734 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
40b4f539
KW
4735 mcb->num_requests = 0;
4736 mcb->num_callbacks = num_reqs;
4737
4738 for (i = 0; i < num_reqs; i++) {
4739 mcb->callbacks[i].cb = reqs[i].cb;
4740 mcb->callbacks[i].opaque = reqs[i].opaque;
4741 }
4742
4743 // Check for mergable requests
4744 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4745
6d519a5f
SH
4746 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4747
df9309fb
PB
4748 /* Run the aio requests. */
4749 mcb->num_requests = num_reqs;
40b4f539 4750 for (i = 0; i < num_reqs; i++) {
d20d9b7c
PB
4751 bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4752 reqs[i].nb_sectors, reqs[i].flags,
4753 multiwrite_cb, mcb,
4754 true);
40b4f539
KW
4755 }
4756
4757 return 0;
40b4f539
KW
4758}
4759
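/*
 * Hedged usage sketch (an assumption, not from the original file):
 * virtio-blk style batching submits an array of BlockRequests at once;
 * per-request failure is signalled through each request's error field,
 * for which no callback will run:
 */
static void example_submit_batch(BlockDriverState *bs,
                                 BlockRequest *reqs, int num_reqs)
{
    if (bdrv_aio_multiwrite(bs, reqs, num_reqs) < 0) {
        int i;

        for (i = 0; i < num_reqs; i++) {
            if (reqs[i].error) {
                /* complete this request here; its callback won't fire */
            }
        }
    }
}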
7c84b1b8 4760void bdrv_aio_cancel(BlockAIOCB *acb)
83f64091 4761{
ca5fd113
FZ
4762 qemu_aio_ref(acb);
4763 bdrv_aio_cancel_async(acb);
4764 while (acb->refcnt > 1) {
4765 if (acb->aiocb_info->get_aio_context) {
4766 aio_poll(acb->aiocb_info->get_aio_context(acb), true);
4767 } else if (acb->bs) {
4768 aio_poll(bdrv_get_aio_context(acb->bs), true);
4769 } else {
4770 abort();
02c50efe 4771 }
02c50efe 4772 }
8007429a 4773 qemu_aio_unref(acb);
02c50efe
FZ
4774}
4775
4776/* Async version of aio cancel. The caller is not blocked if the acb implements
4777 * cancel_async; otherwise we do nothing and let the request complete normally.
4778 * In either case the completion callback must be called. */
7c84b1b8 4779void bdrv_aio_cancel_async(BlockAIOCB *acb)
02c50efe
FZ
4780{
4781 if (acb->aiocb_info->cancel_async) {
4782 acb->aiocb_info->cancel_async(acb);
4783 }
83f64091
FB
4784}
4785
4786/**************************************************************/
4787/* async block device emulation */
4788
7c84b1b8
MA
4789typedef struct BlockAIOCBSync {
4790 BlockAIOCB common;
c16b5a2c
CH
4791 QEMUBH *bh;
4792 int ret;
4793 /* vector translation state */
4794 QEMUIOVector *qiov;
4795 uint8_t *bounce;
4796 int is_write;
7c84b1b8 4797} BlockAIOCBSync;
c16b5a2c 4798
d7331bed 4799static const AIOCBInfo bdrv_em_aiocb_info = {
7c84b1b8 4800 .aiocb_size = sizeof(BlockAIOCBSync),
c16b5a2c
CH
4801};
4802
ce1a14dc 4803static void bdrv_aio_bh_cb(void *opaque)
83f64091 4804{
7c84b1b8 4805 BlockAIOCBSync *acb = opaque;
f141eafe 4806
857d4f46 4807 if (!acb->is_write && acb->ret >= 0) {
03396148 4808 qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
857d4f46 4809 }
ceb42de8 4810 qemu_vfree(acb->bounce);
ce1a14dc 4811 acb->common.cb(acb->common.opaque, acb->ret);
6a7ad299 4812 qemu_bh_delete(acb->bh);
36afc451 4813 acb->bh = NULL;
8007429a 4814 qemu_aio_unref(acb);
83f64091 4815}
beac80cd 4816
7c84b1b8
MA
4817static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4818 int64_t sector_num,
4819 QEMUIOVector *qiov,
4820 int nb_sectors,
097310b5 4821 BlockCompletionFunc *cb,
7c84b1b8
MA
4822 void *opaque,
4823 int is_write)
f141eafe 4824
83f64091 4825{
7c84b1b8 4826 BlockAIOCBSync *acb;
ce1a14dc 4827
d7331bed 4828 acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
f141eafe
AL
4829 acb->is_write = is_write;
4830 acb->qiov = qiov;
857d4f46 4831 acb->bounce = qemu_try_blockalign(bs, qiov->size);
2572b37a 4832 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
f141eafe 4833
857d4f46
KW
4834 if (acb->bounce == NULL) {
4835 acb->ret = -ENOMEM;
4836 } else if (is_write) {
d5e6b161 4837 qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
1ed20acf 4838 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
f141eafe 4839 } else {
1ed20acf 4840 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
f141eafe
AL
4841 }
4842
ce1a14dc 4843 qemu_bh_schedule(acb->bh);
f141eafe 4844
ce1a14dc 4845 return &acb->common;
beac80cd
FB
4846}
4847
7c84b1b8 4848static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
f141eafe 4849 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
097310b5 4850 BlockCompletionFunc *cb, void *opaque)
beac80cd 4851{
f141eafe
AL
4852 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4853}
83f64091 4854
7c84b1b8 4855static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
f141eafe 4856 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
097310b5 4857 BlockCompletionFunc *cb, void *opaque)
f141eafe
AL
4858{
4859 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
beac80cd 4860}
beac80cd 4861
68485420 4862
7c84b1b8
MA
4863typedef struct BlockAIOCBCoroutine {
4864 BlockAIOCB common;
68485420
KW
4865 BlockRequest req;
4866 bool is_write;
0b5a2445 4867 bool need_bh;
d318aea9 4868 bool *done;
68485420 4869 QEMUBH* bh;
7c84b1b8 4870} BlockAIOCBCoroutine;
68485420 4871
d7331bed 4872static const AIOCBInfo bdrv_em_co_aiocb_info = {
7c84b1b8 4873 .aiocb_size = sizeof(BlockAIOCBCoroutine),
68485420
KW
4874};
4875
0b5a2445
PB
4876static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
4877{
4878 if (!acb->need_bh) {
4879 acb->common.cb(acb->common.opaque, acb->req.error);
4880 qemu_aio_unref(acb);
4881 }
4882}
4883
35246a68 4884static void bdrv_co_em_bh(void *opaque)
68485420 4885{
7c84b1b8 4886 BlockAIOCBCoroutine *acb = opaque;
68485420 4887
0b5a2445 4888 assert(!acb->need_bh);
68485420 4889 qemu_bh_delete(acb->bh);
0b5a2445
PB
4890 bdrv_co_complete(acb);
4891}
4892
4893static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
4894{
4895 acb->need_bh = false;
4896 if (acb->req.error != -EINPROGRESS) {
4897 BlockDriverState *bs = acb->common.bs;
4898
4899 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4900 qemu_bh_schedule(acb->bh);
4901 }
68485420
KW
4902}
4903
b2a61371
SH
4904/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4905static void coroutine_fn bdrv_co_do_rw(void *opaque)
4906{
7c84b1b8 4907 BlockAIOCBCoroutine *acb = opaque;
b2a61371
SH
4908 BlockDriverState *bs = acb->common.bs;
4909
4910 if (!acb->is_write) {
4911 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
d20d9b7c 4912 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
b2a61371
SH
4913 } else {
4914 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
d20d9b7c 4915 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
b2a61371
SH
4916 }
4917
0b5a2445 4918 bdrv_co_complete(acb);
b2a61371
SH
4919}
4920
7c84b1b8
MA
4921static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4922 int64_t sector_num,
4923 QEMUIOVector *qiov,
4924 int nb_sectors,
4925 BdrvRequestFlags flags,
097310b5 4926 BlockCompletionFunc *cb,
7c84b1b8
MA
4927 void *opaque,
4928 bool is_write)
68485420
KW
4929{
4930 Coroutine *co;
7c84b1b8 4931 BlockAIOCBCoroutine *acb;
68485420 4932
d7331bed 4933 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
0b5a2445
PB
4934 acb->need_bh = true;
4935 acb->req.error = -EINPROGRESS;
68485420
KW
4936 acb->req.sector = sector_num;
4937 acb->req.nb_sectors = nb_sectors;
4938 acb->req.qiov = qiov;
d20d9b7c 4939 acb->req.flags = flags;
68485420
KW
4940 acb->is_write = is_write;
4941
8c5873d6 4942 co = qemu_coroutine_create(bdrv_co_do_rw);
68485420
KW
4943 qemu_coroutine_enter(co, acb);
4944
0b5a2445 4945 bdrv_co_maybe_schedule_bh(acb);
68485420
KW
4946 return &acb->common;
4947}
4948
07f07615 4949static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
b2e12bc6 4950{
7c84b1b8 4951 BlockAIOCBCoroutine *acb = opaque;
07f07615 4952 BlockDriverState *bs = acb->common.bs;
b2e12bc6 4953
07f07615 4954 acb->req.error = bdrv_co_flush(bs);
0b5a2445 4955 bdrv_co_complete(acb);
b2e12bc6
CH
4956}
4957
7c84b1b8 4958BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
097310b5 4959 BlockCompletionFunc *cb, void *opaque)
016f5cf6 4960{
07f07615 4961 trace_bdrv_aio_flush(bs, opaque);
016f5cf6 4962
07f07615 4963 Coroutine *co;
7c84b1b8 4964 BlockAIOCBCoroutine *acb;
016f5cf6 4965
d7331bed 4966 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
0b5a2445
PB
4967 acb->need_bh = true;
4968 acb->req.error = -EINPROGRESS;
d318aea9 4969
07f07615
PB
4970 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4971 qemu_coroutine_enter(co, acb);
016f5cf6 4972
0b5a2445 4973 bdrv_co_maybe_schedule_bh(acb);
016f5cf6
AG
4974 return &acb->common;
4975}
4976
4265d620
PB
4977static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4978{
7c84b1b8 4979 BlockAIOCBCoroutine *acb = opaque;
4265d620
PB
4980 BlockDriverState *bs = acb->common.bs;
4981
4982 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
0b5a2445 4983 bdrv_co_complete(acb);
4265d620
PB
4984}
4985
7c84b1b8 4986BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4265d620 4987 int64_t sector_num, int nb_sectors,
097310b5 4988 BlockCompletionFunc *cb, void *opaque)
4265d620
PB
4989{
4990 Coroutine *co;
7c84b1b8 4991 BlockAIOCBCoroutine *acb;
4265d620
PB
4992
4993 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4994
d7331bed 4995 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
0b5a2445
PB
4996 acb->need_bh = true;
4997 acb->req.error = -EINPROGRESS;
4265d620
PB
4998 acb->req.sector = sector_num;
4999 acb->req.nb_sectors = nb_sectors;
5000 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
5001 qemu_coroutine_enter(co, acb);
5002
0b5a2445 5003 bdrv_co_maybe_schedule_bh(acb);
4265d620
PB
5004 return &acb->common;
5005}
5006
ea2384d3
FB
5007void bdrv_init(void)
5008{
5efa9d5a 5009 module_call_init(MODULE_INIT_BLOCK);
ea2384d3 5010}
ce1a14dc 5011
eb852011
MA
5012void bdrv_init_with_whitelist(void)
5013{
5014 use_bdrv_whitelist = 1;
5015 bdrv_init();
5016}
5017
d7331bed 5018void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
097310b5 5019 BlockCompletionFunc *cb, void *opaque)
ce1a14dc 5020{
7c84b1b8 5021 BlockAIOCB *acb;
ce1a14dc 5022
d7331bed
SH
5023 acb = g_slice_alloc(aiocb_info->aiocb_size);
5024 acb->aiocb_info = aiocb_info;
ce1a14dc
PB
5025 acb->bs = bs;
5026 acb->cb = cb;
5027 acb->opaque = opaque;
f197fe2b 5028 acb->refcnt = 1;
ce1a14dc
PB
5029 return acb;
5030}
5031
f197fe2b
FZ
5032void qemu_aio_ref(void *p)
5033{
7c84b1b8 5034 BlockAIOCB *acb = p;
f197fe2b
FZ
5035 acb->refcnt++;
5036}
5037
8007429a 5038void qemu_aio_unref(void *p)
ce1a14dc 5039{
7c84b1b8 5040 BlockAIOCB *acb = p;
f197fe2b
FZ
5041 assert(acb->refcnt > 0);
5042 if (--acb->refcnt == 0) {
5043 g_slice_free1(acb->aiocb_info->aiocb_size, acb);
5044 }
ce1a14dc 5045}
19cb3738 5046
f9f05dc5
KW
5047/**************************************************************/
5048/* Coroutine block device emulation */
5049
5050typedef struct CoroutineIOCompletion {
5051 Coroutine *coroutine;
5052 int ret;
5053} CoroutineIOCompletion;
5054
5055static void bdrv_co_io_em_complete(void *opaque, int ret)
5056{
5057 CoroutineIOCompletion *co = opaque;
5058
5059 co->ret = ret;
5060 qemu_coroutine_enter(co->coroutine, NULL);
5061}
5062
5063static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
5064 int nb_sectors, QEMUIOVector *iov,
5065 bool is_write)
5066{
5067 CoroutineIOCompletion co = {
5068 .coroutine = qemu_coroutine_self(),
5069 };
7c84b1b8 5070 BlockAIOCB *acb;
f9f05dc5
KW
5071
5072 if (is_write) {
a652d160
SH
5073 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
5074 bdrv_co_io_em_complete, &co);
f9f05dc5 5075 } else {
a652d160
SH
5076 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
5077 bdrv_co_io_em_complete, &co);
f9f05dc5
KW
5078 }
5079
59370aaa 5080 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
f9f05dc5
KW
5081 if (!acb) {
5082 return -EIO;
5083 }
5084 qemu_coroutine_yield();
5085
5086 return co.ret;
5087}
5088
5089static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
5090 int64_t sector_num, int nb_sectors,
5091 QEMUIOVector *iov)
5092{
5093 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
5094}
5095
5096static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
5097 int64_t sector_num, int nb_sectors,
5098 QEMUIOVector *iov)
5099{
5100 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
5101}
5102
07f07615 5103static void coroutine_fn bdrv_flush_co_entry(void *opaque)
e7a8a783 5104{
07f07615
PB
5105 RwCo *rwco = opaque;
5106
5107 rwco->ret = bdrv_co_flush(rwco->bs);
5108}
5109
5110int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
5111{
eb489bb1
KW
5112 int ret;
5113
29cdb251 5114 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
07f07615 5115 return 0;
eb489bb1
KW
5116 }
5117
ca716364 5118 /* Write back cached data to the OS even with cache=unsafe */
bf736fe3 5119 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
eb489bb1
KW
5120 if (bs->drv->bdrv_co_flush_to_os) {
5121 ret = bs->drv->bdrv_co_flush_to_os(bs);
5122 if (ret < 0) {
5123 return ret;
5124 }
5125 }
5126
ca716364
KW
5127 /* But don't actually force it to the disk with cache=unsafe */
5128 if (bs->open_flags & BDRV_O_NO_FLUSH) {
d4c82329 5129 goto flush_parent;
ca716364
KW
5130 }
5131
bf736fe3 5132 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
eb489bb1 5133 if (bs->drv->bdrv_co_flush_to_disk) {
29cdb251 5134 ret = bs->drv->bdrv_co_flush_to_disk(bs);
07f07615 5135 } else if (bs->drv->bdrv_aio_flush) {
7c84b1b8 5136 BlockAIOCB *acb;
07f07615
PB
5137 CoroutineIOCompletion co = {
5138 .coroutine = qemu_coroutine_self(),
5139 };
5140
5141 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
5142 if (acb == NULL) {
29cdb251 5143 ret = -EIO;
07f07615
PB
5144 } else {
5145 qemu_coroutine_yield();
29cdb251 5146 ret = co.ret;
07f07615 5147 }
07f07615
PB
5148 } else {
5149 /*
5150 * Some block drivers always operate in either writethrough or unsafe
5151 * mode and don't support bdrv_flush therefore. Usually qemu doesn't
5152 * know how the server works (because the behaviour is hardcoded or
5153 * depends on server-side configuration), so we can't ensure that
5154 * everything is safe on disk. Returning an error doesn't work because
5155 * that would break guests even if the server operates in writethrough
5156 * mode.
5157 *
5158 * Let's hope the user knows what he's doing.
5159 */
29cdb251 5160 ret = 0;
07f07615 5161 }
29cdb251
PB
5162 if (ret < 0) {
5163 return ret;
5164 }
5165
5166 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
5167 * in the case of cache=unsafe, so there are no useless flushes.
5168 */
d4c82329 5169flush_parent:
29cdb251 5170 return bdrv_co_flush(bs->file);
07f07615
PB
5171}
5172
5a8a30db 5173void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
0f15423c 5174{
5a8a30db
KW
5175 Error *local_err = NULL;
5176 int ret;
5177
3456a8d1
KW
5178 if (!bs->drv) {
5179 return;
5180 }
5181
7ea2d269
AK
5182 if (!(bs->open_flags & BDRV_O_INCOMING)) {
5183 return;
5184 }
5185 bs->open_flags &= ~BDRV_O_INCOMING;
5186
3456a8d1 5187 if (bs->drv->bdrv_invalidate_cache) {
5a8a30db 5188 bs->drv->bdrv_invalidate_cache(bs, &local_err);
3456a8d1 5189 } else if (bs->file) {
5a8a30db
KW
5190 bdrv_invalidate_cache(bs->file, &local_err);
5191 }
5192 if (local_err) {
5193 error_propagate(errp, local_err);
5194 return;
0f15423c 5195 }
3456a8d1 5196
5a8a30db
KW
5197 ret = refresh_total_sectors(bs, bs->total_sectors);
5198 if (ret < 0) {
5199 error_setg_errno(errp, -ret, "Could not refresh total sector count");
5200 return;
5201 }
0f15423c
AL
5202}
5203
5a8a30db 5204void bdrv_invalidate_cache_all(Error **errp)
0f15423c
AL
5205{
5206 BlockDriverState *bs;
5a8a30db 5207 Error *local_err = NULL;
0f15423c 5208
dc364f4c 5209 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
ed78cda3
SH
5210 AioContext *aio_context = bdrv_get_aio_context(bs);
5211
5212 aio_context_acquire(aio_context);
5a8a30db 5213 bdrv_invalidate_cache(bs, &local_err);
ed78cda3 5214 aio_context_release(aio_context);
5a8a30db
KW
5215 if (local_err) {
5216 error_propagate(errp, local_err);
5217 return;
5218 }
0f15423c
AL
5219 }
5220}
5221
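/*
 * A minimal sketch (names illustrative, not part of the original source)
 * of the whole-graph pattern used by bdrv_flush_all() and
 * bdrv_invalidate_cache_all() above: take each device's AioContext around
 * the per-device work.
 */
static void example_for_each_device(void (*fn)(BlockDriverState *bs))
{
    BlockDriverState *bs = NULL;

    while ((bs = bdrv_next(bs)) != NULL) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        fn(bs);
        aio_context_release(aio_context);
    }
}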
07f07615
PB
5222int bdrv_flush(BlockDriverState *bs)
5223{
5224 Coroutine *co;
5225 RwCo rwco = {
5226 .bs = bs,
5227 .ret = NOT_DONE,
e7a8a783 5228 };
e7a8a783 5229
07f07615
PB
5230 if (qemu_in_coroutine()) {
5231 /* Fast-path if already in coroutine context */
5232 bdrv_flush_co_entry(&rwco);
5233 } else {
2572b37a
SH
5234 AioContext *aio_context = bdrv_get_aio_context(bs);
5235
07f07615
PB
5236 co = qemu_coroutine_create(bdrv_flush_co_entry);
5237 qemu_coroutine_enter(co, &rwco);
5238 while (rwco.ret == NOT_DONE) {
2572b37a 5239 aio_poll(aio_context, true);
07f07615 5240 }
e7a8a783 5241 }
07f07615
PB
5242
5243 return rwco.ret;
e7a8a783
KW
5244}
5245
775aa8b6
KW
5246typedef struct DiscardCo {
5247 BlockDriverState *bs;
5248 int64_t sector_num;
5249 int nb_sectors;
5250 int ret;
5251} DiscardCo;
4265d620
PB
5252static void coroutine_fn bdrv_discard_co_entry(void *opaque)
5253{
775aa8b6 5254 DiscardCo *rwco = opaque;
4265d620
PB
5255
5256 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
5257}

int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    int max_discard, ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }

    ret = bdrv_check_request(bs, sector_num, nb_sectors);
    if (ret < 0) {
        return ret;
    } else if (bs->read_only) {
        return -EROFS;
    }

    bdrv_reset_dirty(bs, sector_num, nb_sectors);

    /* Do nothing if disabled. */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
        return 0;
    }

    max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
    while (nb_sectors > 0) {
        int ret;
        int num = nb_sectors;

        /* align request */
        if (bs->bl.discard_alignment &&
            num >= bs->bl.discard_alignment &&
            sector_num % bs->bl.discard_alignment) {
            if (num > bs->bl.discard_alignment) {
                num = bs->bl.discard_alignment;
            }
            num -= sector_num % bs->bl.discard_alignment;
        }

        /* limit request size */
        if (num > max_discard) {
            num = max_discard;
        }

        if (bs->drv->bdrv_co_discard) {
            ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
        } else {
            BlockAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            /* Submit only the current chunk, not the whole remaining
             * request; sector_num and nb_sectors advance by num below. */
            acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
                                            bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                return -EIO;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret && ret != -ENOTSUP) {
            return ret;
        }

        sector_num += num;
        nb_sectors -= num;
    }
    return 0;
}
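
/*
 * Worked example for the alignment step in bdrv_co_discard() above (the
 * numbers are illustrative, not taken from any particular driver): with
 * bs->bl.discard_alignment == 128, a request of sector_num == 100,
 * nb_sectors == 1000 is split so the first chunk ends on a boundary:
 *
 *     num = 128;           // capped to the alignment
 *     num -= 100 % 128;    // num = 28, so 100 + 28 == 128
 *
 * Subsequent iterations then start aligned and proceed in chunks of at
 * most max_discard sectors.
 */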

int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    DiscardCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }

    return rwco.ret;
}

/**************************************************************/
/* removable device support */

/**
 * Return TRUE if the media is present
 */
int bdrv_is_inserted(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return 0;
    }
    if (!drv->bdrv_is_inserted) {
        return 1;
    }
    return drv->bdrv_is_inserted(bs);
}

/**
 * Return whether the media changed since the last call to this
 * function, or -ENOTSUP if we don't know. Most drivers don't know.
 */
int bdrv_media_changed(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_media_changed) {
        return drv->bdrv_media_changed(bs);
    }
    return -ENOTSUP;
}

/**
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray.
 */
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
{
    BlockDriver *drv = bs->drv;
    const char *device_name;

    if (drv && drv->bdrv_eject) {
        drv->bdrv_eject(bs, eject_flag);
    }

    device_name = bdrv_get_device_name(bs);
    if (device_name[0] != '\0') {
        qapi_event_send_device_tray_moved(device_name,
                                          eject_flag, &error_abort);
    }
}

/**
 * Lock or unlock the media (if it is locked, the user won't be able
 * to eject it manually).
 */
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
{
    BlockDriver *drv = bs->drv;

    trace_bdrv_lock_medium(bs, locked);

    if (drv && drv->bdrv_lock_medium) {
        drv->bdrv_lock_medium(bs, locked);
    }
}

/* needed for generic scsi interface */

int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_ioctl) {
        return drv->bdrv_ioctl(bs, req, buf);
    }
    return -ENOTSUP;
}

BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
                           unsigned long int req, void *buf,
                           BlockCompletionFunc *cb, void *opaque)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_aio_ioctl) {
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    }
    return NULL;
}

void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
{
    bs->guest_block_size = align;
}

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}

void *qemu_blockalign0(BlockDriverState *bs, size_t size)
{
    return memset(qemu_blockalign(bs, size), 0, size);
}

void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
{
    size_t align = bdrv_opt_mem_align(bs);

    /* Ensure that NULL is never returned on success */
    assert(align > 0);
    if (size == 0) {
        size = align;
    }

    return qemu_try_memalign(align, size);
}

void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
{
    void *mem = qemu_try_blockalign(bs, size);

    if (mem) {
        memset(mem, 0, size);
    }

    return mem;
}
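
/*
 * A minimal sketch of how a caller might use qemu_try_blockalign() for a
 * bounce buffer (the buffer size is hypothetical, not part of this file);
 * unlike qemu_blockalign(), it returns NULL on allocation failure instead
 * of aborting:
 *
 *     void *buf = qemu_try_blockalign(bs, 64 * 1024);
 *     if (buf == NULL) {
 *         return -ENOMEM;
 *     }
 *     // ... perform I/O on buf ...
 *     qemu_vfree(buf);
 */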

/*
 * Check if all memory in this vector is sector aligned.
 */
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
    int i;
    size_t alignment = bdrv_opt_mem_align(bs);

    for (i = 0; i < qiov->niov; i++) {
        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
            return false;
        }
        if (qiov->iov[i].iov_len % alignment) {
            return false;
        }
    }

    return true;
}

BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, const char *name)
{
    BdrvDirtyBitmap *bm;

    assert(name);
    QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
        if (bm->name && !strcmp(name, bm->name)) {
            return bm;
        }
    }
    return NULL;
}

void bdrv_dirty_bitmap_make_anon(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    g_free(bitmap->name);
    bitmap->name = NULL;
}

BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs,
                                          int granularity,
                                          const char *name,
                                          Error **errp)
{
    int64_t bitmap_size;
    BdrvDirtyBitmap *bitmap;

    assert((granularity & (granularity - 1)) == 0);

    if (name && bdrv_find_dirty_bitmap(bs, name)) {
        error_setg(errp, "Bitmap already exists: %s", name);
        return NULL;
    }
    granularity >>= BDRV_SECTOR_BITS;
    assert(granularity);
    bitmap_size = bdrv_nb_sectors(bs);
    if (bitmap_size < 0) {
        error_setg_errno(errp, -bitmap_size, "could not get length of device");
        errno = -bitmap_size;
        return NULL;
    }
    bitmap = g_new0(BdrvDirtyBitmap, 1);
    bitmap->bitmap = hbitmap_alloc(bitmap_size, ctz32(granularity));
    bitmap->name = g_strdup(name);
    QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
    return bitmap;
}
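
/*
 * A minimal usage sketch (the bitmap name and granularity are hypothetical,
 * not part of this file): create a named bitmap that tracks writes at 64 KiB
 * granularity, then drop it again. The granularity is given in bytes and
 * must be a power of two no smaller than BDRV_SECTOR_SIZE.
 *
 *     Error *err = NULL;
 *     BdrvDirtyBitmap *bm = bdrv_create_dirty_bitmap(bs, 64 * 1024,
 *                                                    "backup0", &err);
 *     if (bm == NULL) {
 *         error_report_err(err);
 *     } else {
 *         // ... track writes ...
 *         bdrv_release_dirty_bitmap(bs, bm);
 *     }
 */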

void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    BdrvDirtyBitmap *bm, *next;
    QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
        if (bm == bitmap) {
            QLIST_REMOVE(bitmap, list);
            hbitmap_free(bitmap->bitmap);
            g_free(bitmap->name);
            g_free(bitmap);
            return;
        }
    }
}

BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
{
    BdrvDirtyBitmap *bm;
    BlockDirtyInfoList *list = NULL;
    BlockDirtyInfoList **plist = &list;

    QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
        BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1);
        BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1);
        info->count = bdrv_get_dirty_count(bs, bm);
        info->granularity =
            ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
        info->has_name = !!bm->name;
        info->name = g_strdup(bm->name);
        entry->value = info;
        *plist = entry;
        plist = &entry->next;
    }

    return list;
}

int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
{
    if (bitmap) {
        return hbitmap_get(bitmap->bitmap, sector);
    } else {
        return 0;
    }
}

void bdrv_dirty_iter_init(BlockDriverState *bs,
                          BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
{
    hbitmap_iter_init(hbi, bitmap->bitmap, 0);
}

void bdrv_set_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap,
                           int64_t cur_sector, int nr_sectors)
{
    hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
}

void bdrv_reset_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap,
                             int64_t cur_sector, int nr_sectors)
{
    hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
}

static void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                           int nr_sectors)
{
    BdrvDirtyBitmap *bitmap;
    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
        hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
    }
}

static void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                             int nr_sectors)
{
    BdrvDirtyBitmap *bitmap;
    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
        hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
    }
}

int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    return hbitmap_count(bitmap->bitmap);
}
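
/*
 * A minimal sketch of walking a dirty bitmap with the iterator API (the
 * loop body is hypothetical, not part of this file); hbitmap_iter_next()
 * yields the next dirty sector and returns a negative value when the
 * bitmap is exhausted:
 *
 *     HBitmapIter hbi;
 *     int64_t sector;
 *
 *     bdrv_dirty_iter_init(bs, bitmap, &hbi);
 *     while ((sector = hbitmap_iter_next(&hbi)) >= 0) {
 *         // ... copy out the data at this sector ...
 *     }
 */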

/* Get a reference to bs */
void bdrv_ref(BlockDriverState *bs)
{
    bs->refcnt++;
}

/* Release a previously grabbed reference to bs.
 * If, after releasing, the reference count is zero, the BlockDriverState
 * is deleted. */
void bdrv_unref(BlockDriverState *bs)
{
    if (!bs) {
        return;
    }
    assert(bs->refcnt > 0);
    if (--bs->refcnt == 0) {
        bdrv_delete(bs);
    }
}

struct BdrvOpBlocker {
    Error *reason;
    QLIST_ENTRY(BdrvOpBlocker) list;
};

bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
{
    BdrvOpBlocker *blocker;
    assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
    if (!QLIST_EMPTY(&bs->op_blockers[op])) {
        blocker = QLIST_FIRST(&bs->op_blockers[op]);
        if (errp) {
            error_setg(errp, "Node '%s' is busy: %s",
                       bdrv_get_device_or_node_name(bs),
                       error_get_pretty(blocker->reason));
        }
        return true;
    }
    return false;
}

void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
{
    BdrvOpBlocker *blocker;
    assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);

    blocker = g_new0(BdrvOpBlocker, 1);
    blocker->reason = reason;
    QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
}

void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
{
    BdrvOpBlocker *blocker, *next;
    assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
    QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
        if (blocker->reason == reason) {
            QLIST_REMOVE(blocker, list);
            g_free(blocker);
        }
    }
}

void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
{
    int i;
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        bdrv_op_block(bs, i, reason);
    }
}

void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
{
    int i;
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        bdrv_op_unblock(bs, i, reason);
    }
}

bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
{
    int i;

    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        if (!QLIST_EMPTY(&bs->op_blockers[i])) {
            return false;
        }
    }
    return true;
}
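
/*
 * A minimal usage sketch of the op blocker API (the blocker message is
 * hypothetical, not part of this file): a long-running operation blocks
 * conflicting operations up front and unblocks them with the same Error
 * pointer when it finishes, since the pointer identity is what
 * bdrv_op_unblock() matches on.
 *
 *     Error *blocker = NULL;
 *
 *     error_setg(&blocker, "Node is in use by an example job");
 *     bdrv_op_block_all(bs, blocker);
 *     // ... perform the long-running operation ...
 *     bdrv_op_unblock_all(bs, blocker);
 *     error_free(blocker);
 */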

void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}

/* The I/O status is only enabled if the drive explicitly
 * enables it _and_ the VM is configured to stop on errors */
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
{
    return (bs->iostatus_enabled &&
           (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
            bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
            bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
}

void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}

void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (bdrv_iostatus_is_enabled(bs)) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
        if (bs->job) {
            block_job_iostatus_reset(bs->job);
        }
    }
}

void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    assert(bdrv_iostatus_is_enabled(bs));
    if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
                                         BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}

void bdrv_img_create(const char *filename, const char *fmt,
                     const char *base_filename, const char *base_fmt,
                     char *options, uint64_t img_size, int flags,
                     Error **errp, bool quiet)
{
    QemuOptsList *create_opts = NULL;
    QemuOpts *opts = NULL;
    const char *backing_fmt, *backing_file;
    int64_t size;
    BlockDriver *drv, *proto_drv;
    BlockDriver *backing_drv = NULL;
    Error *local_err = NULL;
    int ret = 0;

    /* Find driver and parse its options */
    drv = bdrv_find_format(fmt);
    if (!drv) {
        error_setg(errp, "Unknown file format '%s'", fmt);
        return;
    }

    proto_drv = bdrv_find_protocol(filename, true, errp);
    if (!proto_drv) {
        return;
    }

    if (!drv->create_opts) {
        error_setg(errp, "Format driver '%s' does not support image creation",
                   drv->format_name);
        return;
    }

    if (!proto_drv->create_opts) {
        error_setg(errp, "Protocol driver '%s' does not support image creation",
                   proto_drv->format_name);
        return;
    }

    create_opts = qemu_opts_append(create_opts, drv->create_opts);
    create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);

    /* Create parameter list with default values */
    opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
    qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size, &error_abort);

    /* Parse -o options */
    if (options) {
        qemu_opts_do_parse(opts, options, NULL, &local_err);
        if (local_err) {
            error_report_err(local_err);
            local_err = NULL;
            error_setg(errp, "Invalid options for file format '%s'", fmt);
            goto out;
        }
    }

    if (base_filename) {
        qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename, &local_err);
        if (local_err) {
            error_setg(errp, "Backing file not supported for file format '%s'",
                       fmt);
            goto out;
        }
    }

    if (base_fmt) {
        qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt, &local_err);
        if (local_err) {
            error_setg(errp, "Backing file format not supported for file "
                       "format '%s'", fmt);
            goto out;
        }
    }

    backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
    if (backing_file) {
        if (!strcmp(filename, backing_file)) {
            error_setg(errp, "Trying to create an image with the "
                       "same filename as the backing file");
            goto out;
        }
    }

    backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
    if (backing_fmt) {
        backing_drv = bdrv_find_format(backing_fmt);
        if (!backing_drv) {
            error_setg(errp, "Unknown backing file format '%s'",
                       backing_fmt);
            goto out;
        }
    }

    /* The size for the image must always be specified, with one exception:
     * if we are using a backing file, we can obtain the size from there */
    size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
    if (size == -1) {
        if (backing_file) {
            BlockDriverState *bs;
            char *full_backing = g_new0(char, PATH_MAX);
            int64_t size;
            int back_flags;

            bdrv_get_full_backing_filename_from_filename(filename, backing_file,
                                                         full_backing, PATH_MAX,
                                                         &local_err);
            if (local_err) {
                g_free(full_backing);
                goto out;
            }

            /* backing files always opened read-only */
            back_flags =
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

            bs = NULL;
            ret = bdrv_open(&bs, full_backing, NULL, NULL, back_flags,
                            backing_drv, &local_err);
            g_free(full_backing);
            if (ret < 0) {
                goto out;
            }
            size = bdrv_getlength(bs);
            if (size < 0) {
                error_setg_errno(errp, -size, "Could not get size of '%s'",
                                 backing_file);
                bdrv_unref(bs);
                goto out;
            }

            qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size, &error_abort);

            bdrv_unref(bs);
        } else {
            error_setg(errp, "Image creation needs a size parameter");
            goto out;
        }
    }

    if (!quiet) {
        printf("Formatting '%s', fmt=%s", filename, fmt);
        qemu_opts_print(opts, " ");
        puts("");
    }

    ret = bdrv_create(drv, filename, opts, &local_err);

    if (ret == -EFBIG) {
        /* This is generally a better message than whatever the driver would
         * deliver (especially because of the cluster_size_hint), since that
         * is most probably not much different from "image too large". */
        const char *cluster_size_hint = "";
        if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
            cluster_size_hint = " (try using a larger cluster size)";
        }
        error_setg(errp, "The image size is too large for file format '%s'"
                   "%s", fmt, cluster_size_hint);
        error_free(local_err);
        local_err = NULL;
    }

out:
    qemu_opts_del(opts);
    qemu_opts_free(create_opts);
    if (local_err) {
        error_propagate(errp, local_err);
    }
}
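
/*
 * A minimal usage sketch (the filename and size are hypothetical, not part
 * of this file): create a 1 GiB qcow2 image, roughly the equivalent of
 * "qemu-img create -f qcow2 test.qcow2 1G".
 *
 *     Error *err = NULL;
 *
 *     bdrv_img_create("test.qcow2", "qcow2", NULL, NULL, NULL,
 *                     1 * 1024 * 1024 * 1024, BDRV_O_RDWR, &err, true);
 *     if (err) {
 *         error_report_err(err);
 *     }
 */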

AioContext *bdrv_get_aio_context(BlockDriverState *bs)
{
    return bs->aio_context;
}

void bdrv_detach_aio_context(BlockDriverState *bs)
{
    BdrvAioNotifier *baf;

    if (!bs->drv) {
        return;
    }

    QLIST_FOREACH(baf, &bs->aio_notifiers, list) {
        baf->detach_aio_context(baf->opaque);
    }

    if (bs->io_limits_enabled) {
        throttle_detach_aio_context(&bs->throttle_state);
    }
    if (bs->drv->bdrv_detach_aio_context) {
        bs->drv->bdrv_detach_aio_context(bs);
    }
    if (bs->file) {
        bdrv_detach_aio_context(bs->file);
    }
    if (bs->backing_hd) {
        bdrv_detach_aio_context(bs->backing_hd);
    }

    bs->aio_context = NULL;
}

void bdrv_attach_aio_context(BlockDriverState *bs,
                             AioContext *new_context)
{
    BdrvAioNotifier *ban;

    if (!bs->drv) {
        return;
    }

    bs->aio_context = new_context;

    if (bs->backing_hd) {
        bdrv_attach_aio_context(bs->backing_hd, new_context);
    }
    if (bs->file) {
        bdrv_attach_aio_context(bs->file, new_context);
    }
    if (bs->drv->bdrv_attach_aio_context) {
        bs->drv->bdrv_attach_aio_context(bs, new_context);
    }
    if (bs->io_limits_enabled) {
        throttle_attach_aio_context(&bs->throttle_state, new_context);
    }

    QLIST_FOREACH(ban, &bs->aio_notifiers, list) {
        ban->attached_aio_context(new_context, ban->opaque);
    }
}

void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
{
    bdrv_drain_all(); /* ensure there are no in-flight requests */

    bdrv_detach_aio_context(bs);

    /* This function executes in the old AioContext so acquire the new one in
     * case it runs in a different thread.
     */
    aio_context_acquire(new_context);
    bdrv_attach_aio_context(bs, new_context);
    aio_context_release(new_context);
}
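
/*
 * A minimal sketch of handing a BDS over to an IOThread (the iothread
 * variable is hypothetical, not part of this file); after the call, I/O
 * completions for bs run in that thread's AioContext:
 *
 *     AioContext *ctx = iothread_get_aio_context(iothread);
 *
 *     bdrv_set_aio_context(bs, ctx);
 */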

void bdrv_add_aio_context_notifier(BlockDriverState *bs,
        void (*attached_aio_context)(AioContext *new_context, void *opaque),
        void (*detach_aio_context)(void *opaque), void *opaque)
{
    BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
    *ban = (BdrvAioNotifier){
        .attached_aio_context = attached_aio_context,
        .detach_aio_context   = detach_aio_context,
        .opaque               = opaque
    };

    QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
}

void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
                                      void (*attached_aio_context)(AioContext *,
                                                                   void *),
                                      void (*detach_aio_context)(void *),
                                      void *opaque)
{
    BdrvAioNotifier *ban, *ban_next;

    QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
        if (ban->attached_aio_context == attached_aio_context &&
            ban->detach_aio_context   == detach_aio_context   &&
            ban->opaque               == opaque)
        {
            QLIST_REMOVE(ban, list);
            g_free(ban);

            return;
        }
    }

    abort();
}
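
/*
 * A minimal sketch of the notifier pair (the callbacks and dev pointer are
 * hypothetical, not part of this file): a device model that caches the
 * AioContext registers both hooks, and must later remove them with the
 * exact same triple, since bdrv_remove_aio_context_notifier() aborts on an
 * unknown entry.
 *
 *     static void dev_attached(AioContext *ctx, void *opaque) { ... }
 *     static void dev_detach(void *opaque) { ... }
 *
 *     bdrv_add_aio_context_notifier(bs, dev_attached, dev_detach, dev);
 *     // ... later, before tearing the device down:
 *     bdrv_remove_aio_context_notifier(bs, dev_attached, dev_detach, dev);
 */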

void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}

int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts,
                       BlockDriverAmendStatusCB *status_cb)
{
    if (!bs->drv->bdrv_amend_options) {
        return -ENOTSUP;
    }
    return bs->drv->bdrv_amend_options(bs, opts, status_cb);
}

/* This function will be called by the bdrv_recurse_is_first_non_filter method
 * of block filters and by bdrv_is_first_non_filter.
 * It is used to test if the given bs is the candidate or to recurse further
 * down the node graph.
 */
bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
                                      BlockDriverState *candidate)
{
    /* return false if basic checks fail */
    if (!bs || !bs->drv) {
        return false;
    }

    /* the code reached a non block filter driver -> check if the bs is
     * the same as the candidate. It's the recursion termination condition.
     */
    if (!bs->drv->is_filter) {
        return bs == candidate;
    }
    /* Down this path the driver is a block filter driver */

    /* If the block filter recursion method is defined, use it to recurse down
     * the node graph.
     */
    if (bs->drv->bdrv_recurse_is_first_non_filter) {
        return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
    }

    /* the driver is a block filter but does not allow recursion -> return
     * false
     */
    return false;
}

/* This function checks if the candidate is the first non-filter bs down its
 * bs chain. Since we don't have pointers to parents, it explores all bs
 * chains from the top. Some filters can choose not to pass down the
 * recursion.
 */
bool bdrv_is_first_non_filter(BlockDriverState *candidate)
{
    BlockDriverState *bs;

    /* walk down the bs forest recursively */
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        bool perm;

        /* try to recurse in this top level bs */
        perm = bdrv_recurse_is_first_non_filter(bs, candidate);

        /* candidate is the first non filter */
        if (perm) {
            return true;
        }
    }

    return false;
}

BlockDriverState *check_to_replace_node(const char *node_name, Error **errp)
{
    BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
    AioContext *aio_context;

    if (!to_replace_bs) {
        error_setg(errp, "Node name '%s' not found", node_name);
        return NULL;
    }

    aio_context = bdrv_get_aio_context(to_replace_bs);
    aio_context_acquire(aio_context);

    if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
        to_replace_bs = NULL;
        goto out;
    }

    /* We don't want an arbitrary node of the BDS chain to be replaced, only
     * the topmost non-filter, in order to prevent data corruption.
     * Another benefit is that this test excludes backing files, which are
     * blocked by the backing blockers.
     */
    if (!bdrv_is_first_non_filter(to_replace_bs)) {
        error_setg(errp, "Only top most non filter can be replaced");
        to_replace_bs = NULL;
        goto out;
    }

out:
    aio_context_release(aio_context);
    return to_replace_bs;
}

void bdrv_io_plug(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_io_plug) {
        drv->bdrv_io_plug(bs);
    } else if (bs->file) {
        bdrv_io_plug(bs->file);
    }
}

void bdrv_io_unplug(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_io_unplug) {
        drv->bdrv_io_unplug(bs);
    } else if (bs->file) {
        bdrv_io_unplug(bs->file);
    }
}

void bdrv_flush_io_queue(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_flush_io_queue) {
        drv->bdrv_flush_io_queue(bs);
    } else if (bs->file) {
        bdrv_flush_io_queue(bs->file);
    }
}
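
/*
 * A minimal sketch of request batching with plug/unplug (the request loop
 * is hypothetical, not part of this file): a backend that implements the
 * hooks may coalesce the requests queued between the two calls into fewer
 * submissions.
 *
 *     bdrv_io_plug(bs);
 *     // ... queue several bdrv_aio_readv()/bdrv_aio_writev() requests ...
 *     bdrv_io_unplug(bs);
 */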

static bool append_open_options(QDict *d, BlockDriverState *bs)
{
    const QDictEntry *entry;
    bool found_any = false;

    for (entry = qdict_first(bs->options); entry;
         entry = qdict_next(bs->options, entry))
    {
        /* Only take options for this level and exclude all non-driver-specific
         * options */
        if (!strchr(qdict_entry_key(entry), '.') &&
            strcmp(qdict_entry_key(entry), "node-name"))
        {
            qobject_incref(qdict_entry_value(entry));
            qdict_put_obj(d, qdict_entry_key(entry), qdict_entry_value(entry));
            found_any = true;
        }
    }

    return found_any;
}

/* Updates the following BDS fields:
 *  - exact_filename: A filename which may be used for opening a block device
 *                    which (mostly) equals the given BDS (even without any
 *                    other options; so reading and writing must return the
 *                    same results, but caching etc. may be different)
 *  - full_open_options: Options which, when given when opening a block device
 *                       (without a filename), result in a BDS (mostly)
 *                       equalling the given one
 *  - filename: If exact_filename is set, it is copied here. Otherwise,
 *              full_open_options is converted to a JSON object, prefixed with
 *              "json:" (for use through the JSON pseudo protocol) and put
 *              here.
 */
void bdrv_refresh_filename(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    QDict *opts;

    if (!drv) {
        return;
    }

    /* This BDS's file name will most probably depend on its file's name, so
     * refresh that first */
    if (bs->file) {
        bdrv_refresh_filename(bs->file);
    }

    if (drv->bdrv_refresh_filename) {
        /* Obsolete information is of no use here, so drop the old file name
         * information before refreshing it */
        bs->exact_filename[0] = '\0';
        if (bs->full_open_options) {
            QDECREF(bs->full_open_options);
            bs->full_open_options = NULL;
        }

        drv->bdrv_refresh_filename(bs);
    } else if (bs->file) {
        /* Try to reconstruct valid information from the underlying file */
        bool has_open_options;

        bs->exact_filename[0] = '\0';
        if (bs->full_open_options) {
            QDECREF(bs->full_open_options);
            bs->full_open_options = NULL;
        }

        opts = qdict_new();
        has_open_options = append_open_options(opts, bs);

        /* If no specific options have been given for this BDS, the filename of
         * the underlying file should suffice for this one as well */
        if (bs->file->exact_filename[0] && !has_open_options) {
            strcpy(bs->exact_filename, bs->file->exact_filename);
        }
        /* Reconstructing the full options QDict is simple for most format
         * block drivers, as long as the full options are known for the
         * underlying file BDS. The full options QDict of that file BDS should
         * somehow contain a representation of the filename, therefore the
         * following suffices without querying the (exact_)filename of this
         * BDS. */
        if (bs->file->full_open_options) {
            qdict_put_obj(opts, "driver",
                          QOBJECT(qstring_from_str(drv->format_name)));
            QINCREF(bs->file->full_open_options);
            qdict_put_obj(opts, "file", QOBJECT(bs->file->full_open_options));

            bs->full_open_options = opts;
        } else {
            QDECREF(opts);
        }
    } else if (!bs->full_open_options && qdict_size(bs->options)) {
        /* There is no underlying file BDS (at least referenced by BDS.file),
         * so the full options QDict should be equal to the options given
         * specifically for this block device when it was opened (plus the
         * driver specification).
         * Because those options don't change, there is no need to update
         * full_open_options when it's already set. */

        opts = qdict_new();
        append_open_options(opts, bs);
        qdict_put_obj(opts, "driver",
                      QOBJECT(qstring_from_str(drv->format_name)));

        if (bs->exact_filename[0]) {
            /* This may not work for all block protocol drivers (some may
             * require this filename to be parsed), but we have to find some
             * default solution here, so just include it. If some block driver
             * does not support pure options without any filename at all or
             * needs some special format of the options QDict, it needs to
             * implement the driver-specific bdrv_refresh_filename() function.
             */
            qdict_put_obj(opts, "filename",
                          QOBJECT(qstring_from_str(bs->exact_filename)));
        }

        bs->full_open_options = opts;
    }

    if (bs->exact_filename[0]) {
        pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
    } else if (bs->full_open_options) {
        QString *json = qobject_to_json(QOBJECT(bs->full_open_options));
        snprintf(bs->filename, sizeof(bs->filename), "json:%s",
                 qstring_get_str(json));
        QDECREF(json);
    }
}
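
/*
 * Illustration (the path and layout are hypothetical, not produced by this
 * file): for a node opened with options that cannot be expressed as a plain
 * filename, the fallback above sets bs->filename to something of the form
 *
 *     json:{"driver": "qcow2",
 *           "file": {"driver": "file",
 *                    "filename": "/path/to/image.qcow2"}}
 *
 * which bdrv_open() can parse again through the JSON pseudo-protocol.
 */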

/* The purpose of this accessor function is to allow device models to access
 * the BlockAcctStats structure embedded inside a BlockDriverState without
 * being aware of the BlockDriverState structure layout.
 * It will go away when the BlockAcctStats structure is moved inside the
 * device models.
 */
BlockAcctStats *bdrv_get_stats(BlockDriverState *bs)
{
    return &bs->stats;
}