]> git.proxmox.com Git - mirror_qemu.git/blame - block.c
block: Connect BlockBackend to BlockDriverState
[mirror_qemu.git] / block.c
CommitLineData
fc01f7e7
FB
1/*
2 * QEMU System Emulator block driver
5fafdf24 3 *
fc01f7e7 4 * Copyright (c) 2003 Fabrice Bellard
5fafdf24 5 *
fc01f7e7
FB
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
3990d09a 24#include "config-host.h"
faf07963 25#include "qemu-common.h"
6d519a5f 26#include "trace.h"
737e150e
PB
27#include "block/block_int.h"
28#include "block/blockjob.h"
1de7afc9 29#include "qemu/module.h"
7b1b5d19 30#include "qapi/qmp/qjson.h"
9c17d615 31#include "sysemu/sysemu.h"
3ae59580 32#include "sysemu/blockdev.h" /* FIXME layering violation */
1de7afc9 33#include "qemu/notify.h"
737e150e 34#include "block/coroutine.h"
c13163fb 35#include "block/qapi.h"
b2023818 36#include "qmp-commands.h"
1de7afc9 37#include "qemu/timer.h"
a5ee7bd4 38#include "qapi-event.h"
fc01f7e7 39
71e72a19 40#ifdef CONFIG_BSD
7674e7bf
FB
41#include <sys/types.h>
42#include <sys/stat.h>
43#include <sys/ioctl.h>
72cf2d4f 44#include <sys/queue.h>
c5e97233 45#ifndef __DragonFly__
7674e7bf
FB
46#include <sys/disk.h>
47#endif
c5e97233 48#endif
7674e7bf 49
49dc768d
AL
50#ifdef _WIN32
51#include <windows.h>
52#endif
53
e4654d2d
FZ
54struct BdrvDirtyBitmap {
55 HBitmap *bitmap;
56 QLIST_ENTRY(BdrvDirtyBitmap) list;
57};
58
1c9805a3
SH
59#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
60
2a87151f
SH
61#define COROUTINE_POOL_RESERVATION 64 /* number of coroutines to reserve */
62
7d4b4ba5 63static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
f141eafe
AL
64static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
65 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
c87c0672 66 BlockDriverCompletionFunc *cb, void *opaque);
f141eafe
AL
67static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
68 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
ce1a14dc 69 BlockDriverCompletionFunc *cb, void *opaque);
f9f05dc5
KW
70static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
71 int64_t sector_num, int nb_sectors,
72 QEMUIOVector *iov);
73static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
74 int64_t sector_num, int nb_sectors,
75 QEMUIOVector *iov);
775aa8b6
KW
76static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
77 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
470c0504 78 BdrvRequestFlags flags);
775aa8b6
KW
79static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
80 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
f08f2dda 81 BdrvRequestFlags flags);
b2a61371
SH
82static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
83 int64_t sector_num,
84 QEMUIOVector *qiov,
85 int nb_sectors,
d20d9b7c 86 BdrvRequestFlags flags,
b2a61371
SH
87 BlockDriverCompletionFunc *cb,
88 void *opaque,
8c5873d6 89 bool is_write);
b2a61371 90static void coroutine_fn bdrv_co_do_rw(void *opaque);
621f0589 91static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
aa7bfbff 92 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
ec530c81 93
1b7bdbc1
SH
94static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
95 QTAILQ_HEAD_INITIALIZER(bdrv_states);
7ee930d0 96
dc364f4c
BC
97static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
98 QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);
99
8a22f02a
SH
100static QLIST_HEAD(, BlockDriver) bdrv_drivers =
101 QLIST_HEAD_INITIALIZER(bdrv_drivers);
ea2384d3 102
eb852011
MA
103/* If non-zero, use only whitelisted block drivers */
104static int use_bdrv_whitelist;
105
9e0b22f4
SH
#ifdef _WIN32
/* True if @filename begins with a drive letter followed by ':'. */
static int is_windows_drive_prefix(const char *filename)
{
    char first = filename[0];

    return ((first >= 'a' && first <= 'z') ||
            (first >= 'A' && first <= 'Z')) &&
           filename[1] == ':';
}

/* True if @filename names a whole Windows drive ("c:", "\\.\...", "//./..."). */
int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) && filename[2] == '\0') {
        return 1;
    }
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL)) {
        return 1;
    }
    return 0;
}
#endif
125
0563e191 126/* throttling disk I/O limits */
cc0681c4
BC
127void bdrv_set_io_limits(BlockDriverState *bs,
128 ThrottleConfig *cfg)
98f90dba 129{
cc0681c4 130 int i;
98f90dba 131
cc0681c4 132 throttle_config(&bs->throttle_state, cfg);
98f90dba 133
cc0681c4
BC
134 for (i = 0; i < 2; i++) {
135 qemu_co_enter_next(&bs->throttled_reqs[i]);
98f90dba 136 }
cc0681c4
BC
137}
138
139/* this function drain all the throttled IOs */
140static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
141{
142 bool drained = false;
143 bool enabled = bs->io_limits_enabled;
144 int i;
145
146 bs->io_limits_enabled = false;
147
148 for (i = 0; i < 2; i++) {
149 while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
150 drained = true;
151 }
152 }
153
154 bs->io_limits_enabled = enabled;
98f90dba 155
cc0681c4 156 return drained;
98f90dba
ZYW
157}
158
cc0681c4 159void bdrv_io_limits_disable(BlockDriverState *bs)
0563e191 160{
cc0681c4 161 bs->io_limits_enabled = false;
0563e191 162
cc0681c4
BC
163 bdrv_start_throttled_reqs(bs);
164
165 throttle_destroy(&bs->throttle_state);
0563e191
ZYW
166}
167
cc0681c4 168static void bdrv_throttle_read_timer_cb(void *opaque)
0563e191 169{
cc0681c4
BC
170 BlockDriverState *bs = opaque;
171 qemu_co_enter_next(&bs->throttled_reqs[0]);
0563e191
ZYW
172}
173
cc0681c4 174static void bdrv_throttle_write_timer_cb(void *opaque)
0563e191 175{
cc0681c4
BC
176 BlockDriverState *bs = opaque;
177 qemu_co_enter_next(&bs->throttled_reqs[1]);
0563e191
ZYW
178}
179
cc0681c4
BC
180/* should be called before bdrv_set_io_limits if a limit is set */
181void bdrv_io_limits_enable(BlockDriverState *bs)
182{
183 assert(!bs->io_limits_enabled);
184 throttle_init(&bs->throttle_state,
13af91eb 185 bdrv_get_aio_context(bs),
cc0681c4
BC
186 QEMU_CLOCK_VIRTUAL,
187 bdrv_throttle_read_timer_cb,
188 bdrv_throttle_write_timer_cb,
189 bs);
190 bs->io_limits_enabled = true;
191}
192
193/* This function makes an IO wait if needed
194 *
195 * @nb_sectors: the number of sectors of the IO
196 * @is_write: is the IO a write
197 */
98f90dba 198static void bdrv_io_limits_intercept(BlockDriverState *bs,
d5103588 199 unsigned int bytes,
cc0681c4 200 bool is_write)
98f90dba 201{
cc0681c4
BC
202 /* does this io must wait */
203 bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
98f90dba 204
cc0681c4
BC
205 /* if must wait or any request of this type throttled queue the IO */
206 if (must_wait ||
207 !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
208 qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
98f90dba
ZYW
209 }
210
cc0681c4 211 /* the IO will be executed, do the accounting */
d5103588
KW
212 throttle_account(&bs->throttle_state, is_write, bytes);
213
98f90dba 214
cc0681c4
BC
215 /* if the next request must wait -> do nothing */
216 if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
217 return;
98f90dba
ZYW
218 }
219
cc0681c4
BC
220 /* else queue next request for execution */
221 qemu_co_queue_next(&bs->throttled_reqs[is_write]);
98f90dba
ZYW
222}
223
339064d5
KW
224size_t bdrv_opt_mem_align(BlockDriverState *bs)
225{
226 if (!bs || !bs->drv) {
227 /* 4k should be on the safe side */
228 return 4096;
229 }
230
231 return bs->bl.opt_mem_alignment;
232}
233
9e0b22f4
SH
/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *sep;

#ifdef _WIN32
    /* A Windows drive letter ("c:") is not a protocol prefix. */
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    sep = path + strcspn(path, ":/\\");
#else
    sep = path + strcspn(path, ":/");
#endif

    /* A protocol prefix exists iff ':' occurs before any path separator. */
    return *sep == ':';
}
251
/* Report whether @path names an absolute location (host-specific rules). */
int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return path[0] == '/' || path[0] == '\\';
#else
    return path[0] == '/';
#endif
}
264
83f64091
FB
/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URL are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0) {
        return;
    }
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        /* p points just past a "protocol:" prefix of base_path, if any */
        p = strchr(base_path, ':');
        if (p) {
            p++;
        } else {
            p = base_path;
        }
        /* p1 points just past the last directory separator, if any */
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1) {
                p1 = p2;
            }
        }
#endif
        if (p1) {
            p1++;
        } else {
            p1 = base_path;
        }
        if (p1 > p) {
            p = p1;
        }
        /* copy the directory part of base_path, truncated to dest_size */
        len = p - base_path;
        if (len > dest_size - 1) {
            len = dest_size - 1;
        }
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
308
dc5a1371
PB
309void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
310{
311 if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
312 pstrcpy(dest, sz, bs->backing_file);
313 } else {
314 path_combine(dest, sz, bs->filename, bs->backing_file);
315 }
316}
317
5efa9d5a 318void bdrv_register(BlockDriver *bdrv)
ea2384d3 319{
8c5873d6
SH
320 /* Block drivers without coroutine functions need emulation */
321 if (!bdrv->bdrv_co_readv) {
f9f05dc5
KW
322 bdrv->bdrv_co_readv = bdrv_co_readv_em;
323 bdrv->bdrv_co_writev = bdrv_co_writev_em;
324
f8c35c1d
SH
325 /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
326 * the block driver lacks aio we need to emulate that too.
327 */
f9f05dc5
KW
328 if (!bdrv->bdrv_aio_readv) {
329 /* add AIO emulation layer */
330 bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
331 bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
f9f05dc5 332 }
83f64091 333 }
b2e12bc6 334
8a22f02a 335 QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
ea2384d3 336}
b338082b
FB
337
338/* create a new block device (by default it is empty) */
e4e9986b 339BlockDriverState *bdrv_new_root(const char *device_name, Error **errp)
b338082b 340{
1b7bdbc1 341 BlockDriverState *bs;
e4e9986b
MA
342
343 assert(*device_name);
b338082b 344
f5bebbbb 345 if (*device_name && !id_wellformed(device_name)) {
9aebf3b8
KW
346 error_setg(errp, "Invalid device name");
347 return NULL;
348 }
349
f2d953ec
KW
350 if (bdrv_find(device_name)) {
351 error_setg(errp, "Device with id '%s' already exists",
352 device_name);
353 return NULL;
354 }
355 if (bdrv_find_node(device_name)) {
d224469d
MA
356 error_setg(errp,
357 "Device name '%s' conflicts with an existing node name",
f2d953ec
KW
358 device_name);
359 return NULL;
360 }
361
e4e9986b
MA
362 bs = bdrv_new();
363
364 pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
365 QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
366
367 return bs;
368}
369
370BlockDriverState *bdrv_new(void)
371{
372 BlockDriverState *bs;
373 int i;
374
5839e53b 375 bs = g_new0(BlockDriverState, 1);
e4654d2d 376 QLIST_INIT(&bs->dirty_bitmaps);
fbe40ff7
FZ
377 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
378 QLIST_INIT(&bs->op_blockers[i]);
379 }
28a7282a 380 bdrv_iostatus_disable(bs);
d7d512f6 381 notifier_list_init(&bs->close_notifiers);
d616b224 382 notifier_with_return_list_init(&bs->before_write_notifiers);
cc0681c4
BC
383 qemu_co_queue_init(&bs->throttled_reqs[0]);
384 qemu_co_queue_init(&bs->throttled_reqs[1]);
9fcb0251 385 bs->refcnt = 1;
dcd04228 386 bs->aio_context = qemu_get_aio_context();
d7d512f6 387
b338082b
FB
388 return bs;
389}
390
d7d512f6
PB
391void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
392{
393 notifier_list_add(&bs->close_notifiers, notify);
394}
395
ea2384d3
FB
396BlockDriver *bdrv_find_format(const char *format_name)
397{
398 BlockDriver *drv1;
8a22f02a
SH
399 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
400 if (!strcmp(drv1->format_name, format_name)) {
ea2384d3 401 return drv1;
8a22f02a 402 }
ea2384d3
FB
403 }
404 return NULL;
405}
406
b64ec4e4 407static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
eb852011 408{
b64ec4e4
FZ
409 static const char *whitelist_rw[] = {
410 CONFIG_BDRV_RW_WHITELIST
411 };
412 static const char *whitelist_ro[] = {
413 CONFIG_BDRV_RO_WHITELIST
eb852011
MA
414 };
415 const char **p;
416
b64ec4e4 417 if (!whitelist_rw[0] && !whitelist_ro[0]) {
eb852011 418 return 1; /* no whitelist, anything goes */
b64ec4e4 419 }
eb852011 420
b64ec4e4 421 for (p = whitelist_rw; *p; p++) {
eb852011
MA
422 if (!strcmp(drv->format_name, *p)) {
423 return 1;
424 }
425 }
b64ec4e4
FZ
426 if (read_only) {
427 for (p = whitelist_ro; *p; p++) {
428 if (!strcmp(drv->format_name, *p)) {
429 return 1;
430 }
431 }
432 }
eb852011
MA
433 return 0;
434}
435
b64ec4e4
FZ
436BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
437 bool read_only)
eb852011
MA
438{
439 BlockDriver *drv = bdrv_find_format(format_name);
b64ec4e4 440 return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
eb852011
MA
441}
442
5b7e1542
ZYW
443typedef struct CreateCo {
444 BlockDriver *drv;
445 char *filename;
83d0521a 446 QemuOpts *opts;
5b7e1542 447 int ret;
cc84d90f 448 Error *err;
5b7e1542
ZYW
449} CreateCo;
450
451static void coroutine_fn bdrv_create_co_entry(void *opaque)
452{
cc84d90f
HR
453 Error *local_err = NULL;
454 int ret;
455
5b7e1542
ZYW
456 CreateCo *cco = opaque;
457 assert(cco->drv);
458
c282e1fd 459 ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err);
84d18f06 460 if (local_err) {
cc84d90f
HR
461 error_propagate(&cco->err, local_err);
462 }
463 cco->ret = ret;
5b7e1542
ZYW
464}
465
0e7e1989 466int bdrv_create(BlockDriver *drv, const char* filename,
83d0521a 467 QemuOpts *opts, Error **errp)
ea2384d3 468{
5b7e1542
ZYW
469 int ret;
470
471 Coroutine *co;
472 CreateCo cco = {
473 .drv = drv,
474 .filename = g_strdup(filename),
83d0521a 475 .opts = opts,
5b7e1542 476 .ret = NOT_DONE,
cc84d90f 477 .err = NULL,
5b7e1542
ZYW
478 };
479
c282e1fd 480 if (!drv->bdrv_create) {
cc84d90f 481 error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
80168bff
LC
482 ret = -ENOTSUP;
483 goto out;
5b7e1542
ZYW
484 }
485
486 if (qemu_in_coroutine()) {
487 /* Fast-path if already in coroutine context */
488 bdrv_create_co_entry(&cco);
489 } else {
490 co = qemu_coroutine_create(bdrv_create_co_entry);
491 qemu_coroutine_enter(co, &cco);
492 while (cco.ret == NOT_DONE) {
b47ec2c4 493 aio_poll(qemu_get_aio_context(), true);
5b7e1542
ZYW
494 }
495 }
496
497 ret = cco.ret;
cc84d90f 498 if (ret < 0) {
84d18f06 499 if (cco.err) {
cc84d90f
HR
500 error_propagate(errp, cco.err);
501 } else {
502 error_setg_errno(errp, -ret, "Could not create image");
503 }
504 }
0e7e1989 505
80168bff
LC
506out:
507 g_free(cco.filename);
5b7e1542 508 return ret;
ea2384d3
FB
509}
510
c282e1fd 511int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
84a12e66
CH
512{
513 BlockDriver *drv;
cc84d90f
HR
514 Error *local_err = NULL;
515 int ret;
84a12e66 516
98289620 517 drv = bdrv_find_protocol(filename, true);
84a12e66 518 if (drv == NULL) {
cc84d90f 519 error_setg(errp, "Could not find protocol for file '%s'", filename);
16905d71 520 return -ENOENT;
84a12e66
CH
521 }
522
c282e1fd 523 ret = bdrv_create(drv, filename, opts, &local_err);
84d18f06 524 if (local_err) {
cc84d90f
HR
525 error_propagate(errp, local_err);
526 }
527 return ret;
84a12e66
CH
528}
529
3baca891 530void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
d34682cd
KW
531{
532 BlockDriver *drv = bs->drv;
3baca891 533 Error *local_err = NULL;
d34682cd
KW
534
535 memset(&bs->bl, 0, sizeof(bs->bl));
536
466ad822 537 if (!drv) {
3baca891 538 return;
466ad822
KW
539 }
540
541 /* Take some limits from the children as a default */
542 if (bs->file) {
3baca891
KW
543 bdrv_refresh_limits(bs->file, &local_err);
544 if (local_err) {
545 error_propagate(errp, local_err);
546 return;
547 }
466ad822 548 bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
339064d5
KW
549 bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
550 } else {
551 bs->bl.opt_mem_alignment = 512;
466ad822
KW
552 }
553
554 if (bs->backing_hd) {
3baca891
KW
555 bdrv_refresh_limits(bs->backing_hd, &local_err);
556 if (local_err) {
557 error_propagate(errp, local_err);
558 return;
559 }
466ad822
KW
560 bs->bl.opt_transfer_length =
561 MAX(bs->bl.opt_transfer_length,
562 bs->backing_hd->bl.opt_transfer_length);
339064d5
KW
563 bs->bl.opt_mem_alignment =
564 MAX(bs->bl.opt_mem_alignment,
565 bs->backing_hd->bl.opt_mem_alignment);
466ad822
KW
566 }
567
568 /* Then let the driver override it */
569 if (drv->bdrv_refresh_limits) {
3baca891 570 drv->bdrv_refresh_limits(bs, errp);
d34682cd 571 }
d34682cd
KW
572}
573
eba25057
JM
574/*
575 * Create a uniquely-named empty temporary file.
576 * Return 0 upon success, otherwise a negative errno value.
577 */
578int get_tmp_filename(char *filename, int size)
d5249393 579{
eba25057 580#ifdef _WIN32
3b9f94e1 581 char temp_dir[MAX_PATH];
eba25057
JM
582 /* GetTempFileName requires that its output buffer (4th param)
583 have length MAX_PATH or greater. */
584 assert(size >= MAX_PATH);
585 return (GetTempPath(MAX_PATH, temp_dir)
586 && GetTempFileName(temp_dir, "qem", 0, filename)
587 ? 0 : -GetLastError());
d5249393 588#else
67b915a5 589 int fd;
7ccfb2eb 590 const char *tmpdir;
0badc1ee 591 tmpdir = getenv("TMPDIR");
69bef793
AS
592 if (!tmpdir) {
593 tmpdir = "/var/tmp";
594 }
eba25057
JM
595 if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
596 return -EOVERFLOW;
597 }
ea2384d3 598 fd = mkstemp(filename);
fe235a06
DH
599 if (fd < 0) {
600 return -errno;
601 }
602 if (close(fd) != 0) {
603 unlink(filename);
eba25057
JM
604 return -errno;
605 }
606 return 0;
d5249393 607#endif
eba25057 608}
fc01f7e7 609
84a12e66
CH
610/*
611 * Detect host devices. By convention, /dev/cdrom[N] is always
612 * recognized as a host CDROM.
613 */
614static BlockDriver *find_hdev_driver(const char *filename)
615{
616 int score_max = 0, score;
617 BlockDriver *drv = NULL, *d;
618
619 QLIST_FOREACH(d, &bdrv_drivers, list) {
620 if (d->bdrv_probe_device) {
621 score = d->bdrv_probe_device(filename);
622 if (score > score_max) {
623 score_max = score;
624 drv = d;
625 }
626 }
627 }
628
629 return drv;
630}
631
98289620
KW
632BlockDriver *bdrv_find_protocol(const char *filename,
633 bool allow_protocol_prefix)
83f64091
FB
634{
635 BlockDriver *drv1;
636 char protocol[128];
1cec71e3 637 int len;
83f64091 638 const char *p;
19cb3738 639
66f82cee
KW
640 /* TODO Drivers without bdrv_file_open must be specified explicitly */
641
39508e7a
CH
642 /*
643 * XXX(hch): we really should not let host device detection
644 * override an explicit protocol specification, but moving this
645 * later breaks access to device names with colons in them.
646 * Thanks to the brain-dead persistent naming schemes on udev-
647 * based Linux systems those actually are quite common.
648 */
649 drv1 = find_hdev_driver(filename);
650 if (drv1) {
651 return drv1;
652 }
653
98289620 654 if (!path_has_protocol(filename) || !allow_protocol_prefix) {
39508e7a 655 return bdrv_find_format("file");
84a12e66 656 }
98289620 657
9e0b22f4
SH
658 p = strchr(filename, ':');
659 assert(p != NULL);
1cec71e3
AL
660 len = p - filename;
661 if (len > sizeof(protocol) - 1)
662 len = sizeof(protocol) - 1;
663 memcpy(protocol, filename, len);
664 protocol[len] = '\0';
8a22f02a 665 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
5fafdf24 666 if (drv1->protocol_name &&
8a22f02a 667 !strcmp(drv1->protocol_name, protocol)) {
83f64091 668 return drv1;
8a22f02a 669 }
83f64091
FB
670 }
671 return NULL;
672}
673
f500a6d3 674static int find_image_format(BlockDriverState *bs, const char *filename,
34b5d2c6 675 BlockDriver **pdrv, Error **errp)
f3a5d3f8 676{
f500a6d3 677 int score, score_max;
f3a5d3f8
CH
678 BlockDriver *drv1, *drv;
679 uint8_t buf[2048];
f500a6d3 680 int ret = 0;
f8ea0b00 681
08a00559 682 /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
8e895599 683 if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
c98ac35d
SW
684 drv = bdrv_find_format("raw");
685 if (!drv) {
34b5d2c6 686 error_setg(errp, "Could not find raw image format");
c98ac35d
SW
687 ret = -ENOENT;
688 }
689 *pdrv = drv;
690 return ret;
1a396859 691 }
f8ea0b00 692
83f64091 693 ret = bdrv_pread(bs, 0, buf, sizeof(buf));
83f64091 694 if (ret < 0) {
34b5d2c6
HR
695 error_setg_errno(errp, -ret, "Could not read image for determining its "
696 "format");
c98ac35d
SW
697 *pdrv = NULL;
698 return ret;
83f64091
FB
699 }
700
ea2384d3 701 score_max = 0;
84a12e66 702 drv = NULL;
8a22f02a 703 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
83f64091
FB
704 if (drv1->bdrv_probe) {
705 score = drv1->bdrv_probe(buf, ret, filename);
706 if (score > score_max) {
707 score_max = score;
708 drv = drv1;
709 }
0849bf08 710 }
fc01f7e7 711 }
c98ac35d 712 if (!drv) {
34b5d2c6
HR
713 error_setg(errp, "Could not determine image format: No compatible "
714 "driver found");
c98ac35d
SW
715 ret = -ENOENT;
716 }
717 *pdrv = drv;
718 return ret;
ea2384d3
FB
719}
720
51762288
SH
721/**
722 * Set the current 'total_sectors' value
65a9bb25 723 * Return 0 on success, -errno on error.
51762288
SH
724 */
725static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
726{
727 BlockDriver *drv = bs->drv;
728
396759ad
NB
729 /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
730 if (bs->sg)
731 return 0;
732
51762288
SH
733 /* query actual device if possible, otherwise just trust the hint */
734 if (drv->bdrv_getlength) {
735 int64_t length = drv->bdrv_getlength(bs);
736 if (length < 0) {
737 return length;
738 }
7e382003 739 hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
51762288
SH
740 }
741
742 bs->total_sectors = hint;
743 return 0;
744}
745
9e8f1835
PB
746/**
747 * Set open flags for a given discard mode
748 *
749 * Return 0 on success, -1 if the discard mode was invalid.
750 */
751int bdrv_parse_discard_flags(const char *mode, int *flags)
752{
753 *flags &= ~BDRV_O_UNMAP;
754
755 if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
756 /* do nothing */
757 } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
758 *flags |= BDRV_O_UNMAP;
759 } else {
760 return -1;
761 }
762
763 return 0;
764}
765
c3993cdc
SH
766/**
767 * Set open flags for a given cache mode
768 *
769 * Return 0 on success, -1 if the cache mode was invalid.
770 */
771int bdrv_parse_cache_flags(const char *mode, int *flags)
772{
773 *flags &= ~BDRV_O_CACHE_MASK;
774
775 if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
776 *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
92196b2f
SH
777 } else if (!strcmp(mode, "directsync")) {
778 *flags |= BDRV_O_NOCACHE;
c3993cdc
SH
779 } else if (!strcmp(mode, "writeback")) {
780 *flags |= BDRV_O_CACHE_WB;
781 } else if (!strcmp(mode, "unsafe")) {
782 *flags |= BDRV_O_CACHE_WB;
783 *flags |= BDRV_O_NO_FLUSH;
784 } else if (!strcmp(mode, "writethrough")) {
785 /* this is the default */
786 } else {
787 return -1;
788 }
789
790 return 0;
791}
792
53fec9d3
SH
793/**
794 * The copy-on-read flag is actually a reference count so multiple users may
795 * use the feature without worrying about clobbering its previous state.
796 * Copy-on-read stays enabled until all users have called to disable it.
797 */
798void bdrv_enable_copy_on_read(BlockDriverState *bs)
799{
800 bs->copy_on_read++;
801}
802
803void bdrv_disable_copy_on_read(BlockDriverState *bs)
804{
805 assert(bs->copy_on_read > 0);
806 bs->copy_on_read--;
807}
808
b1e6fc08
KW
809/*
810 * Returns the flags that a temporary snapshot should get, based on the
811 * originally requested flags (the originally requested image will have flags
812 * like a backing file)
813 */
814static int bdrv_temp_snapshot_flags(int flags)
815{
816 return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
817}
818
0b50cc88
KW
819/*
820 * Returns the flags that bs->file should get, based on the given flags for
821 * the parent BDS
822 */
823static int bdrv_inherited_flags(int flags)
824{
825 /* Enable protocol handling, disable format probing for bs->file */
826 flags |= BDRV_O_PROTOCOL;
827
828 /* Our block drivers take care to send flushes and respect unmap policy,
829 * so we can enable both unconditionally on lower layers. */
830 flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;
831
0b50cc88 832 /* Clear flags that only apply to the top layer */
5669b44d 833 flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);
0b50cc88
KW
834
835 return flags;
836}
837
317fc44e
KW
838/*
839 * Returns the flags that bs->backing_hd should get, based on the given flags
840 * for the parent BDS
841 */
842static int bdrv_backing_flags(int flags)
843{
844 /* backing files always opened read-only */
845 flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);
846
847 /* snapshot=on is handled on the top layer */
8bfea15d 848 flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);
317fc44e
KW
849
850 return flags;
851}
852
7b272452
KW
853static int bdrv_open_flags(BlockDriverState *bs, int flags)
854{
855 int open_flags = flags | BDRV_O_CACHE_WB;
856
857 /*
858 * Clear flags that are internal to the block layer before opening the
859 * image.
860 */
20cca275 861 open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);
7b272452
KW
862
863 /*
864 * Snapshots should be writable.
865 */
8bfea15d 866 if (flags & BDRV_O_TEMPORARY) {
7b272452
KW
867 open_flags |= BDRV_O_RDWR;
868 }
869
870 return open_flags;
871}
872
636ea370
KW
873static void bdrv_assign_node_name(BlockDriverState *bs,
874 const char *node_name,
875 Error **errp)
6913c0c2
BC
876{
877 if (!node_name) {
636ea370 878 return;
6913c0c2
BC
879 }
880
9aebf3b8 881 /* Check for empty string or invalid characters */
f5bebbbb 882 if (!id_wellformed(node_name)) {
9aebf3b8 883 error_setg(errp, "Invalid node name");
636ea370 884 return;
6913c0c2
BC
885 }
886
0c5e94ee
BC
887 /* takes care of avoiding namespaces collisions */
888 if (bdrv_find(node_name)) {
889 error_setg(errp, "node-name=%s is conflicting with a device id",
890 node_name);
636ea370 891 return;
0c5e94ee
BC
892 }
893
6913c0c2
BC
894 /* takes care of avoiding duplicates node names */
895 if (bdrv_find_node(node_name)) {
896 error_setg(errp, "Duplicate node name");
636ea370 897 return;
6913c0c2
BC
898 }
899
900 /* copy node name into the bs and insert it into the graph list */
901 pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
902 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
6913c0c2
BC
903}
904
57915332
KW
/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    /* Prefer the already-open file's name over the "filename" option. */
    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        return -EINVAL;
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    /* Consume and validate the optional "node-name" option. */
    node_name = qdict_get_try_str(options, "node-name");
    bdrv_assign_node_name(bs, node_name, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }
    qdict_del(options, "node-name");

    /* bdrv_open() with directly using a protocol as drv. This layer is already
     * opened, so assign it to bs (while file becomes a closed BlockDriverState)
     * and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);
    bs->growable = !!(flags & BDRV_O_PROTOCOL);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                   ? "Driver '%s' can only be used for read-only devices"
                   : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }
    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        /* Prefer the driver-provided error; fall back to a generic one. */
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto free_and_fail;
    }

    assert(bdrv_opt_mem_align(bs) != 0);
    assert((bs->request_alignment != 0) || bs->sg);
    return 0;

free_and_fail:
    /* Undo the partial initialisation so bs is reusable by the caller. */
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}
1042
5e5c4f63
KW
1043static QDict *parse_json_filename(const char *filename, Error **errp)
1044{
1045 QObject *options_obj;
1046 QDict *options;
1047 int ret;
1048
1049 ret = strstart(filename, "json:", &filename);
1050 assert(ret);
1051
1052 options_obj = qobject_from_json(filename);
1053 if (!options_obj) {
1054 error_setg(errp, "Could not parse the JSON options");
1055 return NULL;
1056 }
1057
1058 if (qobject_type(options_obj) != QTYPE_QDICT) {
1059 qobject_decref(options_obj);
1060 error_setg(errp, "Invalid JSON object given");
1061 return NULL;
1062 }
1063
1064 options = qobject_to_qdict(options_obj);
1065 qdict_flatten(options);
1066
1067 return options;
1068}
1069
b6ce07aa 1070/*
f54120ff
KW
1071 * Fills in default options for opening images and converts the legacy
1072 * filename/flags pair to option QDict entries.
b6ce07aa 1073 */
5e5c4f63 1074static int bdrv_fill_options(QDict **options, const char **pfilename, int flags,
17b005f1 1075 BlockDriver *drv, Error **errp)
ea2384d3 1076{
5e5c4f63 1077 const char *filename = *pfilename;
c2ad1b0c 1078 const char *drvname;
462f5bcf 1079 bool protocol = flags & BDRV_O_PROTOCOL;
e3fa4bfa 1080 bool parse_filename = false;
34b5d2c6 1081 Error *local_err = NULL;
83f64091 1082
5e5c4f63
KW
1083 /* Parse json: pseudo-protocol */
1084 if (filename && g_str_has_prefix(filename, "json:")) {
1085 QDict *json_options = parse_json_filename(filename, &local_err);
1086 if (local_err) {
1087 error_propagate(errp, local_err);
1088 return -EINVAL;
1089 }
1090
1091 /* Options given in the filename have lower priority than options
1092 * specified directly */
1093 qdict_join(*options, json_options, false);
1094 QDECREF(json_options);
1095 *pfilename = filename = NULL;
1096 }
1097
035fccdf 1098 /* Fetch the file name from the options QDict if necessary */
17b005f1 1099 if (protocol && filename) {
f54120ff
KW
1100 if (!qdict_haskey(*options, "filename")) {
1101 qdict_put(*options, "filename", qstring_from_str(filename));
1102 parse_filename = true;
1103 } else {
1104 error_setg(errp, "Can't specify 'file' and 'filename' options at "
1105 "the same time");
1106 return -EINVAL;
1107 }
035fccdf
KW
1108 }
1109
c2ad1b0c 1110 /* Find the right block driver */
f54120ff 1111 filename = qdict_get_try_str(*options, "filename");
5acd9d81 1112 drvname = qdict_get_try_str(*options, "driver");
f54120ff 1113
17b005f1
KW
1114 if (drv) {
1115 if (drvname) {
1116 error_setg(errp, "Driver specified twice");
1117 return -EINVAL;
1118 }
1119 drvname = drv->format_name;
1120 qdict_put(*options, "driver", qstring_from_str(drvname));
1121 } else {
1122 if (!drvname && protocol) {
1123 if (filename) {
1124 drv = bdrv_find_protocol(filename, parse_filename);
1125 if (!drv) {
1126 error_setg(errp, "Unknown protocol");
1127 return -EINVAL;
1128 }
1129
1130 drvname = drv->format_name;
1131 qdict_put(*options, "driver", qstring_from_str(drvname));
1132 } else {
1133 error_setg(errp, "Must specify either driver or file");
f54120ff
KW
1134 return -EINVAL;
1135 }
17b005f1
KW
1136 } else if (drvname) {
1137 drv = bdrv_find_format(drvname);
1138 if (!drv) {
1139 error_setg(errp, "Unknown driver '%s'", drvname);
1140 return -ENOENT;
1141 }
98289620 1142 }
c2ad1b0c
KW
1143 }
1144
17b005f1 1145 assert(drv || !protocol);
c2ad1b0c 1146
f54120ff 1147 /* Driver-specific filename parsing */
17b005f1 1148 if (drv && drv->bdrv_parse_filename && parse_filename) {
5acd9d81 1149 drv->bdrv_parse_filename(filename, *options, &local_err);
84d18f06 1150 if (local_err) {
34b5d2c6 1151 error_propagate(errp, local_err);
f54120ff 1152 return -EINVAL;
6963a30d 1153 }
cd5d031e
HR
1154
1155 if (!drv->bdrv_needs_filename) {
1156 qdict_del(*options, "filename");
cd5d031e 1157 }
6963a30d
KW
1158 }
1159
f54120ff
KW
1160 return 0;
1161}
1162
8d24cce1
FZ
1163void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
1164{
1165
826b6ca0
FZ
1166 if (bs->backing_hd) {
1167 assert(bs->backing_blocker);
1168 bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
1169 } else if (backing_hd) {
1170 error_setg(&bs->backing_blocker,
1171 "device is used as backing hd of '%s'",
1172 bs->device_name);
1173 }
1174
8d24cce1
FZ
1175 bs->backing_hd = backing_hd;
1176 if (!backing_hd) {
826b6ca0
FZ
1177 error_free(bs->backing_blocker);
1178 bs->backing_blocker = NULL;
8d24cce1
FZ
1179 goto out;
1180 }
1181 bs->open_flags &= ~BDRV_O_NO_BACKING;
1182 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
1183 pstrcpy(bs->backing_format, sizeof(bs->backing_format),
1184 backing_hd->drv ? backing_hd->drv->format_name : "");
826b6ca0
FZ
1185
1186 bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
1187 /* Otherwise we won't be able to commit due to check in bdrv_commit */
1188 bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT,
1189 bs->backing_blocker);
8d24cce1 1190out:
3baca891 1191 bdrv_refresh_limits(bs, NULL);
8d24cce1
FZ
1192}
1193
31ca6d07
KW
1194/*
1195 * Opens the backing file for a BlockDriverState if not yet open
1196 *
1197 * options is a QDict of options to pass to the block drivers, or NULL for an
1198 * empty set of options. The reference to the QDict is transferred to this
1199 * function (even on failure), so if the caller intends to reuse the dictionary,
1200 * it needs to use QINCREF() before calling bdrv_file_open.
1201 */
34b5d2c6 1202int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
9156df12 1203{
1ba4b6a5 1204 char *backing_filename = g_malloc0(PATH_MAX);
317fc44e 1205 int ret = 0;
9156df12 1206 BlockDriver *back_drv = NULL;
8d24cce1 1207 BlockDriverState *backing_hd;
34b5d2c6 1208 Error *local_err = NULL;
9156df12
PB
1209
1210 if (bs->backing_hd != NULL) {
31ca6d07 1211 QDECREF(options);
1ba4b6a5 1212 goto free_exit;
9156df12
PB
1213 }
1214
31ca6d07
KW
1215 /* NULL means an empty set of options */
1216 if (options == NULL) {
1217 options = qdict_new();
1218 }
1219
9156df12 1220 bs->open_flags &= ~BDRV_O_NO_BACKING;
1cb6f506
KW
1221 if (qdict_haskey(options, "file.filename")) {
1222 backing_filename[0] = '\0';
1223 } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
31ca6d07 1224 QDECREF(options);
1ba4b6a5 1225 goto free_exit;
dbecebdd 1226 } else {
1ba4b6a5 1227 bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX);
9156df12
PB
1228 }
1229
8ee79e70
KW
1230 if (!bs->drv || !bs->drv->supports_backing) {
1231 ret = -EINVAL;
1232 error_setg(errp, "Driver doesn't support backing files");
1233 QDECREF(options);
1234 goto free_exit;
1235 }
1236
e4e9986b 1237 backing_hd = bdrv_new();
8d24cce1 1238
9156df12
PB
1239 if (bs->backing_format[0] != '\0') {
1240 back_drv = bdrv_find_format(bs->backing_format);
1241 }
1242
f67503e5 1243 assert(bs->backing_hd == NULL);
8d24cce1 1244 ret = bdrv_open(&backing_hd,
ddf5636d 1245 *backing_filename ? backing_filename : NULL, NULL, options,
317fc44e 1246 bdrv_backing_flags(bs->open_flags), back_drv, &local_err);
9156df12 1247 if (ret < 0) {
8d24cce1
FZ
1248 bdrv_unref(backing_hd);
1249 backing_hd = NULL;
9156df12 1250 bs->open_flags |= BDRV_O_NO_BACKING;
b04b6b6e
FZ
1251 error_setg(errp, "Could not open backing file: %s",
1252 error_get_pretty(local_err));
1253 error_free(local_err);
1ba4b6a5 1254 goto free_exit;
9156df12 1255 }
8d24cce1 1256 bdrv_set_backing_hd(bs, backing_hd);
d80ac658 1257
1ba4b6a5
BC
1258free_exit:
1259 g_free(backing_filename);
1260 return ret;
9156df12
PB
1261}
1262
da557aac
HR
1263/*
1264 * Opens a disk image whose options are given as BlockdevRef in another block
1265 * device's options.
1266 *
da557aac
HR
1267 * If allow_none is true, no image will be opened if filename is false and no
1268 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
1269 *
1270 * bdrev_key specifies the key for the image's BlockdevRef in the options QDict.
1271 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
1272 * itself, all options starting with "${bdref_key}." are considered part of the
1273 * BlockdevRef.
1274 *
1275 * The BlockdevRef will be removed from the options QDict.
f67503e5
HR
1276 *
1277 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
da557aac
HR
1278 */
1279int bdrv_open_image(BlockDriverState **pbs, const char *filename,
1280 QDict *options, const char *bdref_key, int flags,
f7d9fd8c 1281 bool allow_none, Error **errp)
da557aac
HR
1282{
1283 QDict *image_options;
1284 int ret;
1285 char *bdref_key_dot;
1286 const char *reference;
1287
f67503e5
HR
1288 assert(pbs);
1289 assert(*pbs == NULL);
1290
da557aac
HR
1291 bdref_key_dot = g_strdup_printf("%s.", bdref_key);
1292 qdict_extract_subqdict(options, &image_options, bdref_key_dot);
1293 g_free(bdref_key_dot);
1294
1295 reference = qdict_get_try_str(options, bdref_key);
1296 if (!filename && !reference && !qdict_size(image_options)) {
1297 if (allow_none) {
1298 ret = 0;
1299 } else {
1300 error_setg(errp, "A block device must be specified for \"%s\"",
1301 bdref_key);
1302 ret = -EINVAL;
1303 }
b20e61e0 1304 QDECREF(image_options);
da557aac
HR
1305 goto done;
1306 }
1307
f7d9fd8c 1308 ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);
da557aac
HR
1309
1310done:
1311 qdict_del(options, bdref_key);
1312 return ret;
1313}
1314
6b8aeca5 1315int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
b998875d
KW
1316{
1317 /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
1ba4b6a5 1318 char *tmp_filename = g_malloc0(PATH_MAX + 1);
b998875d
KW
1319 int64_t total_size;
1320 BlockDriver *bdrv_qcow2;
83d0521a 1321 QemuOpts *opts = NULL;
b998875d
KW
1322 QDict *snapshot_options;
1323 BlockDriverState *bs_snapshot;
1324 Error *local_err;
1325 int ret;
1326
1327 /* if snapshot, we create a temporary backing file and open it
1328 instead of opening 'filename' directly */
1329
1330 /* Get the required size from the image */
f187743a
KW
1331 total_size = bdrv_getlength(bs);
1332 if (total_size < 0) {
6b8aeca5 1333 ret = total_size;
f187743a 1334 error_setg_errno(errp, -total_size, "Could not get image size");
1ba4b6a5 1335 goto out;
f187743a 1336 }
b998875d
KW
1337
1338 /* Create the temporary image */
1ba4b6a5 1339 ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
b998875d
KW
1340 if (ret < 0) {
1341 error_setg_errno(errp, -ret, "Could not get temporary filename");
1ba4b6a5 1342 goto out;
b998875d
KW
1343 }
1344
1345 bdrv_qcow2 = bdrv_find_format("qcow2");
c282e1fd
CL
1346 opts = qemu_opts_create(bdrv_qcow2->create_opts, NULL, 0,
1347 &error_abort);
83d0521a 1348 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size);
c282e1fd 1349 ret = bdrv_create(bdrv_qcow2, tmp_filename, opts, &local_err);
83d0521a 1350 qemu_opts_del(opts);
b998875d
KW
1351 if (ret < 0) {
1352 error_setg_errno(errp, -ret, "Could not create temporary overlay "
1353 "'%s': %s", tmp_filename,
1354 error_get_pretty(local_err));
1355 error_free(local_err);
1ba4b6a5 1356 goto out;
b998875d
KW
1357 }
1358
1359 /* Prepare a new options QDict for the temporary file */
1360 snapshot_options = qdict_new();
1361 qdict_put(snapshot_options, "file.driver",
1362 qstring_from_str("file"));
1363 qdict_put(snapshot_options, "file.filename",
1364 qstring_from_str(tmp_filename));
1365
e4e9986b 1366 bs_snapshot = bdrv_new();
b998875d
KW
1367
1368 ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
b1e6fc08 1369 flags, bdrv_qcow2, &local_err);
b998875d
KW
1370 if (ret < 0) {
1371 error_propagate(errp, local_err);
1ba4b6a5 1372 goto out;
b998875d
KW
1373 }
1374
1375 bdrv_append(bs_snapshot, bs);
1ba4b6a5
BC
1376
1377out:
1378 g_free(tmp_filename);
6b8aeca5 1379 return ret;
b998875d
KW
1380}
1381
b6ce07aa
KW
1382/*
1383 * Opens a disk image (raw, qcow2, vmdk, ...)
de9c0cec
KW
1384 *
1385 * options is a QDict of options to pass to the block drivers, or NULL for an
1386 * empty set of options. The reference to the QDict belongs to the block layer
1387 * after the call (even on failure), so if the caller intends to reuse the
1388 * dictionary, it needs to use QINCREF() before calling bdrv_open.
f67503e5
HR
1389 *
1390 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
1391 * If it is not NULL, the referenced BDS will be reused.
ddf5636d
HR
1392 *
1393 * The reference parameter may be used to specify an existing block device which
1394 * should be opened. If specified, neither options nor a filename may be given,
1395 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
b6ce07aa 1396 */
ddf5636d
HR
1397int bdrv_open(BlockDriverState **pbs, const char *filename,
1398 const char *reference, QDict *options, int flags,
1399 BlockDriver *drv, Error **errp)
ea2384d3 1400{
b6ce07aa 1401 int ret;
f67503e5 1402 BlockDriverState *file = NULL, *bs;
74fe54f2 1403 const char *drvname;
34b5d2c6 1404 Error *local_err = NULL;
b1e6fc08 1405 int snapshot_flags = 0;
712e7874 1406
f67503e5
HR
1407 assert(pbs);
1408
ddf5636d
HR
1409 if (reference) {
1410 bool options_non_empty = options ? qdict_size(options) : false;
1411 QDECREF(options);
1412
1413 if (*pbs) {
1414 error_setg(errp, "Cannot reuse an existing BDS when referencing "
1415 "another block device");
1416 return -EINVAL;
1417 }
1418
1419 if (filename || options_non_empty) {
1420 error_setg(errp, "Cannot reference an existing block device with "
1421 "additional options or a new filename");
1422 return -EINVAL;
1423 }
1424
1425 bs = bdrv_lookup_bs(reference, reference, errp);
1426 if (!bs) {
1427 return -ENODEV;
1428 }
1429 bdrv_ref(bs);
1430 *pbs = bs;
1431 return 0;
1432 }
1433
f67503e5
HR
1434 if (*pbs) {
1435 bs = *pbs;
1436 } else {
e4e9986b 1437 bs = bdrv_new();
f67503e5
HR
1438 }
1439
de9c0cec
KW
1440 /* NULL means an empty set of options */
1441 if (options == NULL) {
1442 options = qdict_new();
1443 }
1444
17b005f1 1445 ret = bdrv_fill_options(&options, &filename, flags, drv, &local_err);
462f5bcf
KW
1446 if (local_err) {
1447 goto fail;
1448 }
1449
76c591b0
KW
1450 /* Find the right image format driver */
1451 drv = NULL;
1452 drvname = qdict_get_try_str(options, "driver");
1453 if (drvname) {
1454 drv = bdrv_find_format(drvname);
1455 qdict_del(options, "driver");
1456 if (!drv) {
1457 error_setg(errp, "Unknown driver: '%s'", drvname);
1458 ret = -EINVAL;
1459 goto fail;
1460 }
1461 }
1462
1463 assert(drvname || !(flags & BDRV_O_PROTOCOL));
1464 if (drv && !drv->bdrv_file_open) {
1465 /* If the user explicitly wants a format driver here, we'll need to add
1466 * another layer for the protocol in bs->file */
1467 flags &= ~BDRV_O_PROTOCOL;
1468 }
1469
de9c0cec 1470 bs->options = options;
b6ad491a 1471 options = qdict_clone_shallow(options);
de9c0cec 1472
f500a6d3 1473 /* Open image file without format layer */
f4788adc
KW
1474 if ((flags & BDRV_O_PROTOCOL) == 0) {
1475 if (flags & BDRV_O_RDWR) {
1476 flags |= BDRV_O_ALLOW_RDWR;
1477 }
1478 if (flags & BDRV_O_SNAPSHOT) {
1479 snapshot_flags = bdrv_temp_snapshot_flags(flags);
1480 flags = bdrv_backing_flags(flags);
1481 }
f500a6d3 1482
f4788adc
KW
1483 assert(file == NULL);
1484 ret = bdrv_open_image(&file, filename, options, "file",
1485 bdrv_inherited_flags(flags),
1486 true, &local_err);
1487 if (ret < 0) {
1488 goto fail;
1489 }
f500a6d3
KW
1490 }
1491
76c591b0
KW
1492 /* Image format probing */
1493 if (!drv && file) {
17b005f1
KW
1494 ret = find_image_format(file, filename, &drv, &local_err);
1495 if (ret < 0) {
8bfea15d 1496 goto fail;
2a05cbe4 1497 }
76c591b0 1498 } else if (!drv) {
17b005f1
KW
1499 error_setg(errp, "Must specify either driver or file");
1500 ret = -EINVAL;
8bfea15d 1501 goto fail;
ea2384d3 1502 }
b6ce07aa
KW
1503
1504 /* Open the image */
34b5d2c6 1505 ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
b6ce07aa 1506 if (ret < 0) {
8bfea15d 1507 goto fail;
6987307c
CH
1508 }
1509
2a05cbe4 1510 if (file && (bs->file != file)) {
4f6fd349 1511 bdrv_unref(file);
f500a6d3
KW
1512 file = NULL;
1513 }
1514
b6ce07aa 1515 /* If there is a backing file, use it */
9156df12 1516 if ((flags & BDRV_O_NO_BACKING) == 0) {
31ca6d07
KW
1517 QDict *backing_options;
1518
5726d872 1519 qdict_extract_subqdict(options, &backing_options, "backing.");
34b5d2c6 1520 ret = bdrv_open_backing_file(bs, backing_options, &local_err);
b6ce07aa 1521 if (ret < 0) {
b6ad491a 1522 goto close_and_fail;
b6ce07aa 1523 }
b6ce07aa
KW
1524 }
1525
91af7014
HR
1526 bdrv_refresh_filename(bs);
1527
b998875d
KW
1528 /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
1529 * temporary snapshot afterwards. */
b1e6fc08 1530 if (snapshot_flags) {
6b8aeca5 1531 ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
b998875d 1532 if (local_err) {
b998875d
KW
1533 goto close_and_fail;
1534 }
1535 }
1536
b6ad491a 1537 /* Check if any unknown options were used */
5acd9d81 1538 if (options && (qdict_size(options) != 0)) {
b6ad491a 1539 const QDictEntry *entry = qdict_first(options);
5acd9d81
HR
1540 if (flags & BDRV_O_PROTOCOL) {
1541 error_setg(errp, "Block protocol '%s' doesn't support the option "
1542 "'%s'", drv->format_name, entry->key);
1543 } else {
1544 error_setg(errp, "Block format '%s' used by device '%s' doesn't "
1545 "support the option '%s'", drv->format_name,
1546 bs->device_name, entry->key);
1547 }
b6ad491a
KW
1548
1549 ret = -EINVAL;
1550 goto close_and_fail;
1551 }
b6ad491a 1552
b6ce07aa 1553 if (!bdrv_key_required(bs)) {
7d4b4ba5 1554 bdrv_dev_change_media_cb(bs, true);
c3adb58f
MA
1555 } else if (!runstate_check(RUN_STATE_PRELAUNCH)
1556 && !runstate_check(RUN_STATE_INMIGRATE)
1557 && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
1558 error_setg(errp,
1559 "Guest must be stopped for opening of encrypted image");
1560 ret = -EBUSY;
1561 goto close_and_fail;
b6ce07aa
KW
1562 }
1563
c3adb58f 1564 QDECREF(options);
f67503e5 1565 *pbs = bs;
b6ce07aa
KW
1566 return 0;
1567
8bfea15d 1568fail:
f500a6d3 1569 if (file != NULL) {
4f6fd349 1570 bdrv_unref(file);
f500a6d3 1571 }
de9c0cec 1572 QDECREF(bs->options);
b6ad491a 1573 QDECREF(options);
de9c0cec 1574 bs->options = NULL;
f67503e5
HR
1575 if (!*pbs) {
1576 /* If *pbs is NULL, a new BDS has been created in this function and
1577 needs to be freed now. Otherwise, it does not need to be closed,
1578 since it has not really been opened yet. */
1579 bdrv_unref(bs);
1580 }
84d18f06 1581 if (local_err) {
34b5d2c6
HR
1582 error_propagate(errp, local_err);
1583 }
b6ad491a 1584 return ret;
de9c0cec 1585
b6ad491a 1586close_and_fail:
f67503e5
HR
1587 /* See fail path, but now the BDS has to be always closed */
1588 if (*pbs) {
1589 bdrv_close(bs);
1590 } else {
1591 bdrv_unref(bs);
1592 }
b6ad491a 1593 QDECREF(options);
84d18f06 1594 if (local_err) {
34b5d2c6
HR
1595 error_propagate(errp, local_err);
1596 }
b6ce07aa
KW
1597 return ret;
1598}
1599
e971aa12
JC
1600typedef struct BlockReopenQueueEntry {
1601 bool prepared;
1602 BDRVReopenState state;
1603 QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
1604} BlockReopenQueueEntry;
1605
1606/*
1607 * Adds a BlockDriverState to a simple queue for an atomic, transactional
1608 * reopen of multiple devices.
1609 *
1610 * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLE_INIT
1611 * already performed, or alternatively may be NULL a new BlockReopenQueue will
1612 * be created and initialized. This newly created BlockReopenQueue should be
1613 * passed back in for subsequent calls that are intended to be of the same
1614 * atomic 'set'.
1615 *
1616 * bs is the BlockDriverState to add to the reopen queue.
1617 *
1618 * flags contains the open flags for the associated bs
1619 *
1620 * returns a pointer to bs_queue, which is either the newly allocated
1621 * bs_queue, or the existing bs_queue being used.
1622 *
1623 */
1624BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
1625 BlockDriverState *bs, int flags)
1626{
1627 assert(bs != NULL);
1628
1629 BlockReopenQueueEntry *bs_entry;
1630 if (bs_queue == NULL) {
1631 bs_queue = g_new0(BlockReopenQueue, 1);
1632 QSIMPLEQ_INIT(bs_queue);
1633 }
1634
f1f25a2e
KW
1635 /* bdrv_open() masks this flag out */
1636 flags &= ~BDRV_O_PROTOCOL;
1637
e971aa12 1638 if (bs->file) {
f1f25a2e 1639 bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
e971aa12
JC
1640 }
1641
1642 bs_entry = g_new0(BlockReopenQueueEntry, 1);
1643 QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
1644
1645 bs_entry->state.bs = bs;
1646 bs_entry->state.flags = flags;
1647
1648 return bs_queue;
1649}
1650
1651/*
1652 * Reopen multiple BlockDriverStates atomically & transactionally.
1653 *
1654 * The queue passed in (bs_queue) must have been built up previous
1655 * via bdrv_reopen_queue().
1656 *
1657 * Reopens all BDS specified in the queue, with the appropriate
1658 * flags. All devices are prepared for reopen, and failure of any
1659 * device will cause all device changes to be abandonded, and intermediate
1660 * data cleaned up.
1661 *
1662 * If all devices prepare successfully, then the changes are committed
1663 * to all devices.
1664 *
1665 */
1666int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
1667{
1668 int ret = -1;
1669 BlockReopenQueueEntry *bs_entry, *next;
1670 Error *local_err = NULL;
1671
1672 assert(bs_queue != NULL);
1673
1674 bdrv_drain_all();
1675
1676 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1677 if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
1678 error_propagate(errp, local_err);
1679 goto cleanup;
1680 }
1681 bs_entry->prepared = true;
1682 }
1683
1684 /* If we reach this point, we have success and just need to apply the
1685 * changes
1686 */
1687 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1688 bdrv_reopen_commit(&bs_entry->state);
1689 }
1690
1691 ret = 0;
1692
1693cleanup:
1694 QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
1695 if (ret && bs_entry->prepared) {
1696 bdrv_reopen_abort(&bs_entry->state);
1697 }
1698 g_free(bs_entry);
1699 }
1700 g_free(bs_queue);
1701 return ret;
1702}
1703
1704
1705/* Reopen a single BlockDriverState with the specified flags. */
1706int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1707{
1708 int ret = -1;
1709 Error *local_err = NULL;
1710 BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1711
1712 ret = bdrv_reopen_multiple(queue, &local_err);
1713 if (local_err != NULL) {
1714 error_propagate(errp, local_err);
1715 }
1716 return ret;
1717}
1718
1719
1720/*
1721 * Prepares a BlockDriverState for reopen. All changes are staged in the
1722 * 'opaque' field of the BDRVReopenState, which is used and allocated by
1723 * the block driver layer .bdrv_reopen_prepare()
1724 *
1725 * bs is the BlockDriverState to reopen
1726 * flags are the new open flags
1727 * queue is the reopen queue
1728 *
1729 * Returns 0 on success, non-zero on error. On error errp will be set
1730 * as well.
1731 *
1732 * On failure, bdrv_reopen_abort() will be called to clean up any data.
1733 * It is the responsibility of the caller to then call the abort() or
1734 * commit() for any other BDS that have been left in a prepare() state
1735 *
1736 */
1737int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1738 Error **errp)
1739{
1740 int ret = -1;
1741 Error *local_err = NULL;
1742 BlockDriver *drv;
1743
1744 assert(reopen_state != NULL);
1745 assert(reopen_state->bs->drv != NULL);
1746 drv = reopen_state->bs->drv;
1747
1748 /* if we are to stay read-only, do not allow permission change
1749 * to r/w */
1750 if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1751 reopen_state->flags & BDRV_O_RDWR) {
1752 error_set(errp, QERR_DEVICE_IS_READ_ONLY,
1753 reopen_state->bs->device_name);
1754 goto error;
1755 }
1756
1757
1758 ret = bdrv_flush(reopen_state->bs);
1759 if (ret) {
1760 error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1761 strerror(-ret));
1762 goto error;
1763 }
1764
1765 if (drv->bdrv_reopen_prepare) {
1766 ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1767 if (ret) {
1768 if (local_err != NULL) {
1769 error_propagate(errp, local_err);
1770 } else {
d8b6895f
LC
1771 error_setg(errp, "failed while preparing to reopen image '%s'",
1772 reopen_state->bs->filename);
e971aa12
JC
1773 }
1774 goto error;
1775 }
1776 } else {
1777 /* It is currently mandatory to have a bdrv_reopen_prepare()
1778 * handler for each supported drv. */
1779 error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
1780 drv->format_name, reopen_state->bs->device_name,
1781 "reopening of file");
1782 ret = -1;
1783 goto error;
1784 }
1785
1786 ret = 0;
1787
1788error:
1789 return ret;
1790}
1791
1792/*
1793 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1794 * makes them final by swapping the staging BlockDriverState contents into
1795 * the active BlockDriverState contents.
1796 */
1797void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1798{
1799 BlockDriver *drv;
1800
1801 assert(reopen_state != NULL);
1802 drv = reopen_state->bs->drv;
1803 assert(drv != NULL);
1804
1805 /* If there are any driver level actions to take */
1806 if (drv->bdrv_reopen_commit) {
1807 drv->bdrv_reopen_commit(reopen_state);
1808 }
1809
1810 /* set BDS specific flags now */
1811 reopen_state->bs->open_flags = reopen_state->flags;
1812 reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1813 BDRV_O_CACHE_WB);
1814 reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
355ef4ac 1815
3baca891 1816 bdrv_refresh_limits(reopen_state->bs, NULL);
e971aa12
JC
1817}
1818
1819/*
1820 * Abort the reopen, and delete and free the staged changes in
1821 * reopen_state
1822 */
1823void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1824{
1825 BlockDriver *drv;
1826
1827 assert(reopen_state != NULL);
1828 drv = reopen_state->bs->drv;
1829 assert(drv != NULL);
1830
1831 if (drv->bdrv_reopen_abort) {
1832 drv->bdrv_reopen_abort(reopen_state);
1833 }
1834}
1835
1836
fc01f7e7
FB
1837void bdrv_close(BlockDriverState *bs)
1838{
33384421
HR
1839 BdrvAioNotifier *ban, *ban_next;
1840
3cbc002c
PB
1841 if (bs->job) {
1842 block_job_cancel_sync(bs->job);
1843 }
58fda173
SH
1844 bdrv_drain_all(); /* complete I/O */
1845 bdrv_flush(bs);
1846 bdrv_drain_all(); /* in case flush left pending I/O */
d7d512f6 1847 notifier_list_notify(&bs->close_notifiers, bs);
7094f12f 1848
3cbc002c 1849 if (bs->drv) {
557df6ac 1850 if (bs->backing_hd) {
826b6ca0
FZ
1851 BlockDriverState *backing_hd = bs->backing_hd;
1852 bdrv_set_backing_hd(bs, NULL);
1853 bdrv_unref(backing_hd);
557df6ac 1854 }
ea2384d3 1855 bs->drv->bdrv_close(bs);
7267c094 1856 g_free(bs->opaque);
ea2384d3
FB
1857 bs->opaque = NULL;
1858 bs->drv = NULL;
53fec9d3 1859 bs->copy_on_read = 0;
a275fa42
PB
1860 bs->backing_file[0] = '\0';
1861 bs->backing_format[0] = '\0';
6405875c
PB
1862 bs->total_sectors = 0;
1863 bs->encrypted = 0;
1864 bs->valid_key = 0;
1865 bs->sg = 0;
1866 bs->growable = 0;
0d51b4de 1867 bs->zero_beyond_eof = false;
de9c0cec
KW
1868 QDECREF(bs->options);
1869 bs->options = NULL;
91af7014
HR
1870 QDECREF(bs->full_open_options);
1871 bs->full_open_options = NULL;
b338082b 1872
66f82cee 1873 if (bs->file != NULL) {
4f6fd349 1874 bdrv_unref(bs->file);
0ac9377d 1875 bs->file = NULL;
66f82cee 1876 }
b338082b 1877 }
98f90dba 1878
9ca11154
PH
1879 bdrv_dev_change_media_cb(bs, false);
1880
98f90dba
ZYW
1881 /*throttling disk I/O limits*/
1882 if (bs->io_limits_enabled) {
1883 bdrv_io_limits_disable(bs);
1884 }
33384421
HR
1885
1886 QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
1887 g_free(ban);
1888 }
1889 QLIST_INIT(&bs->aio_notifiers);
b338082b
FB
1890}
1891
2bc93fed
MK
1892void bdrv_close_all(void)
1893{
1894 BlockDriverState *bs;
1895
dc364f4c 1896 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
ed78cda3
SH
1897 AioContext *aio_context = bdrv_get_aio_context(bs);
1898
1899 aio_context_acquire(aio_context);
2bc93fed 1900 bdrv_close(bs);
ed78cda3 1901 aio_context_release(aio_context);
2bc93fed
MK
1902 }
1903}
1904
88266f5a
SH
1905/* Check if any requests are in-flight (including throttled requests) */
1906static bool bdrv_requests_pending(BlockDriverState *bs)
1907{
1908 if (!QLIST_EMPTY(&bs->tracked_requests)) {
1909 return true;
1910 }
cc0681c4
BC
1911 if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1912 return true;
1913 }
1914 if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
88266f5a
SH
1915 return true;
1916 }
1917 if (bs->file && bdrv_requests_pending(bs->file)) {
1918 return true;
1919 }
1920 if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1921 return true;
1922 }
1923 return false;
1924}
1925
922453bc
SH
1926/*
1927 * Wait for pending requests to complete across all BlockDriverStates
1928 *
1929 * This function does not flush data to disk, use bdrv_flush_all() for that
1930 * after calling this function.
4c355d53
ZYW
1931 *
1932 * Note that completion of an asynchronous I/O operation can trigger any
1933 * number of other I/O operations on other devices---for example a coroutine
1934 * can be arbitrarily complex and a constant flow of I/O can come until the
1935 * coroutine is complete. Because of this, it is not possible to have a
1936 * function to drain a single device's I/O queue.
922453bc
SH
1937 */
1938void bdrv_drain_all(void)
1939{
88266f5a
SH
1940 /* Always run first iteration so any pending completion BHs run */
1941 bool busy = true;
922453bc
SH
1942 BlockDriverState *bs;
1943
88266f5a 1944 while (busy) {
9b536adc
SH
1945 busy = false;
1946
dc364f4c 1947 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
9b536adc
SH
1948 AioContext *aio_context = bdrv_get_aio_context(bs);
1949 bool bs_busy;
1950
1951 aio_context_acquire(aio_context);
448ad91d 1952 bdrv_flush_io_queue(bs);
0b06ef3b 1953 bdrv_start_throttled_reqs(bs);
9b536adc
SH
1954 bs_busy = bdrv_requests_pending(bs);
1955 bs_busy |= aio_poll(aio_context, bs_busy);
1956 aio_context_release(aio_context);
922453bc 1957
9b536adc
SH
1958 busy |= bs_busy;
1959 }
922453bc
SH
1960 }
1961}
1962
dc364f4c
BC
1963/* make a BlockDriverState anonymous by removing from bdrv_state and
1964 * graph_bdrv_state list.
d22b2f41
RH
1965 Also, NULL terminate the device_name to prevent double remove */
1966void bdrv_make_anon(BlockDriverState *bs)
1967{
1968 if (bs->device_name[0] != '\0') {
dc364f4c 1969 QTAILQ_REMOVE(&bdrv_states, bs, device_list);
d22b2f41
RH
1970 }
1971 bs->device_name[0] = '\0';
dc364f4c
BC
1972 if (bs->node_name[0] != '\0') {
1973 QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
1974 }
1975 bs->node_name[0] = '\0';
d22b2f41
RH
1976}
1977
e023b2e2
PB
1978static void bdrv_rebind(BlockDriverState *bs)
1979{
1980 if (bs->drv && bs->drv->bdrv_rebind) {
1981 bs->drv->bdrv_rebind(bs);
1982 }
1983}
1984
/* Copy the fields that must stay with the attached device (rather than
 * follow the image contents) from bs_src to bs_dest.  bdrv_swap() uses
 * this to undo the effect of the whole-struct swap for these fields. */
static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* move some fields that need to stay attached to the device */

    /* dev info */
    bs_dest->dev_ops = bs_src->dev_ops;
    bs_dest->dev_opaque = bs_src->dev_opaque;
    bs_dest->dev = bs_src->dev;
    bs_dest->guest_block_size = bs_src->guest_block_size;
    bs_dest->copy_on_read = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* i/o throttled req */
    memcpy(&bs_dest->throttle_state,
           &bs_src->throttle_state,
           sizeof(ThrottleState));
    bs_dest->throttled_reqs[0] = bs_src->throttled_reqs[0];
    bs_dest->throttled_reqs[1] = bs_src->throttled_reqs[1];
    bs_dest->io_limits_enabled = bs_src->io_limits_enabled;

    /* r/w error */
    bs_dest->on_read_error = bs_src->on_read_error;
    bs_dest->on_write_error = bs_src->on_write_error;

    /* i/o status */
    bs_dest->iostatus_enabled = bs_src->iostatus_enabled;
    bs_dest->iostatus = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_bitmaps = bs_src->dirty_bitmaps;

    /* reference count */
    bs_dest->refcnt = bs_src->refcnt;

    /* job */
    bs_dest->job = bs_src->job;

    /* keep the same entry in bdrv_states */
    pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
            bs_src->device_name);
    bs_dest->device_list = bs_src->device_list;
    bs_dest->blk = bs_src->blk;

    memcpy(bs_dest->op_blockers, bs_src->op_blockers,
           sizeof(bs_dest->op_blockers));
}
8802d1fd 2033
/*
 * Swap bs contents for two image chains while they are live,
 * while keeping required fields on the BlockDriverState that is
 * actually attached to a device.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
 *
 * bs_new must be nameless and not attached to a BlockBackend.
 *
 * This function does not create any image files.
 */
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
{
    BlockDriverState tmp;

    /* The code needs to swap the node_name but simply swapping node_list won't
     * work so first remove the nodes from the graph list, do the swap then
     * insert them back if needed.
     */
    if (bs_new->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
    }
    if (bs_old->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
    }

    /* bs_new must be nameless and shouldn't have anything fancy enabled */
    assert(bs_new->device_name[0] == '\0');
    assert(!bs_new->blk);
    assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
    assert(bs_new->job == NULL);
    assert(bs_new->dev == NULL);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    /* swap the whole structs ... */
    tmp = *bs_new;
    *bs_new = *bs_old;
    *bs_old = tmp;

    /* there are some fields that should not be swapped, move them back */
    bdrv_move_feature_fields(&tmp, bs_old);
    bdrv_move_feature_fields(bs_old, bs_new);
    bdrv_move_feature_fields(bs_new, &tmp);

    /* bs_new must remain nameless and unattached */
    assert(bs_new->device_name[0] == '\0');
    assert(!bs_new->blk);

    /* Check a few fields that should remain attached to the device */
    assert(bs_new->dev == NULL);
    assert(bs_new->job == NULL);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    /* insert the nodes back into the graph node list if needed */
    if (bs_new->node_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
    }
    if (bs_old->node_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
    }

    /* let the drivers react to the identity change */
    bdrv_rebind(bs_new);
    bdrv_rebind(bs_old);
}
2100
/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new must be nameless and not attached to a BlockBackend.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    bdrv_swap(bs_new, bs_top);

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents.  After the swap, bs_new holds
     * the old top's contents and becomes the new backing file. */
    bdrv_set_backing_hd(bs_top, bs_new);
}
2120
/* Free a BlockDriverState.  Only called via bdrv_unref() once the
 * reference count has dropped to zero. */
static void bdrv_delete(BlockDriverState *bs)
{
    /* The caller must already have detached the device, cancelled any
     * job and dropped every reference and op blocker. */
    assert(!bs->dev);
    assert(!bs->job);
    assert(bdrv_op_blocker_is_empty(bs));
    assert(!bs->refcnt);
    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    bdrv_close(bs);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    /* FIXME layering violation (see the sysemu/blockdev.h include note) */
    drive_info_del(drive_get_by_blockdev(bs));
    g_free(bs);
}
2137
/* Attach guest device @dev to @bs.  Returns 0 on success, -EBUSY when a
 * device is already attached. */
int bdrv_attach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    if (bs->dev) {
        return -EBUSY;
    }
    bs->dev = dev;
    bdrv_iostatus_reset(bs);

    /* We're expecting I/O from the device so bump up coroutine pool size */
    qemu_coroutine_adjust_pool_size(COROUTINE_POOL_RESERVATION);
    return 0;
}
2151
fa879d62
MA
2152/* TODO qdevified devices don't use this, remove when devices are qdevified */
2153void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
18846dee 2154{
fa879d62
MA
2155 if (bdrv_attach_dev(bs, dev) < 0) {
2156 abort();
2157 }
2158}
2159
/* Detach guest device @dev from @bs and reset the device callbacks. */
void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(bs->dev == dev);
    bs->dev = NULL;
    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;
    /* 512 appears to be the default guest block size -- TODO confirm
     * against the BDS initialisation code */
    bs->guest_block_size = 512;
    /* give back the pool reservation taken in bdrv_attach_dev() */
    qemu_coroutine_adjust_pool_size(-COROUTINE_POOL_RESERVATION);
}
2170
/* Return the guest device attached to @bs, or NULL if none. */
/* TODO change to return DeviceState * when all users are qdevified */
void *bdrv_get_attached_dev(BlockDriverState *bs)
{
    return bs->dev;
}
2176
/* Register the device model callbacks invoked on media/tray/resize
 * events; @opaque is passed back to every callback. */
void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_ops = ops;
    bs->dev_opaque = opaque;
}
2183
/* Notify the attached device model of a medium change (@load true =
 * medium inserted) and emit DEVICE_TRAY_MOVED QMP events for the tray
 * transitions implied by the callback. */
static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
{
    if (bs->dev_ops && bs->dev_ops->change_media_cb) {
        /* sample tray state before the callback possibly changes it */
        bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
        bs->dev_ops->change_media_cb(bs->dev_opaque, load);
        if (tray_was_closed) {
            /* tray open */
            qapi_event_send_device_tray_moved(bdrv_get_device_name(bs),
                                              true, &error_abort);
        }
        if (load) {
            /* tray close */
            qapi_event_send_device_tray_moved(bdrv_get_device_name(bs),
                                              false, &error_abort);
        }
    }
}
2201
2c6942fa
MA
2202bool bdrv_dev_has_removable_media(BlockDriverState *bs)
2203{
2204 return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
2205}
2206
/* Forward an eject request to the device model, if it registered a
 * handler; @force requests ejection even if the medium is locked. */
void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
{
    if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
        bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
    }
}
2213
e4def80b
MA
2214bool bdrv_dev_is_tray_open(BlockDriverState *bs)
2215{
2216 if (bs->dev_ops && bs->dev_ops->is_tray_open) {
2217 return bs->dev_ops->is_tray_open(bs->dev_opaque);
2218 }
2219 return false;
2220}
2221
/* Notify the device model that the image size changed, if it cares. */
static void bdrv_dev_resize_cb(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->resize_cb) {
        bs->dev_ops->resize_cb(bs->dev_opaque);
    }
}
2228
/* Ask the device model whether the medium is locked in the drive;
 * defaults to unlocked when no callback is registered. */
bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
        return bs->dev_ops->is_medium_locked(bs->dev_opaque);
    }
    return false;
}
2236
/*
 * Run consistency checks on an image
 *
 * Returns 0 if the check could be completed (it doesn't mean that the image is
 * free of errors) or -errno when an internal error occurred. The results of the
 * check are stored in res.
 */
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
{
    if (bs->drv == NULL) {
        return -ENOMEDIUM;
    }
    if (bs->drv->bdrv_check == NULL) {
        return -ENOTSUP;
    }

    /* the driver fills in *res; start from a clean slate */
    memset(res, 0, sizeof(*res));
    return bs->drv->bdrv_check(bs, res, fix);
}
2256
8a426614
KW
2257#define COMMIT_BUF_SECTORS 2048
2258
33e3963e
FB
2259/* commit COW file into the raw image */
2260int bdrv_commit(BlockDriverState *bs)
2261{
19cb3738 2262 BlockDriver *drv = bs->drv;
72706ea4 2263 int64_t sector, total_sectors, length, backing_length;
8a426614 2264 int n, ro, open_flags;
0bce597d 2265 int ret = 0;
72706ea4 2266 uint8_t *buf = NULL;
c2cba3d9 2267 char filename[PATH_MAX];
33e3963e 2268
19cb3738
FB
2269 if (!drv)
2270 return -ENOMEDIUM;
6bb45158 2271
4dca4b63
NS
2272 if (!bs->backing_hd) {
2273 return -ENOTSUP;
33e3963e
FB
2274 }
2275
3718d8ab
FZ
2276 if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT, NULL) ||
2277 bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT, NULL)) {
2d3735d3
SH
2278 return -EBUSY;
2279 }
2280
4dca4b63 2281 ro = bs->backing_hd->read_only;
c2cba3d9
JM
2282 /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2283 pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
4dca4b63
NS
2284 open_flags = bs->backing_hd->open_flags;
2285
2286 if (ro) {
0bce597d
JC
2287 if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2288 return -EACCES;
4dca4b63 2289 }
ea2384d3 2290 }
33e3963e 2291
72706ea4
JC
2292 length = bdrv_getlength(bs);
2293 if (length < 0) {
2294 ret = length;
2295 goto ro_cleanup;
2296 }
2297
2298 backing_length = bdrv_getlength(bs->backing_hd);
2299 if (backing_length < 0) {
2300 ret = backing_length;
2301 goto ro_cleanup;
2302 }
2303
2304 /* If our top snapshot is larger than the backing file image,
2305 * grow the backing file image if possible. If not possible,
2306 * we must return an error */
2307 if (length > backing_length) {
2308 ret = bdrv_truncate(bs->backing_hd, length);
2309 if (ret < 0) {
2310 goto ro_cleanup;
2311 }
2312 }
2313
2314 total_sectors = length >> BDRV_SECTOR_BITS;
857d4f46
KW
2315
2316 /* qemu_try_blockalign() for bs will choose an alignment that works for
2317 * bs->backing_hd as well, so no need to compare the alignment manually. */
2318 buf = qemu_try_blockalign(bs, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2319 if (buf == NULL) {
2320 ret = -ENOMEM;
2321 goto ro_cleanup;
2322 }
8a426614
KW
2323
2324 for (sector = 0; sector < total_sectors; sector += n) {
d663640c
PB
2325 ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2326 if (ret < 0) {
2327 goto ro_cleanup;
2328 }
2329 if (ret) {
dabfa6cc
KW
2330 ret = bdrv_read(bs, sector, buf, n);
2331 if (ret < 0) {
8a426614
KW
2332 goto ro_cleanup;
2333 }
2334
dabfa6cc
KW
2335 ret = bdrv_write(bs->backing_hd, sector, buf, n);
2336 if (ret < 0) {
8a426614
KW
2337 goto ro_cleanup;
2338 }
ea2384d3 2339 }
33e3963e 2340 }
95389c86 2341
1d44952f
CH
2342 if (drv->bdrv_make_empty) {
2343 ret = drv->bdrv_make_empty(bs);
dabfa6cc
KW
2344 if (ret < 0) {
2345 goto ro_cleanup;
2346 }
1d44952f
CH
2347 bdrv_flush(bs);
2348 }
95389c86 2349
3f5075ae
CH
2350 /*
2351 * Make sure all data we wrote to the backing device is actually
2352 * stable on disk.
2353 */
dabfa6cc 2354 if (bs->backing_hd) {
3f5075ae 2355 bdrv_flush(bs->backing_hd);
dabfa6cc 2356 }
4dca4b63 2357
dabfa6cc 2358 ret = 0;
4dca4b63 2359ro_cleanup:
857d4f46 2360 qemu_vfree(buf);
4dca4b63
NS
2361
2362 if (ro) {
0bce597d
JC
2363 /* ignoring error return here */
2364 bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
4dca4b63
NS
2365 }
2366
1d44952f 2367 return ret;
33e3963e
FB
2368}
2369
/* Run bdrv_commit() on every BDS that has a driver and a backing file,
 * under its AioContext lock.  Stops at and returns the first error. */
int bdrv_commit_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        if (bs->drv && bs->backing_hd) {
            int ret = bdrv_commit(bs);
            if (ret < 0) {
                /* release before bailing out of the loop */
                aio_context_release(aio_context);
                return ret;
            }
        }
        aio_context_release(aio_context);
    }
    return 0;
}
2389
/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    if (req->serialising) {
        req->bs->serialising_in_flight--;
    }

    QLIST_REMOVE(req, list);
    /* wake all coroutines blocked on this request in
     * wait_serialising_requests() */
    qemu_co_queue_restart_all(&req->wait_queue);
}
2404
/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  unsigned int bytes, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset = offset,
        .bytes = bytes,
        .is_write = is_write,
        .co = qemu_coroutine_self(),
        .serialising = false,
        /* the overlap window starts equal to the request itself and may
         * later be widened by mark_request_serialising() */
        .overlap_offset = offset,
        .overlap_bytes = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}
2428
/* Mark @req serialising and widen its overlap window to @align-aligned
 * boundaries so that overlap checks happen at @align granularity.
 * @align must be a power of two (the mask below assumes it). */
static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
{
    int64_t overlap_offset = req->offset & ~(align - 1);
    unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
                               - overlap_offset;

    if (!req->serialising) {
        req->bs->serialising_in_flight++;
        req->serialising = true;
    }

    /* the window only ever grows across repeated calls */
    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
}
2443
/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t sector_num, int nb_sectors,
                            int64_t *cluster_sector_num,
                            int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    /* without cluster info, the region is returned unchanged */
    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}
2464
7327145f 2465static int bdrv_get_cluster_size(BlockDriverState *bs)
793ed47a
KW
2466{
2467 BlockDriverInfo bdi;
7327145f 2468 int ret;
793ed47a 2469
7327145f
KW
2470 ret = bdrv_get_info(bs, &bdi);
2471 if (ret < 0 || bdi.cluster_size == 0) {
2472 return bs->request_alignment;
793ed47a 2473 } else {
7327145f 2474 return bdi.cluster_size;
793ed47a
KW
2475 }
2476}
2477
f4658285 2478static bool tracked_request_overlaps(BdrvTrackedRequest *req,
793ed47a
KW
2479 int64_t offset, unsigned int bytes)
2480{
d83947ac 2481 /* aaaa bbbb */
7327145f 2482 if (offset >= req->overlap_offset + req->overlap_bytes) {
d83947ac
SH
2483 return false;
2484 }
2485 /* bbbb aaaa */
7327145f 2486 if (req->overlap_offset >= offset + bytes) {
d83947ac
SH
2487 return false;
2488 }
2489 return true;
f4658285
SH
2490}
2491
/* Block the calling coroutine until no other tracked request overlaps
 * @self's serialising window.  Returns true iff we had to wait. */
static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    BdrvTrackedRequest *req;
    bool retry;
    bool waited = false;

    /* fast path: no serialising request in flight, nothing to wait for */
    if (!bs->serialising_in_flight) {
        return false;
    }

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            /* only pairs where at least one side serialises matter */
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests. This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case). */
                if (!req->waiting_for) {
                    self->waiting_for = req;
                    qemu_co_queue_wait(&req->wait_queue);
                    self->waiting_for = NULL;
                    /* the list may have changed while we slept; rescan */
                    retry = true;
                    waited = true;
                    break;
                }
            }
        }
    } while (retry);

    return waited;
}
2535
/*
 * Return values:
 * 0 - success
 * -EINVAL - backing format specified, but no file
 * -ENOSPC - can't update the backing file because no space is left in the
 * image file header
 * -ENOTSUP - format driver doesn't support changing the backing file
 */
int bdrv_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    BlockDriver *drv = bs->drv;
    int ret;

    /* Backing file format doesn't make sense without a backing file */
    if (backing_fmt && !backing_file) {
        return -EINVAL;
    }

    if (drv->bdrv_change_backing_file != NULL) {
        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
    } else {
        ret = -ENOTSUP;
    }

    if (ret == 0) {
        /* keep the in-memory copies in sync; NULL is stored as "" */
        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
    }
    return ret;
}
2567
/*
 * Finds the image layer in the chain that has 'bs' as its backing file.
 *
 * active is the current topmost image.
 *
 * Returns NULL if bs is not found in active's image chain,
 * or if active == bs.
 *
 * Returns the bottommost base image if bs == NULL.
 */
BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
                                    BlockDriverState *bs)
{
    /* walk down the backing chain until bs is active's backing file */
    while (active && bs != active->backing_hd) {
        active = active->backing_hd;
    }

    return active;
}
6ebdcee2 2587
/* Given a BDS, searches for the base layer. */
BlockDriverState *bdrv_find_base(BlockDriverState *bs)
{
    /* bs == NULL makes bdrv_find_overlay() return the bottommost image */
    return bdrv_find_overlay(bs, NULL);
}
2593
/* Queue element used by bdrv_drop_intermediate() to remember the BDSes
 * that are to be unlinked and deleted. */
typedef struct BlkIntermediateStates {
    BlockDriverState *bs;
    QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
} BlkIntermediateStates;
2598
2599
/*
 * Drops images above 'base' up to and including 'top', and sets the image
 * above 'top' to have base as its backing file.
 *
 * Requires that the overlay to 'top' is opened r/w, so that the backing file
 * information in 'bs' can be properly updated.
 *
 * E.g., this will convert the following chain:
 * bottom <- base <- intermediate <- top <- active
 *
 * to
 *
 * bottom <- base <- active
 *
 * It is allowed for bottom==base, in which case it converts:
 *
 * base <- intermediate <- top <- active
 *
 * to
 *
 * base <- active
 *
 * If backing_file_str is non-NULL, it will be used when modifying top's
 * overlay image metadata.
 *
 * Error conditions:
 * if active == top, that is considered an error
 *
 */
int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
                           BlockDriverState *base, const char *backing_file_str)
{
    BlockDriverState *intermediate;
    BlockDriverState *base_bs = NULL;
    BlockDriverState *new_top_bs = NULL;
    BlkIntermediateStates *intermediate_state, *next;
    int ret = -EIO;

    QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
    QSIMPLEQ_INIT(&states_to_delete);

    if (!top->drv || !base->drv) {
        goto exit;
    }

    new_top_bs = bdrv_find_overlay(active, top);

    if (new_top_bs == NULL) {
        /* we could not find the image above 'top', this is an error */
        goto exit;
    }

    /* special case of new_top_bs->backing_hd already pointing to base - nothing
     * to do, no intermediate images */
    if (new_top_bs->backing_hd == base) {
        ret = 0;
        goto exit;
    }

    intermediate = top;

    /* now we will go down through the list, and add each BDS we find
     * into our deletion queue, until we hit the 'base'
     */
    while (intermediate) {
        intermediate_state = g_new0(BlkIntermediateStates, 1);
        intermediate_state->bs = intermediate;
        QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);

        if (intermediate->backing_hd == base) {
            base_bs = intermediate->backing_hd;
            break;
        }
        intermediate = intermediate->backing_hd;
    }
    if (base_bs == NULL) {
        /* something went wrong, we did not end at the base. safely
         * unravel everything, and exit with error */
        goto exit;
    }

    /* success - we can delete the intermediate states, and link top->base */
    backing_file_str = backing_file_str ? backing_file_str : base_bs->filename;
    ret = bdrv_change_backing_file(new_top_bs, backing_file_str,
                                   base_bs->drv ? base_bs->drv->format_name : "");
    if (ret) {
        goto exit;
    }
    bdrv_set_backing_hd(new_top_bs, base_bs);

    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        /* so that bdrv_close() does not recursively close the chain */
        bdrv_set_backing_hd(intermediate_state->bs, NULL);
        bdrv_unref(intermediate_state->bs);
    }
    ret = 0;

exit:
    /* free the queue nodes themselves on both success and error paths */
    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        g_free(intermediate_state);
    }
    return ret;
}
2703
2704
71d0770c
AL
2705static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2706 size_t size)
2707{
2708 int64_t len;
2709
1dd3a447
KW
2710 if (size > INT_MAX) {
2711 return -EIO;
2712 }
2713
71d0770c
AL
2714 if (!bdrv_is_inserted(bs))
2715 return -ENOMEDIUM;
2716
2717 if (bs->growable)
2718 return 0;
2719
2720 len = bdrv_getlength(bs);
2721
fbb7b4e0
KW
2722 if (offset < 0)
2723 return -EIO;
2724
2725 if ((offset > len) || (len - offset < size))
71d0770c
AL
2726 return -EIO;
2727
2728 return 0;
2729}
2730
/* Sector-granularity wrapper around bdrv_check_byte_request(). */
static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    /* reject negative counts and products that would overflow int */
    if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
        return -EIO;
    }

    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}
2741
/* Argument/result bundle for the synchronous-over-coroutine read/write
 * helpers (see bdrv_rw_co_entry() / bdrv_prwv_co()). */
typedef struct RwCo {
    BlockDriverState *bs;
    int64_t offset;          /* byte offset of the request */
    QEMUIOVector *qiov;      /* buffers; qiov->size is the request length */
    bool is_write;
    int ret;                 /* result; NOT_DONE until the coroutine ends */
    BdrvRequestFlags flags;
} RwCo;
2750
/* Coroutine entry point for bdrv_prwv_co(): dispatch to the byte-based
 * read or write path and record the result in rwco->ret. */
static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
                                      rwco->qiov->size, rwco->qiov,
                                      rwco->flags);
    } else {
        rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
                                       rwco->qiov->size, rwco->qiov,
                                       rwco->flags);
    }
}
e7a8a783 2765
/*
 * Process a vectored synchronous request using coroutines
 */
static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
                        QEMUIOVector *qiov, bool is_write,
                        BdrvRequestFlags flags)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .offset = offset,
        .qiov = qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
        .flags = flags,
    };

    /**
     * In sync call context, when the vcpu is blocked, this throttling timer
     * will not fire; so the I/O throttling function has to be disabled here
     * if it has been enabled.
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        /* drive the event loop until the coroutine stores its result */
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }
    return rwco.ret;
}
b338082b 2808
8d3b1a2d
KW
2809/*
2810 * Process a synchronous request using coroutines
2811 */
2812static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
4105eaaa 2813 int nb_sectors, bool is_write, BdrvRequestFlags flags)
8d3b1a2d
KW
2814{
2815 QEMUIOVector qiov;
2816 struct iovec iov = {
2817 .iov_base = (void *)buf,
2818 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2819 };
2820
da15ee51
KW
2821 if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2822 return -EINVAL;
2823 }
2824
8d3b1a2d 2825 qemu_iovec_init_external(&qiov, &iov, 1);
775aa8b6
KW
2826 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2827 &qiov, is_write, flags);
8d3b1a2d
KW
2828}
2829
/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
}
2836
/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
                          uint8_t *buf, int nb_sectors)
{
    bool enabled;
    int ret;

    /* remember the current setting and restore it afterwards */
    enabled = bs->io_limits_enabled;
    bs->io_limits_enabled = false;
    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
    bs->io_limits_enabled = enabled;
    return ret;
}
2850
/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
}
2862
/* Write zeroes over nb_sectors starting at sector_num.  @flags are ORed
 * into BDRV_REQ_ZERO_WRITE (e.g. BDRV_REQ_MAY_UNMAP). */
int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
                      int nb_sectors, BdrvRequestFlags flags)
{
    /* NULL buffer: the zero-write path does not read guest memory */
    return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
                      BDRV_REQ_ZERO_WRITE | flags);
}
2869
/*
 * Completely zero out a block device with the help of bdrv_write_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
 */
int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
{
    int64_t target_sectors, ret, nb_sectors, sector_num = 0;
    int n;

    target_sectors = bdrv_nb_sectors(bs);
    if (target_sectors < 0) {
        return target_sectors;
    }

    for (;;) {
        nb_sectors = target_sectors - sector_num;
        if (nb_sectors <= 0) {
            return 0;
        }
        /* bdrv_get_block_status() takes an int-sized sector count */
        if (nb_sectors > INT_MAX) {
            nb_sectors = INT_MAX;
        }
        ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
        if (ret < 0) {
            error_report("error getting block status at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            /* range already reads as zeroes, skip it */
            sector_num += n;
            continue;
        }
        ret = bdrv_write_zeroes(bs, sector_num, n, flags);
        if (ret < 0) {
            error_report("error writing zeroes at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        sector_num += n;
    }
}
2915
/* Read @bytes bytes at byte @offset into @buf.
 * Returns the number of bytes read on success, or -errno. */
int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = bytes,
    };
    int ret;

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
    if (ret < 0) {
        return ret;
    }

    return bytes;
}
2937
/* Write the contents of @qiov at byte @offset.
 * Returns qiov->size on success, or -errno. */
int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}
2949
/* Write @bytes bytes from @buf at byte @offset.
 * Returns the number of bytes written on success, or -errno. */
int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *) buf,
        .iov_len = bytes,
    };

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_pwritev(bs, offset, &qiov);
}
83f64091 2966
/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
                     const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(bs, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    /* No flush needed for cache modes that already do it */
    if (bs->enable_write_cache) {
        bdrv_flush(bs);
    }

    return 0;
}
2990
470c0504 2991static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
ab185921
SH
2992 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2993{
2994 /* Perform I/O through a temporary buffer so that users who scribble over
2995 * their read buffer while the operation is in progress do not end up
2996 * modifying the image file. This is critical for zero-copy guest I/O
2997 * where anything might happen inside guest memory.
2998 */
2999 void *bounce_buffer;
3000
79c053bd 3001 BlockDriver *drv = bs->drv;
ab185921
SH
3002 struct iovec iov;
3003 QEMUIOVector bounce_qiov;
3004 int64_t cluster_sector_num;
3005 int cluster_nb_sectors;
3006 size_t skip_bytes;
3007 int ret;
3008
3009 /* Cover entire cluster so no additional backing file I/O is required when
3010 * allocating cluster in the image file.
3011 */
343bded4
PB
3012 bdrv_round_to_clusters(bs, sector_num, nb_sectors,
3013 &cluster_sector_num, &cluster_nb_sectors);
ab185921 3014
470c0504
SH
3015 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
3016 cluster_sector_num, cluster_nb_sectors);
ab185921
SH
3017
3018 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
857d4f46
KW
3019 iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
3020 if (bounce_buffer == NULL) {
3021 ret = -ENOMEM;
3022 goto err;
3023 }
3024
ab185921
SH
3025 qemu_iovec_init_external(&bounce_qiov, &iov, 1);
3026
79c053bd
SH
3027 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
3028 &bounce_qiov);
ab185921
SH
3029 if (ret < 0) {
3030 goto err;
3031 }
3032
79c053bd
SH
3033 if (drv->bdrv_co_write_zeroes &&
3034 buffer_is_zero(bounce_buffer, iov.iov_len)) {
621f0589 3035 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
aa7bfbff 3036 cluster_nb_sectors, 0);
79c053bd 3037 } else {
f05fa4ad
PB
3038 /* This does not change the data on the disk, it is not necessary
3039 * to flush even in cache=writethrough mode.
3040 */
79c053bd 3041 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
ab185921 3042 &bounce_qiov);
79c053bd
SH
3043 }
3044
ab185921
SH
3045 if (ret < 0) {
3046 /* It might be okay to ignore write errors for guest requests. If this
3047 * is a deliberate copy-on-read then we don't want to ignore the error.
3048 * Simply report it in all cases.
3049 */
3050 goto err;
3051 }
3052
3053 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
03396148
MT
3054 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
3055 nb_sectors * BDRV_SECTOR_SIZE);
ab185921
SH
3056
3057err:
3058 qemu_vfree(bounce_buffer);
3059 return ret;
3060}
3061
c5fbe571 3062/*
d0c7f642
KW
3063 * Forwards an already correctly aligned request to the BlockDriver. This
3064 * handles copy on read and zeroing after EOF; any other features must be
3065 * implemented by the caller.
c5fbe571 3066 */
d0c7f642 3067static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
65afd211 3068 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
ec746e10 3069 int64_t align, QEMUIOVector *qiov, int flags)
da1fa91d
KW
3070{
3071 BlockDriver *drv = bs->drv;
dbffbdcf 3072 int ret;
da1fa91d 3073
d0c7f642
KW
3074 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3075 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
da1fa91d 3076
d0c7f642
KW
3077 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3078 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
8eb029c2 3079 assert(!qiov || bytes == qiov->size);
d0c7f642
KW
3080
3081 /* Handle Copy on Read and associated serialisation */
470c0504 3082 if (flags & BDRV_REQ_COPY_ON_READ) {
7327145f
KW
3083 /* If we touch the same cluster it counts as an overlap. This
3084 * guarantees that allocating writes will be serialized and not race
3085 * with each other for the same cluster. For example, in copy-on-read
3086 * it ensures that the CoR read and write operations are atomic and
3087 * guest writes cannot interleave between them. */
3088 mark_request_serialising(req, bdrv_get_cluster_size(bs));
470c0504
SH
3089 }
3090
2dbafdc0 3091 wait_serialising_requests(req);
f4658285 3092
470c0504 3093 if (flags & BDRV_REQ_COPY_ON_READ) {
ab185921
SH
3094 int pnum;
3095
bdad13b9 3096 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
ab185921
SH
3097 if (ret < 0) {
3098 goto out;
3099 }
3100
3101 if (!ret || pnum != nb_sectors) {
470c0504 3102 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
ab185921
SH
3103 goto out;
3104 }
3105 }
3106
d0c7f642 3107 /* Forward the request to the BlockDriver */
893a8f62
MK
3108 if (!(bs->zero_beyond_eof && bs->growable)) {
3109 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3110 } else {
3111 /* Read zeros after EOF of growable BDSes */
4049082c 3112 int64_t total_sectors, max_nb_sectors;
893a8f62 3113
4049082c
MA
3114 total_sectors = bdrv_nb_sectors(bs);
3115 if (total_sectors < 0) {
3116 ret = total_sectors;
893a8f62
MK
3117 goto out;
3118 }
3119
5f5bcd80
KW
3120 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
3121 align >> BDRV_SECTOR_BITS);
893a8f62 3122 if (max_nb_sectors > 0) {
33f461e0
KW
3123 QEMUIOVector local_qiov;
3124 size_t local_sectors;
3125
3126 max_nb_sectors = MIN(max_nb_sectors, SIZE_MAX / BDRV_SECTOR_BITS);
3127 local_sectors = MIN(max_nb_sectors, nb_sectors);
3128
3129 qemu_iovec_init(&local_qiov, qiov->niov);
3130 qemu_iovec_concat(&local_qiov, qiov, 0,
3131 local_sectors * BDRV_SECTOR_SIZE);
3132
3133 ret = drv->bdrv_co_readv(bs, sector_num, local_sectors,
3134 &local_qiov);
3135
3136 qemu_iovec_destroy(&local_qiov);
893a8f62
MK
3137 } else {
3138 ret = 0;
3139 }
3140
3141 /* Reading beyond end of file is supposed to produce zeroes */
3142 if (ret == 0 && total_sectors < sector_num + nb_sectors) {
3143 uint64_t offset = MAX(0, total_sectors - sector_num);
3144 uint64_t bytes = (sector_num + nb_sectors - offset) *
3145 BDRV_SECTOR_SIZE;
3146 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
3147 }
3148 }
ab185921
SH
3149
3150out:
dbffbdcf 3151 return ret;
da1fa91d
KW
3152}
3153
d0c7f642
KW
3154/*
3155 * Handle a read request in coroutine context
3156 */
1b0288ae
KW
3157static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
3158 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
d0c7f642
KW
3159 BdrvRequestFlags flags)
3160{
3161 BlockDriver *drv = bs->drv;
65afd211
KW
3162 BdrvTrackedRequest req;
3163
1b0288ae
KW
3164 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3165 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3166 uint8_t *head_buf = NULL;
3167 uint8_t *tail_buf = NULL;
3168 QEMUIOVector local_qiov;
3169 bool use_local_qiov = false;
d0c7f642
KW
3170 int ret;
3171
3172 if (!drv) {
3173 return -ENOMEDIUM;
3174 }
1b0288ae 3175 if (bdrv_check_byte_request(bs, offset, bytes)) {
d0c7f642
KW
3176 return -EIO;
3177 }
3178
3179 if (bs->copy_on_read) {
3180 flags |= BDRV_REQ_COPY_ON_READ;
3181 }
3182
3183 /* throttling disk I/O */
3184 if (bs->io_limits_enabled) {
d5103588 3185 bdrv_io_limits_intercept(bs, bytes, false);
1b0288ae
KW
3186 }
3187
3188 /* Align read if necessary by padding qiov */
3189 if (offset & (align - 1)) {
3190 head_buf = qemu_blockalign(bs, align);
3191 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3192 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3193 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3194 use_local_qiov = true;
3195
3196 bytes += offset & (align - 1);
3197 offset = offset & ~(align - 1);
3198 }
3199
3200 if ((offset + bytes) & (align - 1)) {
3201 if (!use_local_qiov) {
3202 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3203 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3204 use_local_qiov = true;
3205 }
3206 tail_buf = qemu_blockalign(bs, align);
3207 qemu_iovec_add(&local_qiov, tail_buf,
3208 align - ((offset + bytes) & (align - 1)));
3209
3210 bytes = ROUND_UP(bytes, align);
3211 }
3212
65afd211 3213 tracked_request_begin(&req, bs, offset, bytes, false);
ec746e10 3214 ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
1b0288ae
KW
3215 use_local_qiov ? &local_qiov : qiov,
3216 flags);
65afd211 3217 tracked_request_end(&req);
1b0288ae
KW
3218
3219 if (use_local_qiov) {
3220 qemu_iovec_destroy(&local_qiov);
3221 qemu_vfree(head_buf);
3222 qemu_vfree(tail_buf);
d0c7f642
KW
3223 }
3224
d0c7f642
KW
3225 return ret;
3226}
3227
1b0288ae
KW
3228static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3229 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3230 BdrvRequestFlags flags)
3231{
3232 if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
3233 return -EINVAL;
3234 }
3235
3236 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3237 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3238}
3239
c5fbe571 3240int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
da1fa91d
KW
3241 int nb_sectors, QEMUIOVector *qiov)
3242{
c5fbe571 3243 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
da1fa91d 3244
470c0504
SH
3245 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3246}
3247
3248int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3249 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3250{
3251 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3252
3253 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3254 BDRV_REQ_COPY_ON_READ);
c5fbe571
SH
3255}
3256
c31cb707
PL
3257/* if no limit is specified in the BlockLimits use a default
3258 * of 32768 512-byte sectors (16 MiB) per request.
3259 */
3260#define MAX_WRITE_ZEROES_DEFAULT 32768
3261
f08f2dda 3262static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
aa7bfbff 3263 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
f08f2dda
SH
3264{
3265 BlockDriver *drv = bs->drv;
3266 QEMUIOVector qiov;
c31cb707
PL
3267 struct iovec iov = {0};
3268 int ret = 0;
f08f2dda 3269
c31cb707
PL
3270 int max_write_zeroes = bs->bl.max_write_zeroes ?
3271 bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
621f0589 3272
c31cb707
PL
3273 while (nb_sectors > 0 && !ret) {
3274 int num = nb_sectors;
3275
b8d71c09
PB
3276 /* Align request. Block drivers can expect the "bulk" of the request
3277 * to be aligned.
3278 */
3279 if (bs->bl.write_zeroes_alignment
3280 && num > bs->bl.write_zeroes_alignment) {
3281 if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3282 /* Make a small request up to the first aligned sector. */
c31cb707 3283 num = bs->bl.write_zeroes_alignment;
b8d71c09
PB
3284 num -= sector_num % bs->bl.write_zeroes_alignment;
3285 } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3286 /* Shorten the request to the last aligned sector. num cannot
3287 * underflow because num > bs->bl.write_zeroes_alignment.
3288 */
3289 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
c31cb707 3290 }
621f0589 3291 }
f08f2dda 3292
c31cb707
PL
3293 /* limit request size */
3294 if (num > max_write_zeroes) {
3295 num = max_write_zeroes;
3296 }
3297
3298 ret = -ENOTSUP;
3299 /* First try the efficient write zeroes operation */
3300 if (drv->bdrv_co_write_zeroes) {
3301 ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3302 }
3303
3304 if (ret == -ENOTSUP) {
3305 /* Fall back to bounce buffer if write zeroes is unsupported */
3306 iov.iov_len = num * BDRV_SECTOR_SIZE;
3307 if (iov.iov_base == NULL) {
857d4f46
KW
3308 iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
3309 if (iov.iov_base == NULL) {
3310 ret = -ENOMEM;
3311 goto fail;
3312 }
b8d71c09 3313 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
c31cb707
PL
3314 }
3315 qemu_iovec_init_external(&qiov, &iov, 1);
f08f2dda 3316
c31cb707 3317 ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
b8d71c09
PB
3318
3319 /* Keep bounce buffer around if it is big enough for all
3320 * all future requests.
3321 */
3322 if (num < max_write_zeroes) {
3323 qemu_vfree(iov.iov_base);
3324 iov.iov_base = NULL;
3325 }
c31cb707
PL
3326 }
3327
3328 sector_num += num;
3329 nb_sectors -= num;
3330 }
f08f2dda 3331
857d4f46 3332fail:
f08f2dda
SH
3333 qemu_vfree(iov.iov_base);
3334 return ret;
3335}
3336
c5fbe571 3337/*
b404f720 3338 * Forwards an already correctly aligned write request to the BlockDriver.
c5fbe571 3339 */
b404f720 3340static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
65afd211
KW
3341 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3342 QEMUIOVector *qiov, int flags)
c5fbe571
SH
3343{
3344 BlockDriver *drv = bs->drv;
28de2dcd 3345 bool waited;
6b7cb247 3346 int ret;
da1fa91d 3347
b404f720
KW
3348 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3349 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
f4658285 3350
b404f720
KW
3351 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3352 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
8eb029c2 3353 assert(!qiov || bytes == qiov->size);
cc0681c4 3354
28de2dcd
KW
3355 waited = wait_serialising_requests(req);
3356 assert(!waited || !req->serialising);
af91f9a7
KW
3357 assert(req->overlap_offset <= offset);
3358 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
244eadef 3359
65afd211 3360 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
d616b224 3361
465bee1d
PL
3362 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
3363 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
3364 qemu_iovec_is_zero(qiov)) {
3365 flags |= BDRV_REQ_ZERO_WRITE;
3366 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
3367 flags |= BDRV_REQ_MAY_UNMAP;
3368 }
3369 }
3370
d616b224
SH
3371 if (ret < 0) {
3372 /* Do nothing, write notifier decided to fail this request */
3373 } else if (flags & BDRV_REQ_ZERO_WRITE) {
9e1cb96d 3374 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
aa7bfbff 3375 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
f08f2dda 3376 } else {
9e1cb96d 3377 BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
f08f2dda
SH
3378 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3379 }
9e1cb96d 3380 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
6b7cb247 3381
f05fa4ad
PB
3382 if (ret == 0 && !bs->enable_write_cache) {
3383 ret = bdrv_co_flush(bs);
3384 }
3385
e4654d2d 3386 bdrv_set_dirty(bs, sector_num, nb_sectors);
da1fa91d 3387
5366d0c8 3388 block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);
5e5a94b6 3389
df2a6f29
PB
3390 if (bs->growable && ret >= 0) {
3391 bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3392 }
da1fa91d 3393
6b7cb247 3394 return ret;
da1fa91d
KW
3395}
3396
b404f720
KW
3397/*
3398 * Handle a write request in coroutine context
3399 */
6601553e
KW
3400static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3401 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
b404f720
KW
3402 BdrvRequestFlags flags)
3403{
65afd211 3404 BdrvTrackedRequest req;
3b8242e0
KW
3405 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3406 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3407 uint8_t *head_buf = NULL;
3408 uint8_t *tail_buf = NULL;
3409 QEMUIOVector local_qiov;
3410 bool use_local_qiov = false;
b404f720
KW
3411 int ret;
3412
3413 if (!bs->drv) {
3414 return -ENOMEDIUM;
3415 }
3416 if (bs->read_only) {
3417 return -EACCES;
3418 }
6601553e 3419 if (bdrv_check_byte_request(bs, offset, bytes)) {
b404f720
KW
3420 return -EIO;
3421 }
3422
b404f720
KW
3423 /* throttling disk I/O */
3424 if (bs->io_limits_enabled) {
d5103588 3425 bdrv_io_limits_intercept(bs, bytes, true);
b404f720
KW
3426 }
3427
3b8242e0
KW
3428 /*
3429 * Align write if necessary by performing a read-modify-write cycle.
3430 * Pad qiov with the read parts and be sure to have a tracked request not
3431 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3432 */
65afd211 3433 tracked_request_begin(&req, bs, offset, bytes, true);
3b8242e0
KW
3434
3435 if (offset & (align - 1)) {
3436 QEMUIOVector head_qiov;
3437 struct iovec head_iov;
3438
3439 mark_request_serialising(&req, align);
3440 wait_serialising_requests(&req);
3441
3442 head_buf = qemu_blockalign(bs, align);
3443 head_iov = (struct iovec) {
3444 .iov_base = head_buf,
3445 .iov_len = align,
3446 };
3447 qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3448
9e1cb96d 3449 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3b8242e0
KW
3450 ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3451 align, &head_qiov, 0);
3452 if (ret < 0) {
3453 goto fail;
3454 }
9e1cb96d 3455 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3b8242e0
KW
3456
3457 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3458 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3459 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3460 use_local_qiov = true;
3461
3462 bytes += offset & (align - 1);
3463 offset = offset & ~(align - 1);
3464 }
3465
3466 if ((offset + bytes) & (align - 1)) {
3467 QEMUIOVector tail_qiov;
3468 struct iovec tail_iov;
3469 size_t tail_bytes;
28de2dcd 3470 bool waited;
3b8242e0
KW
3471
3472 mark_request_serialising(&req, align);
28de2dcd
KW
3473 waited = wait_serialising_requests(&req);
3474 assert(!waited || !use_local_qiov);
3b8242e0
KW
3475
3476 tail_buf = qemu_blockalign(bs, align);
3477 tail_iov = (struct iovec) {
3478 .iov_base = tail_buf,
3479 .iov_len = align,
3480 };
3481 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3482
9e1cb96d 3483 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3b8242e0
KW
3484 ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3485 align, &tail_qiov, 0);
3486 if (ret < 0) {
3487 goto fail;
3488 }
9e1cb96d 3489 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3b8242e0
KW
3490
3491 if (!use_local_qiov) {
3492 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3493 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3494 use_local_qiov = true;
3495 }
3496
3497 tail_bytes = (offset + bytes) & (align - 1);
3498 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3499
3500 bytes = ROUND_UP(bytes, align);
3501 }
3502
3503 ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3504 use_local_qiov ? &local_qiov : qiov,
3505 flags);
3506
3507fail:
65afd211 3508 tracked_request_end(&req);
b404f720 3509
3b8242e0
KW
3510 if (use_local_qiov) {
3511 qemu_iovec_destroy(&local_qiov);
3b8242e0 3512 }
99c4a85c
KW
3513 qemu_vfree(head_buf);
3514 qemu_vfree(tail_buf);
3b8242e0 3515
b404f720
KW
3516 return ret;
3517}
3518
6601553e
KW
3519static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3520 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3521 BdrvRequestFlags flags)
3522{
3523 if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3524 return -EINVAL;
3525 }
3526
3527 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3528 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3529}
3530
c5fbe571
SH
3531int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3532 int nb_sectors, QEMUIOVector *qiov)
3533{
3534 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3535
f08f2dda
SH
3536 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3537}
3538
3539int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
aa7bfbff
PL
3540 int64_t sector_num, int nb_sectors,
3541 BdrvRequestFlags flags)
f08f2dda 3542{
94d6ff21 3543 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
f08f2dda 3544
d32f35cb
PL
3545 if (!(bs->open_flags & BDRV_O_UNMAP)) {
3546 flags &= ~BDRV_REQ_MAY_UNMAP;
3547 }
3548
f08f2dda 3549 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
aa7bfbff 3550 BDRV_REQ_ZERO_WRITE | flags);
c5fbe571
SH
3551}
3552
83f64091
FB
3553/**
3554 * Truncate file to 'offset' bytes (needed only for file protocols)
3555 */
3556int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3557{
3558 BlockDriver *drv = bs->drv;
51762288 3559 int ret;
83f64091 3560 if (!drv)
19cb3738 3561 return -ENOMEDIUM;
83f64091
FB
3562 if (!drv->bdrv_truncate)
3563 return -ENOTSUP;
59f2689d
NS
3564 if (bs->read_only)
3565 return -EACCES;
9c75e168 3566
51762288
SH
3567 ret = drv->bdrv_truncate(bs, offset);
3568 if (ret == 0) {
3569 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
145feb17 3570 bdrv_dev_resize_cb(bs);
51762288
SH
3571 }
3572 return ret;
83f64091
FB
3573}
3574
4a1d5e1f
FZ
3575/**
3576 * Length of a allocated file in bytes. Sparse files are counted by actual
3577 * allocated space. Return < 0 if error or unknown.
3578 */
3579int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3580{
3581 BlockDriver *drv = bs->drv;
3582 if (!drv) {
3583 return -ENOMEDIUM;
3584 }
3585 if (drv->bdrv_get_allocated_file_size) {
3586 return drv->bdrv_get_allocated_file_size(bs);
3587 }
3588 if (bs->file) {
3589 return bdrv_get_allocated_file_size(bs->file);
3590 }
3591 return -ENOTSUP;
3592}
3593
83f64091 3594/**
65a9bb25 3595 * Return number of sectors on success, -errno on error.
83f64091 3596 */
65a9bb25 3597int64_t bdrv_nb_sectors(BlockDriverState *bs)
83f64091
FB
3598{
3599 BlockDriver *drv = bs->drv;
65a9bb25 3600
83f64091 3601 if (!drv)
19cb3738 3602 return -ENOMEDIUM;
51762288 3603
b94a2610
KW
3604 if (drv->has_variable_length) {
3605 int ret = refresh_total_sectors(bs, bs->total_sectors);
3606 if (ret < 0) {
3607 return ret;
46a4e4e6 3608 }
83f64091 3609 }
65a9bb25
MA
3610 return bs->total_sectors;
3611}
3612
3613/**
3614 * Return length in bytes on success, -errno on error.
3615 * The length is always a multiple of BDRV_SECTOR_SIZE.
3616 */
3617int64_t bdrv_getlength(BlockDriverState *bs)
3618{
3619 int64_t ret = bdrv_nb_sectors(bs);
3620
3621 return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE;
fc01f7e7
FB
3622}
3623
19cb3738 3624/* return 0 as number of sectors if no device present or error */
96b8f136 3625void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
fc01f7e7 3626{
65a9bb25
MA
3627 int64_t nb_sectors = bdrv_nb_sectors(bs);
3628
3629 *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
fc01f7e7 3630}
cf98951b 3631
ff06f5f3
PB
3632void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3633 BlockdevOnError on_write_error)
abd7f68d
MA
3634{
3635 bs->on_read_error = on_read_error;
3636 bs->on_write_error = on_write_error;
3637}
3638
1ceee0d5 3639BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
abd7f68d
MA
3640{
3641 return is_read ? bs->on_read_error : bs->on_write_error;
3642}
3643
3e1caa5f
PB
3644BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3645{
3646 BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3647
3648 switch (on_err) {
3649 case BLOCKDEV_ON_ERROR_ENOSPC:
a589569f
WX
3650 return (error == ENOSPC) ?
3651 BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
3e1caa5f 3652 case BLOCKDEV_ON_ERROR_STOP:
a589569f 3653 return BLOCK_ERROR_ACTION_STOP;
3e1caa5f 3654 case BLOCKDEV_ON_ERROR_REPORT:
a589569f 3655 return BLOCK_ERROR_ACTION_REPORT;
3e1caa5f 3656 case BLOCKDEV_ON_ERROR_IGNORE:
a589569f 3657 return BLOCK_ERROR_ACTION_IGNORE;
3e1caa5f
PB
3658 default:
3659 abort();
3660 }
3661}
3662
c7c2ff0c
LC
3663static void send_qmp_error_event(BlockDriverState *bs,
3664 BlockErrorAction action,
3665 bool is_read, int error)
3666{
3667 BlockErrorAction ac;
3668
3669 ac = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
3670 qapi_event_send_block_io_error(bdrv_get_device_name(bs), ac, action,
3671 bdrv_iostatus_is_enabled(bs),
624ff573
LC
3672 error == ENOSPC, strerror(error),
3673 &error_abort);
c7c2ff0c
LC
3674}
3675
3e1caa5f
PB
3676/* This is done by device models because, while the block layer knows
3677 * about the error, it does not know whether an operation comes from
3678 * the device or the block layer (from a job, for example).
3679 */
3680void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3681 bool is_read, int error)
3682{
3683 assert(error >= 0);
2bd3bce8 3684
a589569f 3685 if (action == BLOCK_ERROR_ACTION_STOP) {
2bd3bce8
PB
3686 /* First set the iostatus, so that "info block" returns an iostatus
3687 * that matches the events raised so far (an additional error iostatus
3688 * is fine, but not a lost one).
3689 */
3e1caa5f 3690 bdrv_iostatus_set_err(bs, error);
2bd3bce8
PB
3691
3692 /* Then raise the request to stop the VM and the event.
3693 * qemu_system_vmstop_request_prepare has two effects. First,
3694 * it ensures that the STOP event always comes after the
3695 * BLOCK_IO_ERROR event. Second, it ensures that even if management
3696 * can observe the STOP event and do a "cont" before the STOP
3697 * event is issued, the VM will not stop. In this case, vm_start()
3698 * also ensures that the STOP/RESUME pair of events is emitted.
3699 */
3700 qemu_system_vmstop_request_prepare();
c7c2ff0c 3701 send_qmp_error_event(bs, action, is_read, error);
2bd3bce8
PB
3702 qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
3703 } else {
c7c2ff0c 3704 send_qmp_error_event(bs, action, is_read, error);
3e1caa5f
PB
3705 }
3706}
3707
b338082b
FB
3708int bdrv_is_read_only(BlockDriverState *bs)
3709{
3710 return bs->read_only;
3711}
3712
985a03b0
TS
3713int bdrv_is_sg(BlockDriverState *bs)
3714{
3715 return bs->sg;
3716}
3717
e900a7b7
CH
3718int bdrv_enable_write_cache(BlockDriverState *bs)
3719{
3720 return bs->enable_write_cache;
3721}
3722
425b0148
PB
3723void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3724{
3725 bs->enable_write_cache = wce;
55b110f2
JC
3726
3727 /* so a reopen() will preserve wce */
3728 if (wce) {
3729 bs->open_flags |= BDRV_O_CACHE_WB;
3730 } else {
3731 bs->open_flags &= ~BDRV_O_CACHE_WB;
3732 }
425b0148
PB
3733}
3734
ea2384d3
FB
3735int bdrv_is_encrypted(BlockDriverState *bs)
3736{
3737 if (bs->backing_hd && bs->backing_hd->encrypted)
3738 return 1;
3739 return bs->encrypted;
3740}
3741
c0f4ce77
AL
3742int bdrv_key_required(BlockDriverState *bs)
3743{
3744 BlockDriverState *backing_hd = bs->backing_hd;
3745
3746 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3747 return 1;
3748 return (bs->encrypted && !bs->valid_key);
3749}
3750
ea2384d3
FB
3751int bdrv_set_key(BlockDriverState *bs, const char *key)
3752{
3753 int ret;
3754 if (bs->backing_hd && bs->backing_hd->encrypted) {
3755 ret = bdrv_set_key(bs->backing_hd, key);
3756 if (ret < 0)
3757 return ret;
3758 if (!bs->encrypted)
3759 return 0;
3760 }
fd04a2ae
SH
3761 if (!bs->encrypted) {
3762 return -EINVAL;
3763 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3764 return -ENOMEDIUM;
3765 }
c0f4ce77 3766 ret = bs->drv->bdrv_set_key(bs, key);
bb5fc20f
AL
3767 if (ret < 0) {
3768 bs->valid_key = 0;
3769 } else if (!bs->valid_key) {
3770 bs->valid_key = 1;
3771 /* call the change callback now, we skipped it on open */
7d4b4ba5 3772 bdrv_dev_change_media_cb(bs, true);
bb5fc20f 3773 }
c0f4ce77 3774 return ret;
ea2384d3
FB
3775}
3776
f8d6bba1 3777const char *bdrv_get_format_name(BlockDriverState *bs)
ea2384d3 3778{
f8d6bba1 3779 return bs->drv ? bs->drv->format_name : NULL;
ea2384d3
FB
3780}
3781
ada42401
SH
3782static int qsort_strcmp(const void *a, const void *b)
3783{
3784 return strcmp(a, b);
3785}
3786
5fafdf24 3787void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
ea2384d3
FB
3788 void *opaque)
3789{
3790 BlockDriver *drv;
e855e4fb 3791 int count = 0;
ada42401 3792 int i;
e855e4fb 3793 const char **formats = NULL;
ea2384d3 3794
8a22f02a 3795 QLIST_FOREACH(drv, &bdrv_drivers, list) {
e855e4fb
JC
3796 if (drv->format_name) {
3797 bool found = false;
3798 int i = count;
3799 while (formats && i && !found) {
3800 found = !strcmp(formats[--i], drv->format_name);
3801 }
3802
3803 if (!found) {
5839e53b 3804 formats = g_renew(const char *, formats, count + 1);
e855e4fb 3805 formats[count++] = drv->format_name;
e855e4fb
JC
3806 }
3807 }
ea2384d3 3808 }
ada42401
SH
3809
3810 qsort(formats, count, sizeof(formats[0]), qsort_strcmp);
3811
3812 for (i = 0; i < count; i++) {
3813 it(opaque, formats[i]);
3814 }
3815
e855e4fb 3816 g_free(formats);
ea2384d3
FB
3817}
3818
dc364f4c 3819/* This function is to find block backend bs */
b338082b
FB
3820BlockDriverState *bdrv_find(const char *name)
3821{
3822 BlockDriverState *bs;
3823
dc364f4c 3824 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1b7bdbc1 3825 if (!strcmp(name, bs->device_name)) {
b338082b 3826 return bs;
1b7bdbc1 3827 }
b338082b
FB
3828 }
3829 return NULL;
3830}
3831
dc364f4c
BC
3832/* This function is to find a node in the bs graph */
3833BlockDriverState *bdrv_find_node(const char *node_name)
3834{
3835 BlockDriverState *bs;
3836
3837 assert(node_name);
3838
3839 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3840 if (!strcmp(node_name, bs->node_name)) {
3841 return bs;
3842 }
3843 }
3844 return NULL;
3845}
3846
c13163fb
BC
3847/* Put this QMP function here so it can access the static graph_bdrv_states. */
3848BlockDeviceInfoList *bdrv_named_nodes_list(void)
3849{
3850 BlockDeviceInfoList *list, *entry;
3851 BlockDriverState *bs;
3852
3853 list = NULL;
3854 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3855 entry = g_malloc0(sizeof(*entry));
3856 entry->value = bdrv_block_device_info(bs);
3857 entry->next = list;
3858 list = entry;
3859 }
3860
3861 return list;
3862}
3863
12d3ba82
BC
3864BlockDriverState *bdrv_lookup_bs(const char *device,
3865 const char *node_name,
3866 Error **errp)
3867{
3868 BlockDriverState *bs = NULL;
3869
12d3ba82
BC
3870 if (device) {
3871 bs = bdrv_find(device);
3872
dd67fa50
BC
3873 if (bs) {
3874 return bs;
12d3ba82 3875 }
12d3ba82
BC
3876 }
3877
dd67fa50
BC
3878 if (node_name) {
3879 bs = bdrv_find_node(node_name);
12d3ba82 3880
dd67fa50
BC
3881 if (bs) {
3882 return bs;
3883 }
12d3ba82
BC
3884 }
3885
dd67fa50
BC
3886 error_setg(errp, "Cannot find device=%s nor node_name=%s",
3887 device ? device : "",
3888 node_name ? node_name : "");
3889 return NULL;
12d3ba82
BC
3890}
3891
5a6684d2
JC
3892/* If 'base' is in the same chain as 'top', return true. Otherwise,
3893 * return false. If either argument is NULL, return false. */
3894bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
3895{
3896 while (top && top != base) {
3897 top = top->backing_hd;
3898 }
3899
3900 return top != NULL;
3901}
3902
2f399b0a
MA
3903BlockDriverState *bdrv_next(BlockDriverState *bs)
3904{
3905 if (!bs) {
3906 return QTAILQ_FIRST(&bdrv_states);
3907 }
dc364f4c 3908 return QTAILQ_NEXT(bs, device_list);
2f399b0a
MA
3909}
3910
51de9760 3911void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
81d0912d
FB
3912{
3913 BlockDriverState *bs;
3914
dc364f4c 3915 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
51de9760 3916 it(opaque, bs);
81d0912d
FB
3917 }
3918}
3919
ea2384d3
FB
3920const char *bdrv_get_device_name(BlockDriverState *bs)
3921{
3922 return bs->device_name;
3923}
3924
c8433287
MA
3925int bdrv_get_flags(BlockDriverState *bs)
3926{
3927 return bs->open_flags;
3928}
3929
f0f0fdfe 3930int bdrv_flush_all(void)
c6ca28d6
AL
3931{
3932 BlockDriverState *bs;
f0f0fdfe 3933 int result = 0;
c6ca28d6 3934
dc364f4c 3935 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
ed78cda3
SH
3936 AioContext *aio_context = bdrv_get_aio_context(bs);
3937 int ret;
3938
3939 aio_context_acquire(aio_context);
3940 ret = bdrv_flush(bs);
f0f0fdfe
KW
3941 if (ret < 0 && !result) {
3942 result = ret;
3943 }
ed78cda3 3944 aio_context_release(aio_context);
1b7bdbc1 3945 }
f0f0fdfe
KW
3946
3947 return result;
c6ca28d6
AL
3948}
3949
3ac21627
PL
3950int bdrv_has_zero_init_1(BlockDriverState *bs)
3951{
3952 return 1;
3953}
3954
f2feebbd
KW
3955int bdrv_has_zero_init(BlockDriverState *bs)
3956{
3957 assert(bs->drv);
3958
11212d8f
PB
3959 /* If BS is a copy on write image, it is initialized to
3960 the contents of the base image, which may not be zeroes. */
3961 if (bs->backing_hd) {
3962 return 0;
3963 }
336c1c12
KW
3964 if (bs->drv->bdrv_has_zero_init) {
3965 return bs->drv->bdrv_has_zero_init(bs);
f2feebbd
KW
3966 }
3967
3ac21627
PL
3968 /* safe default */
3969 return 0;
f2feebbd
KW
3970}
3971
4ce78691
PL
3972bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3973{
3974 BlockDriverInfo bdi;
3975
3976 if (bs->backing_hd) {
3977 return false;
3978 }
3979
3980 if (bdrv_get_info(bs, &bdi) == 0) {
3981 return bdi.unallocated_blocks_are_zero;
3982 }
3983
3984 return false;
3985}
3986
3987bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3988{
3989 BlockDriverInfo bdi;
3990
3991 if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3992 return false;
3993 }
3994
3995 if (bdrv_get_info(bs, &bdi) == 0) {
3996 return bdi.can_write_zeroes_with_unmap;
3997 }
3998
3999 return false;
4000}
4001
/* Argument/result bundle shared between bdrv_get_block_status() and its
 * coroutine entry point. */
typedef struct BdrvCoGetBlockStatusData {
    BlockDriverState *bs;
    BlockDriverState *base;
    int64_t sector_num;   /* first sector to query */
    int nb_sectors;       /* maximum number of sectors to consider */
    int *pnum;            /* out: sectors known to share the same state */
    int64_t ret;          /* BDRV_BLOCK_* bitmask or negative errno */
    bool done;            /* set once the coroutine has finished */
} BdrvCoGetBlockStatusData;
376ae3f1 4011
f58c7b35
TS
/*
 * Returns true iff the specified sector is present in the disk image. Drivers
 * not implementing the functionality are assumed to not support backing files,
 * hence all their sectors are reported as allocated.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 */
static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
                                                     int64_t sector_num,
                                                     int nb_sectors, int *pnum)
{
    int64_t total_sectors;
    int64_t n;
    int64_t ret, ret2;

    total_sectors = bdrv_nb_sectors(bs);
    if (total_sectors < 0) {
        /* bdrv_nb_sectors() failed; propagate its error code. */
        return total_sectors;
    }

    if (sector_num >= total_sectors) {
        *pnum = 0;
        return 0;
    }

    /* Clamp the request to the end of the image. */
    n = total_sectors - sector_num;
    if (n < nb_sectors) {
        nb_sectors = n;
    }

    if (!bs->drv->bdrv_co_get_block_status) {
        /* Driver has no status callback: report everything as allocated
         * data; protocol drivers additionally map 1:1 to their file. */
        *pnum = nb_sectors;
        ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
        if (bs->drv->protocol_name) {
            ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
        }
        return ret;
    }

    ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
    if (ret < 0) {
        *pnum = 0;
        return ret;
    }

    if (ret & BDRV_BLOCK_RAW) {
        /* Raw pass-through: the answer lives in bs->file at the reported
         * offset. */
        assert(ret & BDRV_BLOCK_OFFSET_VALID);
        return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
                                     *pnum, pnum);
    }

    if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
        ret |= BDRV_BLOCK_ALLOCATED;
    }

    if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
        /* Unallocated here: it may still read as zero, either because the
         * driver guarantees it or because it lies past the end of the
         * backing file. */
        if (bdrv_unallocated_blocks_are_zero(bs)) {
            ret |= BDRV_BLOCK_ZERO;
        } else if (bs->backing_hd) {
            BlockDriverState *bs2 = bs->backing_hd;
            int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
            if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
                ret |= BDRV_BLOCK_ZERO;
            }
        }
    }

    if (bs->file &&
        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
        (ret & BDRV_BLOCK_OFFSET_VALID)) {
        ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
                                        *pnum, pnum);
        if (ret2 >= 0) {
            /* Ignore errors. This is just providing extra information, it
             * is useful but not necessary.
             */
            ret |= (ret2 & BDRV_BLOCK_ZERO);
        }
    }

    return ret;
}
4102
b6b8a333
PB
4103/* Coroutine wrapper for bdrv_get_block_status() */
4104static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
060f51c9 4105{
b6b8a333 4106 BdrvCoGetBlockStatusData *data = opaque;
060f51c9
SH
4107 BlockDriverState *bs = data->bs;
4108
b6b8a333
PB
4109 data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
4110 data->pnum);
060f51c9
SH
4111 data->done = true;
4112}
4113
4114/*
b6b8a333 4115 * Synchronous wrapper around bdrv_co_get_block_status().
060f51c9 4116 *
b6b8a333 4117 * See bdrv_co_get_block_status() for details.
060f51c9 4118 */
b6b8a333
PB
4119int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
4120 int nb_sectors, int *pnum)
060f51c9 4121{
6aebab14 4122 Coroutine *co;
b6b8a333 4123 BdrvCoGetBlockStatusData data = {
6aebab14
SH
4124 .bs = bs,
4125 .sector_num = sector_num,
4126 .nb_sectors = nb_sectors,
4127 .pnum = pnum,
4128 .done = false,
4129 };
4130
bdad13b9
PB
4131 if (qemu_in_coroutine()) {
4132 /* Fast-path if already in coroutine context */
b6b8a333 4133 bdrv_get_block_status_co_entry(&data);
bdad13b9 4134 } else {
2572b37a
SH
4135 AioContext *aio_context = bdrv_get_aio_context(bs);
4136
b6b8a333 4137 co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
bdad13b9
PB
4138 qemu_coroutine_enter(co, &data);
4139 while (!data.done) {
2572b37a 4140 aio_poll(aio_context, true);
bdad13b9 4141 }
6aebab14
SH
4142 }
4143 return data.ret;
f58c7b35
TS
4144}
4145
b6b8a333
PB
4146int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
4147 int nb_sectors, int *pnum)
4148{
4333bb71
PB
4149 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
4150 if (ret < 0) {
4151 return ret;
4152 }
01fb2705 4153 return !!(ret & BDRV_BLOCK_ALLOCATED);
b6b8a333
PB
4154}
4155
188a7bbf
PB
/*
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
 * Return true if the given sector is allocated in any image between
 * BASE and TOP (inclusive). BASE can be NULL to check if the given
 * sector is allocated in any image of the chain. Return false otherwise.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 */
int bdrv_is_allocated_above(BlockDriverState *top,
                            BlockDriverState *base,
                            int64_t sector_num,
                            int nb_sectors, int *pnum)
{
    BlockDriverState *intermediate;
    int ret, n = nb_sectors;

    /* Walk the backing chain from top towards (but excluding) base. */
    intermediate = top;
    while (intermediate && intermediate != base) {
        int pnum_inter;
        ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
                                &pnum_inter);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            /* Allocated in this layer: report its extent. */
            *pnum = pnum_inter;
            return 1;
        }

        /*
         * [sector_num, nb_sectors] is unallocated on top but intermediate
         * might have
         *
         * [sector_num+x, nr_sectors] allocated.
         */
        /* Shrink the reported unallocated run, except when the smaller
         * pnum_inter merely reflects a short intermediate image. */
        if (n > pnum_inter &&
            (intermediate == top ||
             sector_num + pnum_inter < intermediate->total_sectors)) {
            n = pnum_inter;
        }

        intermediate = intermediate->backing_hd;
    }

    /* Unallocated in every inspected layer. */
    *pnum = n;
    return 0;
}
4206
045df330
AL
4207const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4208{
4209 if (bs->backing_hd && bs->backing_hd->encrypted)
4210 return bs->backing_file;
4211 else if (bs->encrypted)
4212 return bs->filename;
4213 else
4214 return NULL;
4215}
4216
/* Copy the backing file name into @filename; pstrcpy guarantees
 * NUL-termination within @filename_size. */
void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
{
    pstrcpy(filename, filename_size, bs->backing_file);
}
4222
5fafdf24 4223int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
faea38e7
FB
4224 const uint8_t *buf, int nb_sectors)
4225{
4226 BlockDriver *drv = bs->drv;
4227 if (!drv)
19cb3738 4228 return -ENOMEDIUM;
faea38e7
FB
4229 if (!drv->bdrv_write_compressed)
4230 return -ENOTSUP;
fbb7b4e0
KW
4231 if (bdrv_check_request(bs, sector_num, nb_sectors))
4232 return -EIO;
a55eb92c 4233
e4654d2d 4234 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
a55eb92c 4235
faea38e7
FB
4236 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4237}
3b46e624 4238
faea38e7
FB
4239int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4240{
4241 BlockDriver *drv = bs->drv;
4242 if (!drv)
19cb3738 4243 return -ENOMEDIUM;
faea38e7
FB
4244 if (!drv->bdrv_get_info)
4245 return -ENOTSUP;
4246 memset(bdi, 0, sizeof(*bdi));
4247 return drv->bdrv_get_info(bs, bdi);
4248}
4249
eae041fe
HR
4250ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4251{
4252 BlockDriver *drv = bs->drv;
4253 if (drv && drv->bdrv_get_specific_info) {
4254 return drv->bdrv_get_specific_info(bs);
4255 }
4256 return NULL;
4257}
4258
45566e9c
CH
4259int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4260 int64_t pos, int size)
cf8074b3
KW
4261{
4262 QEMUIOVector qiov;
4263 struct iovec iov = {
4264 .iov_base = (void *) buf,
4265 .iov_len = size,
4266 };
4267
4268 qemu_iovec_init_external(&qiov, &iov, 1);
4269 return bdrv_writev_vmstate(bs, &qiov, pos);
4270}
4271
4272int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
178e08a5
AL
4273{
4274 BlockDriver *drv = bs->drv;
cf8074b3
KW
4275
4276 if (!drv) {
178e08a5 4277 return -ENOMEDIUM;
cf8074b3
KW
4278 } else if (drv->bdrv_save_vmstate) {
4279 return drv->bdrv_save_vmstate(bs, qiov, pos);
4280 } else if (bs->file) {
4281 return bdrv_writev_vmstate(bs->file, qiov, pos);
4282 }
4283
7cdb1f6d 4284 return -ENOTSUP;
178e08a5
AL
4285}
4286
45566e9c
CH
4287int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4288 int64_t pos, int size)
178e08a5
AL
4289{
4290 BlockDriver *drv = bs->drv;
4291 if (!drv)
4292 return -ENOMEDIUM;
7cdb1f6d
MK
4293 if (drv->bdrv_load_vmstate)
4294 return drv->bdrv_load_vmstate(bs, buf, pos, size);
4295 if (bs->file)
4296 return bdrv_load_vmstate(bs->file, buf, pos, size);
4297 return -ENOTSUP;
178e08a5
AL
4298}
4299
8b9b0cc2
KW
4300void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4301{
bf736fe3 4302 if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
8b9b0cc2
KW
4303 return;
4304 }
4305
bf736fe3 4306 bs->drv->bdrv_debug_event(bs, event);
41c695c7
KW
4307}
4308
4309int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4310 const char *tag)
4311{
4312 while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4313 bs = bs->file;
4314 }
4315
4316 if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4317 return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4318 }
4319
4320 return -ENOTSUP;
4321}
4322
4cc70e93
FZ
4323int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4324{
4325 while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4326 bs = bs->file;
4327 }
4328
4329 if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4330 return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4331 }
4332
4333 return -ENOTSUP;
4334}
4335
41c695c7
KW
4336int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4337{
938789ea 4338 while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
41c695c7
KW
4339 bs = bs->file;
4340 }
8b9b0cc2 4341
41c695c7
KW
4342 if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4343 return bs->drv->bdrv_debug_resume(bs, tag);
4344 }
4345
4346 return -ENOTSUP;
4347}
4348
4349bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4350{
4351 while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4352 bs = bs->file;
4353 }
4354
4355 if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4356 return bs->drv->bdrv_debug_is_suspended(bs, tag);
4357 }
4358
4359 return false;
8b9b0cc2
KW
4360}
4361
199630b6
BS
/* True if the image was opened with BDRV_O_SNAPSHOT (temporary overlay). */
int bdrv_is_snapshot(BlockDriverState *bs)
{
    return !!(bs->open_flags & BDRV_O_SNAPSHOT);
}
4366
b1b1d783
JC
/* backing_file can either be relative, or absolute, or a protocol. If it is
 * relative, it must be relative to the chain. So, passing in bs->filename
 * from a BDS as backing_file should not be done, as that may be relative to
 * the CWD rather than the chain. */
BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
                                          const char *backing_file)
{
    char *filename_full = NULL;
    char *backing_file_full = NULL;
    char *filename_tmp = NULL;
    int is_protocol = 0;
    BlockDriverState *curr_bs = NULL;
    BlockDriverState *retval = NULL;

    if (!bs || !bs->drv || !backing_file) {
        return NULL;
    }

    /* Scratch buffers for path canonicalisation, freed before return. */
    filename_full = g_malloc(PATH_MAX);
    backing_file_full = g_malloc(PATH_MAX);
    filename_tmp = g_malloc(PATH_MAX);

    is_protocol = path_has_protocol(backing_file);

    for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {

        /* If either of the filename paths is actually a protocol, then
         * compare unmodified paths; otherwise make paths relative */
        if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
            if (strcmp(backing_file, curr_bs->backing_file) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        } else {
            /* If not an absolute filename path, make it relative to the current
             * image's filename path */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         backing_file);

            /* We are going to compare absolute pathnames */
            if (!realpath(filename_tmp, filename_full)) {
                continue;
            }

            /* We need to make sure the backing filename we are comparing against
             * is relative to the current image filename (or absolute) */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         curr_bs->backing_file);

            if (!realpath(filename_tmp, backing_file_full)) {
                continue;
            }

            if (strcmp(backing_file_full, filename_full) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        }
    }

    g_free(filename_full);
    g_free(backing_file_full);
    g_free(filename_tmp);
    return retval;
}
4432
f198fd1c
BC
4433int bdrv_get_backing_file_depth(BlockDriverState *bs)
4434{
4435 if (!bs->drv) {
4436 return 0;
4437 }
4438
4439 if (!bs->backing_hd) {
4440 return 0;
4441 }
4442
4443 return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4444}
4445
ea2384d3 4446/**************************************************************/
83f64091 4447/* async I/Os */
ea2384d3 4448
/* Submit an asynchronous vectored read; @cb is invoked with the result. */
BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                                 QEMUIOVector *qiov, int nb_sectors,
                                 BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, false);
}
4458
f141eafe
AL
/* Submit an asynchronous vectored write; @cb is invoked with the result. */
BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                                  QEMUIOVector *qiov, int nb_sectors,
                                  BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, true);
}
4468
d5ef94d4
PB
/* Submit an asynchronous zero-write (no data buffer; qiov is NULL). */
BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
                                 BDRV_REQ_ZERO_WRITE | flags,
                                 cb, opaque, true);
}
4479
40b4f539
KW
4480
/* Completion bookkeeping for a batch submitted via bdrv_aio_multiwrite(). */
typedef struct MultiwriteCB {
    int error;          /* first error seen, 0 if none */
    int num_requests;   /* submitted requests still in flight */
    int num_callbacks;  /* caller callbacks to run when the batch finishes */
    struct {
        BlockDriverCompletionFunc *cb;
        void *opaque;
        QEMUIOVector *free_qiov;  /* merged qiov to destroy, or NULL */
    } callbacks[];      /* flexible array, one entry per original request */
} MultiwriteCB;
4491
4492static void multiwrite_user_cb(MultiwriteCB *mcb)
4493{
4494 int i;
4495
4496 for (i = 0; i < mcb->num_callbacks; i++) {
4497 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
1e1ea48d
SH
4498 if (mcb->callbacks[i].free_qiov) {
4499 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4500 }
7267c094 4501 g_free(mcb->callbacks[i].free_qiov);
40b4f539
KW
4502 }
4503}
4504
4505static void multiwrite_cb(void *opaque, int ret)
4506{
4507 MultiwriteCB *mcb = opaque;
4508
6d519a5f
SH
4509 trace_multiwrite_cb(mcb, ret);
4510
cb6d3ca0 4511 if (ret < 0 && !mcb->error) {
40b4f539 4512 mcb->error = ret;
40b4f539
KW
4513 }
4514
4515 mcb->num_requests--;
4516 if (mcb->num_requests == 0) {
de189a1b 4517 multiwrite_user_cb(mcb);
7267c094 4518 g_free(mcb);
40b4f539
KW
4519 }
4520}
4521
4522static int multiwrite_req_compare(const void *a, const void *b)
4523{
77be4366
CH
4524 const BlockRequest *req1 = a, *req2 = b;
4525
4526 /*
4527 * Note that we can't simply subtract req2->sector from req1->sector
4528 * here as that could overflow the return value.
4529 */
4530 if (req1->sector > req2->sector) {
4531 return 1;
4532 } else if (req1->sector < req2->sector) {
4533 return -1;
4534 } else {
4535 return 0;
4536 }
40b4f539
KW
4537}
4538
/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 *
 * Merged entries get a freshly allocated combined qiov; its ownership is
 * recorded in mcb->callbacks[i].free_qiov so multiwrite_user_cb() can
 * release it after completion.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
                            int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine them,
    // filling up gaps with zero sectors.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        // Don't exceed the iovec limit of the host
        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);

            // We should need to add any zeros between the two requests
            assert (reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);

            // Add tail of first request, if necessary
            if (qiov->size < reqs[outidx].qiov->size) {
                qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
                                  reqs[outidx].qiov->size - qiov->size);
            }

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov = reqs[i].qiov;
        }
    }

    return outidx + 1;
}
4604
/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted. In error case this function returns -1, and any of the
 * requests may or may not be submitted yet. In particular, this means that the
 * callback will be called for some of the requests, for others it won't. The
 * caller must check the error field of the BlockRequest to wait for the right
 * callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    /* Record the original callbacks before merging may rewrite reqs[]. */
    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests. */
    mcb->num_requests = num_reqs;
    for (i = 0; i < num_reqs; i++) {
        bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
                              reqs[i].nb_sectors, reqs[i].flags,
                              multiwrite_cb, mcb,
                              true);
    }

    return 0;
}
4662
/* Synchronously cancel @acb: request async cancellation, then poll the
 * request's AioContext until the completion callback has run. */
void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    /* Keep the ACB alive while we poll; our ref is dropped at the end. */
    qemu_aio_ref(acb);
    bdrv_aio_cancel_async(acb);
    while (acb->refcnt > 1) {
        if (acb->aiocb_info->get_aio_context) {
            aio_poll(acb->aiocb_info->get_aio_context(acb), true);
        } else if (acb->bs) {
            aio_poll(bdrv_get_aio_context(acb->bs), true);
        } else {
            /* No way to make progress on this ACB. */
            abort();
        }
    }
    qemu_aio_unref(acb);
}
4678
/* Async version of aio cancel. The caller is not blocked if the acb implements
 * cancel_async, otherwise we do nothing and let the request normally complete.
 * In either case the completion callback must be called. */
void bdrv_aio_cancel_async(BlockDriverAIOCB *acb)
{
    if (acb->aiocb_info->cancel_async) {
        acb->aiocb_info->cancel_async(acb);
    }
}
4688
4689/**************************************************************/
4690/* async block device emulation */
4691
c16b5a2c
CH
/* AIOCB for emulating AIO on top of synchronous bdrv_read/bdrv_write,
 * using a bounce buffer and a bottom half for completion. */
typedef struct BlockDriverAIOCBSync {
    BlockDriverAIOCB common;
    QEMUBH *bh;        /* bottom half that delivers the completion */
    int ret;           /* result of the synchronous driver call */
    /* vector translation state */
    QEMUIOVector *qiov;
    uint8_t *bounce;   /* linear bounce buffer; NULL on allocation failure */
    int is_write;
} BlockDriverAIOCBSync;
4701
/* Allocation descriptor for the synchronous-emulation AIOCB. */
static const AIOCBInfo bdrv_em_aiocb_info = {
    .aiocb_size = sizeof(BlockDriverAIOCBSync),
};
4705
/* Bottom half: copy read data out of the bounce buffer, invoke the user
 * callback, then release the bottom half and the ACB. */
static void bdrv_aio_bh_cb(void *opaque)
{
    BlockDriverAIOCBSync *acb = opaque;

    /* Successful reads must be copied back into the caller's qiov. */
    if (!acb->is_write && acb->ret >= 0) {
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
    }
    qemu_vfree(acb->bounce);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_unref(acb);
}
beac80cd 4719
f141eafe
AL
/* Emulate an asynchronous vectored request with the driver's synchronous
 * bdrv_read/bdrv_write, completing via a scheduled bottom half. */
static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                            int64_t sector_num,
                                            QEMUIOVector *qiov,
                                            int nb_sectors,
                                            BlockDriverCompletionFunc *cb,
                                            void *opaque,
                                            int is_write)

{
    BlockDriverAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    /* May fail; a NULL bounce buffer is reported as -ENOMEM below. */
    acb->bounce = qemu_try_blockalign(bs, qiov->size);
    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);

    if (acb->bounce == NULL) {
        acb->ret = -ENOMEM;
    } else if (is_write) {
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    /* Deliver the (already computed) result asynchronously. */
    qemu_bh_schedule(acb->bh);

    return &acb->common;
}
4750
f141eafe
AL
/* AIO read emulation entry point (thin wrapper, is_write = 0). */
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}
83f64091 4757
f141eafe
AL
/* AIO write emulation entry point (thin wrapper, is_write = 1). */
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}
beac80cd 4764
68485420
KW
4765
/* AIOCB for requests driven by a coroutine (bdrv_co_do_rw and friends). */
typedef struct BlockDriverAIOCBCoroutine {
    BlockDriverAIOCB common;
    BlockRequest req;   /* request parameters; result lands in req.error */
    bool is_write;
    bool *done;
    QEMUBH* bh;         /* schedules the completion callback */
} BlockDriverAIOCBCoroutine;
4773
/* Allocation descriptor for the coroutine-based AIOCB. */
static const AIOCBInfo bdrv_em_co_aiocb_info = {
    .aiocb_size = sizeof(BlockDriverAIOCBCoroutine),
};
4777
/* Bottom half: run the user callback with req.error, then free the ACB. */
static void bdrv_co_em_bh(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;

    acb->common.cb(acb->common.opaque, acb->req.error);

    qemu_bh_delete(acb->bh);
    qemu_aio_unref(acb);
}
4787
b2a61371
SH
/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    }

    /* Completion runs from a bottom half in the BDS's AioContext. */
    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}
4805
68485420
KW
/* Build an AIOCB and start a coroutine running bdrv_co_do_rw() for it.
 * Returns the ACB; completion is delivered via @cb from a bottom half. */
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BdrvRequestFlags flags,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->req.flags = flags;
    acb->is_write = is_write;

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}
4830
/* Coroutine entry for bdrv_aio_flush(): flush, then schedule completion. */
static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}
4840
/* Submit an asynchronous flush; @cb is invoked with the flush result. */
BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_flush(bs, opaque);

    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);

    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}
4856
4265d620
PB
/* Coroutine entry for bdrv_aio_discard(): discard, then schedule completion. */
static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}
4866
/* Submit an asynchronous discard of @nb_sectors starting at @sector_num. */
BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}
4884
ea2384d3
FB
/* Register all block drivers linked into the binary. */
void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}
ce1a14dc 4889
eb852011
MA
/* Like bdrv_init(), but restrict format probing to the whitelist. */
void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}
4895
/* Allocate an AIOCB of the size described by @aiocb_info, with refcount 1.
 * The caller releases it with qemu_aio_unref(). */
void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
                   BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriverAIOCB *acb;

    acb = g_slice_alloc(aiocb_info->aiocb_size);
    acb->aiocb_info = aiocb_info;
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    acb->refcnt = 1;
    return acb;
}
4909
f197fe2b
FZ
/* Take an additional reference on an AIOCB. */
void qemu_aio_ref(void *p)
{
    BlockDriverAIOCB *acb = p;
    acb->refcnt++;
}
4915
/* Drop a reference on an AIOCB; frees it when the count reaches zero. */
void qemu_aio_unref(void *p)
{
    BlockDriverAIOCB *acb = p;
    assert(acb->refcnt > 0);
    if (--acb->refcnt == 0) {
        g_slice_free1(acb->aiocb_info->aiocb_size, acb);
    }
}
19cb3738 4924
f9f05dc5
KW
4925/**************************************************************/
4926/* Coroutine block device emulation */
4927
/* Rendezvous between an AIO completion callback and a waiting coroutine. */
typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;  /* coroutine to re-enter on completion */
    int ret;               /* request result passed back to the coroutine */
} CoroutineIOCompletion;
4932
/* AIO completion callback: record the result and resume the coroutine. */
static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}
4940
/* Emulate a coroutine read/write on top of the driver's AIO interface:
 * submit the request, yield, and return the result delivered by
 * bdrv_co_io_em_complete(). Returns -EIO if submission fails. */
static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockDriverAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    /* Suspend until bdrv_co_io_em_complete() re-enters us. */
    qemu_coroutine_yield();

    return co.ret;
}
4966
/* Coroutine read emulation entry point (thin wrapper, is_write = false). */
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}
4973
/* Coroutine write emulation entry point (thin wrapper, is_write = true). */
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}
4980
/* Coroutine entry for the synchronous bdrv_flush() wrapper. */
static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}
4987
4988int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4989{
eb489bb1
KW
4990 int ret;
4991
29cdb251 4992 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
07f07615 4993 return 0;
eb489bb1
KW
4994 }
4995
ca716364 4996 /* Write back cached data to the OS even with cache=unsafe */
bf736fe3 4997 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
eb489bb1
KW
4998 if (bs->drv->bdrv_co_flush_to_os) {
4999 ret = bs->drv->bdrv_co_flush_to_os(bs);
5000 if (ret < 0) {
5001 return ret;
5002 }
5003 }
5004
ca716364
KW
5005 /* But don't actually force it to the disk with cache=unsafe */
5006 if (bs->open_flags & BDRV_O_NO_FLUSH) {
d4c82329 5007 goto flush_parent;
ca716364
KW
5008 }
5009
bf736fe3 5010 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
eb489bb1 5011 if (bs->drv->bdrv_co_flush_to_disk) {
29cdb251 5012 ret = bs->drv->bdrv_co_flush_to_disk(bs);
07f07615
PB
5013 } else if (bs->drv->bdrv_aio_flush) {
5014 BlockDriverAIOCB *acb;
5015 CoroutineIOCompletion co = {
5016 .coroutine = qemu_coroutine_self(),
5017 };
5018
5019 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
5020 if (acb == NULL) {
29cdb251 5021 ret = -EIO;
07f07615
PB
5022 } else {
5023 qemu_coroutine_yield();
29cdb251 5024 ret = co.ret;
07f07615 5025 }
07f07615
PB
5026 } else {
5027 /*
5028 * Some block drivers always operate in either writethrough or unsafe
5029 * mode and don't support bdrv_flush therefore. Usually qemu doesn't
5030 * know how the server works (because the behaviour is hardcoded or
5031 * depends on server-side configuration), so we can't ensure that
5032 * everything is safe on disk. Returning an error doesn't work because
5033 * that would break guests even if the server operates in writethrough
5034 * mode.
5035 *
5036 * Let's hope the user knows what he's doing.
5037 */
29cdb251 5038 ret = 0;
07f07615 5039 }
29cdb251
PB
5040 if (ret < 0) {
5041 return ret;
5042 }
5043
5044 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
5045 * in the case of cache=unsafe, so there are no useless flushes.
5046 */
d4c82329 5047flush_parent:
29cdb251 5048 return bdrv_co_flush(bs->file);
07f07615
PB
5049}
5050
5a8a30db 5051void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
0f15423c 5052{
5a8a30db
KW
5053 Error *local_err = NULL;
5054 int ret;
5055
3456a8d1
KW
5056 if (!bs->drv) {
5057 return;
5058 }
5059
7ea2d269
AK
5060 if (!(bs->open_flags & BDRV_O_INCOMING)) {
5061 return;
5062 }
5063 bs->open_flags &= ~BDRV_O_INCOMING;
5064
3456a8d1 5065 if (bs->drv->bdrv_invalidate_cache) {
5a8a30db 5066 bs->drv->bdrv_invalidate_cache(bs, &local_err);
3456a8d1 5067 } else if (bs->file) {
5a8a30db
KW
5068 bdrv_invalidate_cache(bs->file, &local_err);
5069 }
5070 if (local_err) {
5071 error_propagate(errp, local_err);
5072 return;
0f15423c 5073 }
3456a8d1 5074
5a8a30db
KW
5075 ret = refresh_total_sectors(bs, bs->total_sectors);
5076 if (ret < 0) {
5077 error_setg_errno(errp, -ret, "Could not refresh total sector count");
5078 return;
5079 }
0f15423c
AL
5080}
5081
5a8a30db 5082void bdrv_invalidate_cache_all(Error **errp)
0f15423c
AL
5083{
5084 BlockDriverState *bs;
5a8a30db 5085 Error *local_err = NULL;
0f15423c 5086
dc364f4c 5087 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
ed78cda3
SH
5088 AioContext *aio_context = bdrv_get_aio_context(bs);
5089
5090 aio_context_acquire(aio_context);
5a8a30db 5091 bdrv_invalidate_cache(bs, &local_err);
ed78cda3 5092 aio_context_release(aio_context);
5a8a30db
KW
5093 if (local_err) {
5094 error_propagate(errp, local_err);
5095 return;
5096 }
0f15423c
AL
5097 }
5098}
5099
07f07615
PB
5100int bdrv_flush(BlockDriverState *bs)
5101{
5102 Coroutine *co;
5103 RwCo rwco = {
5104 .bs = bs,
5105 .ret = NOT_DONE,
e7a8a783 5106 };
e7a8a783 5107
07f07615
PB
5108 if (qemu_in_coroutine()) {
5109 /* Fast-path if already in coroutine context */
5110 bdrv_flush_co_entry(&rwco);
5111 } else {
2572b37a
SH
5112 AioContext *aio_context = bdrv_get_aio_context(bs);
5113
07f07615
PB
5114 co = qemu_coroutine_create(bdrv_flush_co_entry);
5115 qemu_coroutine_enter(co, &rwco);
5116 while (rwco.ret == NOT_DONE) {
2572b37a 5117 aio_poll(aio_context, true);
07f07615 5118 }
e7a8a783 5119 }
07f07615
PB
5120
5121 return rwco.ret;
e7a8a783
KW
5122}
5123
775aa8b6
KW
5124typedef struct DiscardCo {
5125 BlockDriverState *bs;
5126 int64_t sector_num;
5127 int nb_sectors;
5128 int ret;
5129} DiscardCo;
4265d620
PB
5130static void coroutine_fn bdrv_discard_co_entry(void *opaque)
5131{
775aa8b6 5132 DiscardCo *rwco = opaque;
4265d620
PB
5133
5134 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
5135}
5136
6f14da52
PL
/* if no limit is specified in the BlockLimits use a default
 * of 32768 512-byte sectors (16 MiB) per request.
 */
#define MAX_DISCARD_DEFAULT 32768
5141
4265d620
PB
5142int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
5143 int nb_sectors)
5144{
d51e9fe5
PB
5145 int max_discard;
5146
4265d620
PB
5147 if (!bs->drv) {
5148 return -ENOMEDIUM;
5149 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
5150 return -EIO;
5151 } else if (bs->read_only) {
5152 return -EROFS;
df702c9b
PB
5153 }
5154
e4654d2d 5155 bdrv_reset_dirty(bs, sector_num, nb_sectors);
df702c9b 5156
9e8f1835
PB
5157 /* Do nothing if disabled. */
5158 if (!(bs->open_flags & BDRV_O_UNMAP)) {
5159 return 0;
5160 }
5161
d51e9fe5
PB
5162 if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
5163 return 0;
5164 }
6f14da52 5165
d51e9fe5
PB
5166 max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
5167 while (nb_sectors > 0) {
5168 int ret;
5169 int num = nb_sectors;
6f14da52 5170
d51e9fe5
PB
5171 /* align request */
5172 if (bs->bl.discard_alignment &&
5173 num >= bs->bl.discard_alignment &&
5174 sector_num % bs->bl.discard_alignment) {
5175 if (num > bs->bl.discard_alignment) {
5176 num = bs->bl.discard_alignment;
6f14da52 5177 }
d51e9fe5
PB
5178 num -= sector_num % bs->bl.discard_alignment;
5179 }
6f14da52 5180
d51e9fe5
PB
5181 /* limit request size */
5182 if (num > max_discard) {
5183 num = max_discard;
5184 }
6f14da52 5185
d51e9fe5 5186 if (bs->drv->bdrv_co_discard) {
6f14da52 5187 ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
d51e9fe5
PB
5188 } else {
5189 BlockDriverAIOCB *acb;
5190 CoroutineIOCompletion co = {
5191 .coroutine = qemu_coroutine_self(),
5192 };
5193
5194 acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
5195 bdrv_co_io_em_complete, &co);
5196 if (acb == NULL) {
5197 return -EIO;
5198 } else {
5199 qemu_coroutine_yield();
5200 ret = co.ret;
6f14da52 5201 }
6f14da52 5202 }
7ce21016 5203 if (ret && ret != -ENOTSUP) {
d51e9fe5 5204 return ret;
4265d620 5205 }
d51e9fe5
PB
5206
5207 sector_num += num;
5208 nb_sectors -= num;
4265d620 5209 }
d51e9fe5 5210 return 0;
4265d620
PB
5211}
5212
5213int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
5214{
5215 Coroutine *co;
775aa8b6 5216 DiscardCo rwco = {
4265d620
PB
5217 .bs = bs,
5218 .sector_num = sector_num,
5219 .nb_sectors = nb_sectors,
5220 .ret = NOT_DONE,
5221 };
5222
5223 if (qemu_in_coroutine()) {
5224 /* Fast-path if already in coroutine context */
5225 bdrv_discard_co_entry(&rwco);
5226 } else {
2572b37a
SH
5227 AioContext *aio_context = bdrv_get_aio_context(bs);
5228
4265d620
PB
5229 co = qemu_coroutine_create(bdrv_discard_co_entry);
5230 qemu_coroutine_enter(co, &rwco);
5231 while (rwco.ret == NOT_DONE) {
2572b37a 5232 aio_poll(aio_context, true);
4265d620
PB
5233 }
5234 }
5235
5236 return rwco.ret;
5237}
5238
19cb3738
FB
5239/**************************************************************/
5240/* removable device support */
5241
5242/**
5243 * Return TRUE if the media is present
5244 */
5245int bdrv_is_inserted(BlockDriverState *bs)
5246{
5247 BlockDriver *drv = bs->drv;
a1aff5bf 5248
19cb3738
FB
5249 if (!drv)
5250 return 0;
5251 if (!drv->bdrv_is_inserted)
a1aff5bf
MA
5252 return 1;
5253 return drv->bdrv_is_inserted(bs);
19cb3738
FB
5254}
5255
5256/**
8e49ca46
MA
5257 * Return whether the media changed since the last call to this
5258 * function, or -ENOTSUP if we don't know. Most drivers don't know.
19cb3738
FB
5259 */
5260int bdrv_media_changed(BlockDriverState *bs)
5261{
5262 BlockDriver *drv = bs->drv;
19cb3738 5263
8e49ca46
MA
5264 if (drv && drv->bdrv_media_changed) {
5265 return drv->bdrv_media_changed(bs);
5266 }
5267 return -ENOTSUP;
19cb3738
FB
5268}
5269
5270/**
5271 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
5272 */
f36f3949 5273void bdrv_eject(BlockDriverState *bs, bool eject_flag)
19cb3738
FB
5274{
5275 BlockDriver *drv = bs->drv;
19cb3738 5276
822e1cd1
MA
5277 if (drv && drv->bdrv_eject) {
5278 drv->bdrv_eject(bs, eject_flag);
19cb3738 5279 }
6f382ed2
LC
5280
5281 if (bs->device_name[0] != '\0') {
a5ee7bd4
WX
5282 qapi_event_send_device_tray_moved(bdrv_get_device_name(bs),
5283 eject_flag, &error_abort);
6f382ed2 5284 }
19cb3738
FB
5285}
5286
19cb3738
FB
5287/**
5288 * Lock or unlock the media (if it is locked, the user won't be able
5289 * to eject it manually).
5290 */
025e849a 5291void bdrv_lock_medium(BlockDriverState *bs, bool locked)
19cb3738
FB
5292{
5293 BlockDriver *drv = bs->drv;
5294
025e849a 5295 trace_bdrv_lock_medium(bs, locked);
b8c6d095 5296
025e849a
MA
5297 if (drv && drv->bdrv_lock_medium) {
5298 drv->bdrv_lock_medium(bs, locked);
19cb3738
FB
5299 }
5300}
985a03b0
TS
5301
5302/* needed for generic scsi interface */
5303
5304int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5305{
5306 BlockDriver *drv = bs->drv;
5307
5308 if (drv && drv->bdrv_ioctl)
5309 return drv->bdrv_ioctl(bs, req, buf);
5310 return -ENOTSUP;
5311}
7d780669 5312
221f715d
AL
5313BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5314 unsigned long int req, void *buf,
5315 BlockDriverCompletionFunc *cb, void *opaque)
7d780669 5316{
221f715d 5317 BlockDriver *drv = bs->drv;
7d780669 5318
221f715d
AL
5319 if (drv && drv->bdrv_aio_ioctl)
5320 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5321 return NULL;
7d780669 5322}
e268ca52 5323
1b7fd729 5324void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
7b6f9300 5325{
1b7fd729 5326 bs->guest_block_size = align;
7b6f9300 5327}
7cd1e32a 5328
e268ca52
AL
5329void *qemu_blockalign(BlockDriverState *bs, size_t size)
5330{
339064d5 5331 return qemu_memalign(bdrv_opt_mem_align(bs), size);
e268ca52 5332}
7cd1e32a 5333
7d2a35cc
KW
5334void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
5335{
5336 size_t align = bdrv_opt_mem_align(bs);
5337
5338 /* Ensure that NULL is never returned on success */
5339 assert(align > 0);
5340 if (size == 0) {
5341 size = align;
5342 }
5343
5344 return qemu_try_memalign(align, size);
5345}
5346
c53b1c51
SH
5347/*
5348 * Check if all memory in this vector is sector aligned.
5349 */
5350bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5351{
5352 int i;
339064d5 5353 size_t alignment = bdrv_opt_mem_align(bs);
c53b1c51
SH
5354
5355 for (i = 0; i < qiov->niov; i++) {
339064d5 5356 if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
c53b1c51 5357 return false;
1ff735bd 5358 }
339064d5 5359 if (qiov->iov[i].iov_len % alignment) {
1ff735bd 5360 return false;
c53b1c51
SH
5361 }
5362 }
5363
5364 return true;
5365}
5366
b8afb520
FZ
5367BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
5368 Error **errp)
7cd1e32a
LS
5369{
5370 int64_t bitmap_size;
e4654d2d 5371 BdrvDirtyBitmap *bitmap;
a55eb92c 5372
50717e94
PB
5373 assert((granularity & (granularity - 1)) == 0);
5374
e4654d2d
FZ
5375 granularity >>= BDRV_SECTOR_BITS;
5376 assert(granularity);
57322b78 5377 bitmap_size = bdrv_nb_sectors(bs);
b8afb520
FZ
5378 if (bitmap_size < 0) {
5379 error_setg_errno(errp, -bitmap_size, "could not get length of device");
5380 errno = -bitmap_size;
5381 return NULL;
5382 }
5839e53b 5383 bitmap = g_new0(BdrvDirtyBitmap, 1);
e4654d2d
FZ
5384 bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
5385 QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
5386 return bitmap;
5387}
5388
5389void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5390{
5391 BdrvDirtyBitmap *bm, *next;
5392 QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5393 if (bm == bitmap) {
5394 QLIST_REMOVE(bitmap, list);
5395 hbitmap_free(bitmap->bitmap);
5396 g_free(bitmap);
5397 return;
a55eb92c 5398 }
7cd1e32a
LS
5399 }
5400}
5401
21b56835
FZ
5402BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5403{
5404 BdrvDirtyBitmap *bm;
5405 BlockDirtyInfoList *list = NULL;
5406 BlockDirtyInfoList **plist = &list;
5407
5408 QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5839e53b
MA
5409 BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1);
5410 BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1);
21b56835
FZ
5411 info->count = bdrv_get_dirty_count(bs, bm);
5412 info->granularity =
5413 ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5414 entry->value = info;
5415 *plist = entry;
5416 plist = &entry->next;
5417 }
5418
5419 return list;
5420}
5421
e4654d2d 5422int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
7cd1e32a 5423{
e4654d2d
FZ
5424 if (bitmap) {
5425 return hbitmap_get(bitmap->bitmap, sector);
7cd1e32a
LS
5426 } else {
5427 return 0;
5428 }
5429}
5430
e4654d2d
FZ
5431void bdrv_dirty_iter_init(BlockDriverState *bs,
5432 BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
1755da16 5433{
e4654d2d 5434 hbitmap_iter_init(hbi, bitmap->bitmap, 0);
1755da16
PB
5435}
5436
5437void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5438 int nr_sectors)
5439{
e4654d2d
FZ
5440 BdrvDirtyBitmap *bitmap;
5441 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5442 hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5443 }
1755da16
PB
5444}
5445
e4654d2d 5446void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
7cd1e32a 5447{
e4654d2d
FZ
5448 BdrvDirtyBitmap *bitmap;
5449 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5450 hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5451 }
7cd1e32a 5452}
aaa0eb75 5453
e4654d2d 5454int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
aaa0eb75 5455{
e4654d2d 5456 return hbitmap_count(bitmap->bitmap);
aaa0eb75 5457}
f88e1a42 5458
9fcb0251
FZ
5459/* Get a reference to bs */
5460void bdrv_ref(BlockDriverState *bs)
5461{
5462 bs->refcnt++;
5463}
5464
5465/* Release a previously grabbed reference to bs.
5466 * If after releasing, reference count is zero, the BlockDriverState is
5467 * deleted. */
5468void bdrv_unref(BlockDriverState *bs)
5469{
9a4d5ca6
JC
5470 if (!bs) {
5471 return;
5472 }
9fcb0251
FZ
5473 assert(bs->refcnt > 0);
5474 if (--bs->refcnt == 0) {
5475 bdrv_delete(bs);
5476 }
5477}
5478
fbe40ff7
FZ
5479struct BdrvOpBlocker {
5480 Error *reason;
5481 QLIST_ENTRY(BdrvOpBlocker) list;
5482};
5483
5484bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
5485{
5486 BdrvOpBlocker *blocker;
5487 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5488 if (!QLIST_EMPTY(&bs->op_blockers[op])) {
5489 blocker = QLIST_FIRST(&bs->op_blockers[op]);
5490 if (errp) {
5491 error_setg(errp, "Device '%s' is busy: %s",
5492 bs->device_name, error_get_pretty(blocker->reason));
5493 }
5494 return true;
5495 }
5496 return false;
5497}
5498
5499void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
5500{
5501 BdrvOpBlocker *blocker;
5502 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5503
5839e53b 5504 blocker = g_new0(BdrvOpBlocker, 1);
fbe40ff7
FZ
5505 blocker->reason = reason;
5506 QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
5507}
5508
5509void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
5510{
5511 BdrvOpBlocker *blocker, *next;
5512 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5513 QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
5514 if (blocker->reason == reason) {
5515 QLIST_REMOVE(blocker, list);
5516 g_free(blocker);
5517 }
5518 }
5519}
5520
5521void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
5522{
5523 int i;
5524 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5525 bdrv_op_block(bs, i, reason);
5526 }
5527}
5528
5529void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
5530{
5531 int i;
5532 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5533 bdrv_op_unblock(bs, i, reason);
5534 }
5535}
5536
5537bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
5538{
5539 int i;
5540
5541 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5542 if (!QLIST_EMPTY(&bs->op_blockers[i])) {
5543 return false;
5544 }
5545 }
5546 return true;
5547}
5548
28a7282a
LC
5549void bdrv_iostatus_enable(BlockDriverState *bs)
5550{
d6bf279e 5551 bs->iostatus_enabled = true;
58e21ef5 5552 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
28a7282a
LC
5553}
5554
5555/* The I/O status is only enabled if the drive explicitly
5556 * enables it _and_ the VM is configured to stop on errors */
5557bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5558{
d6bf279e 5559 return (bs->iostatus_enabled &&
92aa5c6d
PB
5560 (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5561 bs->on_write_error == BLOCKDEV_ON_ERROR_STOP ||
5562 bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
28a7282a
LC
5563}
5564
5565void bdrv_iostatus_disable(BlockDriverState *bs)
5566{
d6bf279e 5567 bs->iostatus_enabled = false;
28a7282a
LC
5568}
5569
5570void bdrv_iostatus_reset(BlockDriverState *bs)
5571{
5572 if (bdrv_iostatus_is_enabled(bs)) {
58e21ef5 5573 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
3bd293c3
PB
5574 if (bs->job) {
5575 block_job_iostatus_reset(bs->job);
5576 }
28a7282a
LC
5577 }
5578}
5579
28a7282a
LC
5580void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5581{
3e1caa5f
PB
5582 assert(bdrv_iostatus_is_enabled(bs));
5583 if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
58e21ef5
LC
5584 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5585 BLOCK_DEVICE_IO_STATUS_FAILED;
28a7282a
LC
5586 }
5587}
5588
d92ada22
LC
5589void bdrv_img_create(const char *filename, const char *fmt,
5590 const char *base_filename, const char *base_fmt,
f382d43a
MR
5591 char *options, uint64_t img_size, int flags,
5592 Error **errp, bool quiet)
f88e1a42 5593{
83d0521a
CL
5594 QemuOptsList *create_opts = NULL;
5595 QemuOpts *opts = NULL;
5596 const char *backing_fmt, *backing_file;
5597 int64_t size;
f88e1a42 5598 BlockDriver *drv, *proto_drv;
96df67d1 5599 BlockDriver *backing_drv = NULL;
cc84d90f 5600 Error *local_err = NULL;
f88e1a42
JS
5601 int ret = 0;
5602
5603 /* Find driver and parse its options */
5604 drv = bdrv_find_format(fmt);
5605 if (!drv) {
71c79813 5606 error_setg(errp, "Unknown file format '%s'", fmt);
d92ada22 5607 return;
f88e1a42
JS
5608 }
5609
98289620 5610 proto_drv = bdrv_find_protocol(filename, true);
f88e1a42 5611 if (!proto_drv) {
71c79813 5612 error_setg(errp, "Unknown protocol '%s'", filename);
d92ada22 5613 return;
f88e1a42
JS
5614 }
5615
c282e1fd
CL
5616 create_opts = qemu_opts_append(create_opts, drv->create_opts);
5617 create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);
f88e1a42
JS
5618
5619 /* Create parameter list with default values */
83d0521a
CL
5620 opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
5621 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size);
f88e1a42
JS
5622
5623 /* Parse -o options */
5624 if (options) {
83d0521a
CL
5625 if (qemu_opts_do_parse(opts, options, NULL) != 0) {
5626 error_setg(errp, "Invalid options for file format '%s'", fmt);
f88e1a42
JS
5627 goto out;
5628 }
5629 }
5630
5631 if (base_filename) {
83d0521a 5632 if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename)) {
71c79813
LC
5633 error_setg(errp, "Backing file not supported for file format '%s'",
5634 fmt);
f88e1a42
JS
5635 goto out;
5636 }
5637 }
5638
5639 if (base_fmt) {
83d0521a 5640 if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt)) {
71c79813
LC
5641 error_setg(errp, "Backing file format not supported for file "
5642 "format '%s'", fmt);
f88e1a42
JS
5643 goto out;
5644 }
5645 }
5646
83d0521a
CL
5647 backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
5648 if (backing_file) {
5649 if (!strcmp(filename, backing_file)) {
71c79813
LC
5650 error_setg(errp, "Error: Trying to create an image with the "
5651 "same filename as the backing file");
792da93a
JS
5652 goto out;
5653 }
5654 }
5655
83d0521a
CL
5656 backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
5657 if (backing_fmt) {
5658 backing_drv = bdrv_find_format(backing_fmt);
96df67d1 5659 if (!backing_drv) {
71c79813 5660 error_setg(errp, "Unknown backing file format '%s'",
83d0521a 5661 backing_fmt);
f88e1a42
JS
5662 goto out;
5663 }
5664 }
5665
5666 // The size for the image must always be specified, with one exception:
5667 // If we are using a backing file, we can obtain the size from there
83d0521a
CL
5668 size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
5669 if (size == -1) {
5670 if (backing_file) {
66f6b814 5671 BlockDriverState *bs;
52bf1e72 5672 int64_t size;
63090dac
PB
5673 int back_flags;
5674
5675 /* backing files always opened read-only */
5676 back_flags =
5677 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
f88e1a42 5678
f67503e5 5679 bs = NULL;
83d0521a 5680 ret = bdrv_open(&bs, backing_file, NULL, NULL, back_flags,
cc84d90f 5681 backing_drv, &local_err);
f88e1a42 5682 if (ret < 0) {
cc84d90f 5683 error_setg_errno(errp, -ret, "Could not open '%s': %s",
83d0521a 5684 backing_file,
cc84d90f
HR
5685 error_get_pretty(local_err));
5686 error_free(local_err);
5687 local_err = NULL;
f88e1a42
JS
5688 goto out;
5689 }
52bf1e72
MA
5690 size = bdrv_getlength(bs);
5691 if (size < 0) {
5692 error_setg_errno(errp, -size, "Could not get size of '%s'",
5693 backing_file);
5694 bdrv_unref(bs);
5695 goto out;
5696 }
f88e1a42 5697
83d0521a 5698 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size);
66f6b814
HR
5699
5700 bdrv_unref(bs);
f88e1a42 5701 } else {
71c79813 5702 error_setg(errp, "Image creation needs a size parameter");
f88e1a42
JS
5703 goto out;
5704 }
5705 }
5706
f382d43a
MR
5707 if (!quiet) {
5708 printf("Formatting '%s', fmt=%s ", filename, fmt);
83d0521a 5709 qemu_opts_print(opts);
f382d43a
MR
5710 puts("");
5711 }
83d0521a 5712
c282e1fd 5713 ret = bdrv_create(drv, filename, opts, &local_err);
83d0521a 5714
cc84d90f
HR
5715 if (ret == -EFBIG) {
5716 /* This is generally a better message than whatever the driver would
5717 * deliver (especially because of the cluster_size_hint), since that
5718 * is most probably not much different from "image too large". */
5719 const char *cluster_size_hint = "";
83d0521a 5720 if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
cc84d90f 5721 cluster_size_hint = " (try using a larger cluster size)";
f88e1a42 5722 }
cc84d90f
HR
5723 error_setg(errp, "The image size is too large for file format '%s'"
5724 "%s", fmt, cluster_size_hint);
5725 error_free(local_err);
5726 local_err = NULL;
f88e1a42
JS
5727 }
5728
5729out:
83d0521a
CL
5730 qemu_opts_del(opts);
5731 qemu_opts_free(create_opts);
84d18f06 5732 if (local_err) {
cc84d90f
HR
5733 error_propagate(errp, local_err);
5734 }
f88e1a42 5735}
85d126f3
SH
5736
5737AioContext *bdrv_get_aio_context(BlockDriverState *bs)
5738{
dcd04228
SH
5739 return bs->aio_context;
5740}
5741
5742void bdrv_detach_aio_context(BlockDriverState *bs)
5743{
33384421
HR
5744 BdrvAioNotifier *baf;
5745
dcd04228
SH
5746 if (!bs->drv) {
5747 return;
5748 }
5749
33384421
HR
5750 QLIST_FOREACH(baf, &bs->aio_notifiers, list) {
5751 baf->detach_aio_context(baf->opaque);
5752 }
5753
13af91eb
SH
5754 if (bs->io_limits_enabled) {
5755 throttle_detach_aio_context(&bs->throttle_state);
5756 }
dcd04228
SH
5757 if (bs->drv->bdrv_detach_aio_context) {
5758 bs->drv->bdrv_detach_aio_context(bs);
5759 }
5760 if (bs->file) {
5761 bdrv_detach_aio_context(bs->file);
5762 }
5763 if (bs->backing_hd) {
5764 bdrv_detach_aio_context(bs->backing_hd);
5765 }
5766
5767 bs->aio_context = NULL;
5768}
5769
5770void bdrv_attach_aio_context(BlockDriverState *bs,
5771 AioContext *new_context)
5772{
33384421
HR
5773 BdrvAioNotifier *ban;
5774
dcd04228
SH
5775 if (!bs->drv) {
5776 return;
5777 }
5778
5779 bs->aio_context = new_context;
5780
5781 if (bs->backing_hd) {
5782 bdrv_attach_aio_context(bs->backing_hd, new_context);
5783 }
5784 if (bs->file) {
5785 bdrv_attach_aio_context(bs->file, new_context);
5786 }
5787 if (bs->drv->bdrv_attach_aio_context) {
5788 bs->drv->bdrv_attach_aio_context(bs, new_context);
5789 }
13af91eb
SH
5790 if (bs->io_limits_enabled) {
5791 throttle_attach_aio_context(&bs->throttle_state, new_context);
5792 }
33384421
HR
5793
5794 QLIST_FOREACH(ban, &bs->aio_notifiers, list) {
5795 ban->attached_aio_context(new_context, ban->opaque);
5796 }
dcd04228
SH
5797}
5798
5799void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
5800{
5801 bdrv_drain_all(); /* ensure there are no in-flight requests */
5802
5803 bdrv_detach_aio_context(bs);
5804
5805 /* This function executes in the old AioContext so acquire the new one in
5806 * case it runs in a different thread.
5807 */
5808 aio_context_acquire(new_context);
5809 bdrv_attach_aio_context(bs, new_context);
5810 aio_context_release(new_context);
85d126f3 5811}
d616b224 5812
33384421
HR
5813void bdrv_add_aio_context_notifier(BlockDriverState *bs,
5814 void (*attached_aio_context)(AioContext *new_context, void *opaque),
5815 void (*detach_aio_context)(void *opaque), void *opaque)
5816{
5817 BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
5818 *ban = (BdrvAioNotifier){
5819 .attached_aio_context = attached_aio_context,
5820 .detach_aio_context = detach_aio_context,
5821 .opaque = opaque
5822 };
5823
5824 QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
5825}
5826
5827void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
5828 void (*attached_aio_context)(AioContext *,
5829 void *),
5830 void (*detach_aio_context)(void *),
5831 void *opaque)
5832{
5833 BdrvAioNotifier *ban, *ban_next;
5834
5835 QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
5836 if (ban->attached_aio_context == attached_aio_context &&
5837 ban->detach_aio_context == detach_aio_context &&
5838 ban->opaque == opaque)
5839 {
5840 QLIST_REMOVE(ban, list);
5841 g_free(ban);
5842
5843 return;
5844 }
5845 }
5846
5847 abort();
5848}
5849
d616b224
SH
5850void bdrv_add_before_write_notifier(BlockDriverState *bs,
5851 NotifierWithReturn *notifier)
5852{
5853 notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
5854}
6f176b48 5855
c282e1fd 5856int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts)
6f176b48 5857{
c282e1fd 5858 if (!bs->drv->bdrv_amend_options) {
6f176b48
HR
5859 return -ENOTSUP;
5860 }
c282e1fd 5861 return bs->drv->bdrv_amend_options(bs, opts);
6f176b48 5862}
f6186f49 5863
b5042a36
BC
5864/* This function will be called by the bdrv_recurse_is_first_non_filter method
5865 * of block filter and by bdrv_is_first_non_filter.
5866 * It is used to test if the given bs is the candidate or recurse more in the
5867 * node graph.
212a5a8f 5868 */
b5042a36 5869bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
212a5a8f 5870 BlockDriverState *candidate)
f6186f49 5871{
b5042a36
BC
5872 /* return false if basic checks fails */
5873 if (!bs || !bs->drv) {
212a5a8f 5874 return false;
f6186f49
BC
5875 }
5876
b5042a36
BC
5877 /* the code reached a non block filter driver -> check if the bs is
5878 * the same as the candidate. It's the recursion termination condition.
5879 */
5880 if (!bs->drv->is_filter) {
5881 return bs == candidate;
212a5a8f 5882 }
b5042a36 5883 /* Down this path the driver is a block filter driver */
212a5a8f 5884
b5042a36
BC
5885 /* If the block filter recursion method is defined use it to recurse down
5886 * the node graph.
5887 */
5888 if (bs->drv->bdrv_recurse_is_first_non_filter) {
212a5a8f 5889 return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
f6186f49
BC
5890 }
5891
b5042a36
BC
5892 /* the driver is a block filter but don't allow to recurse -> return false
5893 */
5894 return false;
f6186f49
BC
5895}
5896
212a5a8f
BC
5897/* This function checks if the candidate is the first non filter bs down it's
5898 * bs chain. Since we don't have pointers to parents it explore all bs chains
5899 * from the top. Some filters can choose not to pass down the recursion.
5900 */
5901bool bdrv_is_first_non_filter(BlockDriverState *candidate)
f6186f49 5902{
212a5a8f
BC
5903 BlockDriverState *bs;
5904
5905 /* walk down the bs forest recursively */
5906 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5907 bool perm;
5908
b5042a36 5909 /* try to recurse in this top level bs */
e6dc8a1f 5910 perm = bdrv_recurse_is_first_non_filter(bs, candidate);
212a5a8f
BC
5911
5912 /* candidate is the first non filter */
5913 if (perm) {
5914 return true;
5915 }
5916 }
5917
5918 return false;
f6186f49 5919}
09158f00
BC
5920
5921BlockDriverState *check_to_replace_node(const char *node_name, Error **errp)
5922{
5923 BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
5924 if (!to_replace_bs) {
5925 error_setg(errp, "Node name '%s' not found", node_name);
5926 return NULL;
5927 }
5928
5929 if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
5930 return NULL;
5931 }
5932
5933 /* We don't want arbitrary node of the BDS chain to be replaced only the top
5934 * most non filter in order to prevent data corruption.
5935 * Another benefit is that this tests exclude backing files which are
5936 * blocked by the backing blockers.
5937 */
5938 if (!bdrv_is_first_non_filter(to_replace_bs)) {
5939 error_setg(errp, "Only top most non filter can be replaced");
5940 return NULL;
5941 }
5942
5943 return to_replace_bs;
5944}
448ad91d
ML
5945
5946void bdrv_io_plug(BlockDriverState *bs)
5947{
5948 BlockDriver *drv = bs->drv;
5949 if (drv && drv->bdrv_io_plug) {
5950 drv->bdrv_io_plug(bs);
5951 } else if (bs->file) {
5952 bdrv_io_plug(bs->file);
5953 }
5954}
5955
5956void bdrv_io_unplug(BlockDriverState *bs)
5957{
5958 BlockDriver *drv = bs->drv;
5959 if (drv && drv->bdrv_io_unplug) {
5960 drv->bdrv_io_unplug(bs);
5961 } else if (bs->file) {
5962 bdrv_io_unplug(bs->file);
5963 }
5964}
5965
5966void bdrv_flush_io_queue(BlockDriverState *bs)
5967{
5968 BlockDriver *drv = bs->drv;
5969 if (drv && drv->bdrv_flush_io_queue) {
5970 drv->bdrv_flush_io_queue(bs);
5971 } else if (bs->file) {
5972 bdrv_flush_io_queue(bs->file);
5973 }
5974}
91af7014
HR
5975
5976static bool append_open_options(QDict *d, BlockDriverState *bs)
5977{
5978 const QDictEntry *entry;
5979 bool found_any = false;
5980
5981 for (entry = qdict_first(bs->options); entry;
5982 entry = qdict_next(bs->options, entry))
5983 {
5984 /* Only take options for this level and exclude all non-driver-specific
5985 * options */
5986 if (!strchr(qdict_entry_key(entry), '.') &&
5987 strcmp(qdict_entry_key(entry), "node-name"))
5988 {
5989 qobject_incref(qdict_entry_value(entry));
5990 qdict_put_obj(d, qdict_entry_key(entry), qdict_entry_value(entry));
5991 found_any = true;
5992 }
5993 }
5994
5995 return found_any;
5996}
5997
/* Updates the following BDS fields:
 * - exact_filename: A filename which may be used for opening a block device
 *                   which (mostly) equals the given BDS (even without any
 *                   other options; so reading and writing must return the same
 *                   results, but caching etc. may be different)
 * - full_open_options: Options which, when given when opening a block device
 *                      (without a filename), result in a BDS (mostly)
 *                      equalling the given one
 * - filename: If exact_filename is set, it is copied here. Otherwise,
 *             full_open_options is converted to a JSON object, prefixed with
 *             "json:" (for use through the JSON pseudo protocol) and put here.
 */
void bdrv_refresh_filename(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    QDict *opts;

    /* Without a driver there is nothing to derive a filename from */
    if (!drv) {
        return;
    }

    /* This BDS's file name will most probably depend on its file's name, so
     * refresh that first */
    if (bs->file) {
        bdrv_refresh_filename(bs->file);
    }

    if (drv->bdrv_refresh_filename) {
        /* Obsolete information is of no use here, so drop the old file name
     * information before refreshing it */
        bs->exact_filename[0] = '\0';
        if (bs->full_open_options) {
            QDECREF(bs->full_open_options);
            bs->full_open_options = NULL;
        }

        /* The driver knows best how to describe itself */
        drv->bdrv_refresh_filename(bs);
    } else if (bs->file) {
        /* Try to reconstruct valid information from the underlying file */
        bool has_open_options;

        /* As above: stale data must not leak into the refreshed result */
        bs->exact_filename[0] = '\0';
        if (bs->full_open_options) {
            QDECREF(bs->full_open_options);
            bs->full_open_options = NULL;
        }

        opts = qdict_new();
        has_open_options = append_open_options(opts, bs);

        /* If no specific options have been given for this BDS, the filename of
         * the underlying file should suffice for this one as well */
        if (bs->file->exact_filename[0] && !has_open_options) {
            /* exact_filename is a fixed-size char array in both BDSes, so
             * strcpy cannot overflow here */
            strcpy(bs->exact_filename, bs->file->exact_filename);
        }
        /* Reconstructing the full options QDict is simple for most format block
         * drivers, as long as the full options are known for the underlying
         * file BDS. The full options QDict of that file BDS should somehow
         * contain a representation of the filename, therefore the following
         * suffices without querying the (exact_)filename of this BDS. */
        if (bs->file->full_open_options) {
            qdict_put_obj(opts, "driver",
                          QOBJECT(qstring_from_str(drv->format_name)));
            /* qdict_put_obj() steals a reference, so take an extra one to
             * keep bs->file->full_open_options alive independently */
            QINCREF(bs->file->full_open_options);
            qdict_put_obj(opts, "file", QOBJECT(bs->file->full_open_options));

            bs->full_open_options = opts;
        } else {
            /* Nothing usable could be reconstructed; discard the dict */
            QDECREF(opts);
        }
    } else if (!bs->full_open_options && qdict_size(bs->options)) {
        /* There is no underlying file BDS (at least referenced by BDS.file),
         * so the full options QDict should be equal to the options given
         * specifically for this block device when it was opened (plus the
         * driver specification).
         * Because those options don't change, there is no need to update
         * full_open_options when it's already set. */

        opts = qdict_new();
        append_open_options(opts, bs);
        qdict_put_obj(opts, "driver",
                      QOBJECT(qstring_from_str(drv->format_name)));

        if (bs->exact_filename[0]) {
            /* This may not work for all block protocol drivers (some may
             * require this filename to be parsed), but we have to find some
             * default solution here, so just include it. If some block driver
             * does not support pure options without any filename at all or
             * needs some special format of the options QDict, it needs to
             * implement the driver-specific bdrv_refresh_filename() function.
             */
            qdict_put_obj(opts, "filename",
                          QOBJECT(qstring_from_str(bs->exact_filename)));
        }

        bs->full_open_options = opts;
    }

    /* Finally derive the user-visible filename from whichever of the two
     * refreshed fields is available; prefer the exact filename */
    if (bs->exact_filename[0]) {
        pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
    } else if (bs->full_open_options) {
        QString *json = qobject_to_json(QOBJECT(bs->full_open_options));
        snprintf(bs->filename, sizeof(bs->filename), "json:%s",
                 qstring_get_str(json));
        QDECREF(json);
    }
}
5366d0c8
BC
6105
6106/* This accessor function purpose is to allow the device models to access the
6107 * BlockAcctStats structure embedded inside a BlockDriverState without being
6108 * aware of the BlockDriverState structure layout.
6109 * It will go away when the BlockAcctStats structure will be moved inside
6110 * the device models.
6111 */
6112BlockAcctStats *bdrv_get_stats(BlockDriverState *bs)
6113{
6114 return &bs->stats;
6115}