]> git.proxmox.com Git - mirror_qemu.git/blame - block.c
hw/input/tsc210x.c: Delete unused array tsc2101_rates
[mirror_qemu.git] / block.c
CommitLineData
fc01f7e7
FB
1/*
2 * QEMU System Emulator block driver
5fafdf24 3 *
fc01f7e7 4 * Copyright (c) 2003 Fabrice Bellard
5fafdf24 5 *
fc01f7e7
FB
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
3990d09a 24#include "config-host.h"
faf07963 25#include "qemu-common.h"
6d519a5f 26#include "trace.h"
737e150e
PB
27#include "block/block_int.h"
28#include "block/blockjob.h"
1de7afc9 29#include "qemu/module.h"
7b1b5d19 30#include "qapi/qmp/qjson.h"
9c17d615 31#include "sysemu/sysemu.h"
3ae59580 32#include "sysemu/blockdev.h" /* FIXME layering violation */
1de7afc9 33#include "qemu/notify.h"
737e150e 34#include "block/coroutine.h"
c13163fb 35#include "block/qapi.h"
b2023818 36#include "qmp-commands.h"
1de7afc9 37#include "qemu/timer.h"
a5ee7bd4 38#include "qapi-event.h"
fc01f7e7 39
71e72a19 40#ifdef CONFIG_BSD
7674e7bf
FB
41#include <sys/types.h>
42#include <sys/stat.h>
43#include <sys/ioctl.h>
72cf2d4f 44#include <sys/queue.h>
c5e97233 45#ifndef __DragonFly__
7674e7bf
FB
46#include <sys/disk.h>
47#endif
c5e97233 48#endif
7674e7bf 49
49dc768d
AL
50#ifdef _WIN32
51#include <windows.h>
52#endif
53
e4654d2d
FZ
54struct BdrvDirtyBitmap {
55 HBitmap *bitmap;
56 QLIST_ENTRY(BdrvDirtyBitmap) list;
57};
58
/*
 * Placeholder result value: used while an emulated sync operation is still
 * in progress.
 */
#define NOT_DONE 0x7fffffff

/* Number of coroutines to reserve. */
#define COROUTINE_POOL_RESERVATION 64
62
7d4b4ba5 63static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
f141eafe
AL
64static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
65 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
c87c0672 66 BlockDriverCompletionFunc *cb, void *opaque);
f141eafe
AL
67static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
68 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
ce1a14dc 69 BlockDriverCompletionFunc *cb, void *opaque);
f9f05dc5
KW
70static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
71 int64_t sector_num, int nb_sectors,
72 QEMUIOVector *iov);
73static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
74 int64_t sector_num, int nb_sectors,
75 QEMUIOVector *iov);
775aa8b6
KW
76static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
77 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
470c0504 78 BdrvRequestFlags flags);
775aa8b6
KW
79static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
80 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
f08f2dda 81 BdrvRequestFlags flags);
b2a61371
SH
82static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
83 int64_t sector_num,
84 QEMUIOVector *qiov,
85 int nb_sectors,
d20d9b7c 86 BdrvRequestFlags flags,
b2a61371
SH
87 BlockDriverCompletionFunc *cb,
88 void *opaque,
8c5873d6 89 bool is_write);
b2a61371 90static void coroutine_fn bdrv_co_do_rw(void *opaque);
621f0589 91static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
aa7bfbff 92 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
ec530c81 93
1b7bdbc1
SH
94static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
95 QTAILQ_HEAD_INITIALIZER(bdrv_states);
7ee930d0 96
dc364f4c
BC
97static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
98 QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);
99
8a22f02a
SH
100static QLIST_HEAD(, BlockDriver) bdrv_drivers =
101 QLIST_HEAD_INITIALIZER(bdrv_drivers);
ea2384d3 102
eb852011
MA
103/* If non-zero, use only whitelisted block drivers */
104static int use_bdrv_whitelist;
105
#ifdef _WIN32
/* Return non-zero if @filename starts with an ASCII letter followed by ':'. */
static int is_windows_drive_prefix(const char *filename)
{
    const char letter = filename[0];
    const int is_letter = (letter >= 'a' && letter <= 'z') ||
                          (letter >= 'A' && letter <= 'Z');

    /* filename[1] is only inspected once the first character matched */
    return is_letter && filename[1] == ':';
}

/*
 * Return 1 if @filename is exactly a drive prefix ("<letter>:") or starts
 * with one of the two device prefixes checked below, otherwise 0.
 */
int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) && filename[2] == '\0') {
        return 1;
    }
    if (strstart(filename, "\\\\.\\", NULL)) {
        return 1;
    }
    if (strstart(filename, "//./", NULL)) {
        return 1;
    }
    return 0;
}
#endif
125
0563e191 126/* throttling disk I/O limits */
cc0681c4
BC
127void bdrv_set_io_limits(BlockDriverState *bs,
128 ThrottleConfig *cfg)
98f90dba 129{
cc0681c4 130 int i;
98f90dba 131
cc0681c4 132 throttle_config(&bs->throttle_state, cfg);
98f90dba 133
cc0681c4
BC
134 for (i = 0; i < 2; i++) {
135 qemu_co_enter_next(&bs->throttled_reqs[i]);
98f90dba 136 }
cc0681c4
BC
137}
138
139/* this function drain all the throttled IOs */
140static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
141{
142 bool drained = false;
143 bool enabled = bs->io_limits_enabled;
144 int i;
145
146 bs->io_limits_enabled = false;
147
148 for (i = 0; i < 2; i++) {
149 while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
150 drained = true;
151 }
152 }
153
154 bs->io_limits_enabled = enabled;
98f90dba 155
cc0681c4 156 return drained;
98f90dba
ZYW
157}
158
cc0681c4 159void bdrv_io_limits_disable(BlockDriverState *bs)
0563e191 160{
cc0681c4 161 bs->io_limits_enabled = false;
0563e191 162
cc0681c4
BC
163 bdrv_start_throttled_reqs(bs);
164
165 throttle_destroy(&bs->throttle_state);
0563e191
ZYW
166}
167
cc0681c4 168static void bdrv_throttle_read_timer_cb(void *opaque)
0563e191 169{
cc0681c4
BC
170 BlockDriverState *bs = opaque;
171 qemu_co_enter_next(&bs->throttled_reqs[0]);
0563e191
ZYW
172}
173
cc0681c4 174static void bdrv_throttle_write_timer_cb(void *opaque)
0563e191 175{
cc0681c4
BC
176 BlockDriverState *bs = opaque;
177 qemu_co_enter_next(&bs->throttled_reqs[1]);
0563e191
ZYW
178}
179
cc0681c4
BC
180/* should be called before bdrv_set_io_limits if a limit is set */
181void bdrv_io_limits_enable(BlockDriverState *bs)
182{
183 assert(!bs->io_limits_enabled);
184 throttle_init(&bs->throttle_state,
13af91eb 185 bdrv_get_aio_context(bs),
cc0681c4
BC
186 QEMU_CLOCK_VIRTUAL,
187 bdrv_throttle_read_timer_cb,
188 bdrv_throttle_write_timer_cb,
189 bs);
190 bs->io_limits_enabled = true;
191}
192
193/* This function makes an IO wait if needed
194 *
195 * @nb_sectors: the number of sectors of the IO
196 * @is_write: is the IO a write
197 */
98f90dba 198static void bdrv_io_limits_intercept(BlockDriverState *bs,
d5103588 199 unsigned int bytes,
cc0681c4 200 bool is_write)
98f90dba 201{
cc0681c4
BC
202 /* does this io must wait */
203 bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
98f90dba 204
cc0681c4
BC
205 /* if must wait or any request of this type throttled queue the IO */
206 if (must_wait ||
207 !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
208 qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
98f90dba
ZYW
209 }
210
cc0681c4 211 /* the IO will be executed, do the accounting */
d5103588
KW
212 throttle_account(&bs->throttle_state, is_write, bytes);
213
98f90dba 214
cc0681c4
BC
215 /* if the next request must wait -> do nothing */
216 if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
217 return;
98f90dba
ZYW
218 }
219
cc0681c4
BC
220 /* else queue next request for execution */
221 qemu_co_queue_next(&bs->throttled_reqs[is_write]);
98f90dba
ZYW
222}
223
339064d5
KW
224size_t bdrv_opt_mem_align(BlockDriverState *bs)
225{
226 if (!bs || !bs->drv) {
227 /* 4k should be on the safe side */
228 return 4096;
229 }
230
231 return bs->bl.opt_mem_alignment;
232}
233
/*
 * Check whether @path starts with "<protocol>:", i.e. whether a ':' occurs
 * before the first path separator.  On Windows, drive names are never
 * treated as a protocol.
 *
 * Returns 1 if a protocol prefix is present, 0 otherwise.
 */
static int path_has_protocol(const char *path)
{
    size_t prefix_len;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    prefix_len = strcspn(path, ":/\\");
#else
    prefix_len = strcspn(path, ":/");
#endif

    /* the scan stopped at ':', at a separator, or at the terminating NUL */
    return path[prefix_len] == ':';
}
251
/*
 * Return 1 if @path is absolute, 0 otherwise.  A path is absolute when it
 * starts with '/'; on Windows a leading '\' and drive names count as well.
 */
int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    if (path[0] == '\\') {
        return 1;
    }
#endif
    return path[0] == '/';
}
264
/*
 * Build in @dest (capacity @dest_size) the path of @filename as seen from
 * @base_path.
 *
 * If @filename is absolute it is copied unchanged.  Otherwise the leading
 * part of @base_path, up to and including its last path separator or its
 * first ':' (whichever lies further right), is copied and @filename is
 * appended.  URLs are supported.  Nothing is written if @dest_size <= 0.
 */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *sep;
    const char *after_colon;
    const char *after_dir;
    int prefix_len;

    if (dest_size <= 0) {
        return;
    }
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
        return;
    }

    /* end of a "<protocol>:" style prefix, if any */
    sep = strchr(base_path, ':');
    after_colon = sep ? sep + 1 : base_path;

    /* end of the directory part, if any */
    sep = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *backslash = strrchr(base_path, '\\');
        if (!sep || backslash > sep) {
            sep = backslash;
        }
    }
#endif
    after_dir = sep ? sep + 1 : base_path;

    /* keep the longer of the two prefixes, truncated to fit @dest */
    prefix_len = (after_dir > after_colon ? after_dir : after_colon)
                 - base_path;
    if (prefix_len > dest_size - 1) {
        prefix_len = dest_size - 1;
    }
    memcpy(dest, base_path, prefix_len);
    dest[prefix_len] = '\0';
    pstrcat(dest, dest_size, filename);
}
308
dc5a1371
PB
309void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
310{
311 if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
312 pstrcpy(dest, sz, bs->backing_file);
313 } else {
314 path_combine(dest, sz, bs->filename, bs->backing_file);
315 }
316}
317
5efa9d5a 318void bdrv_register(BlockDriver *bdrv)
ea2384d3 319{
8c5873d6
SH
320 /* Block drivers without coroutine functions need emulation */
321 if (!bdrv->bdrv_co_readv) {
f9f05dc5
KW
322 bdrv->bdrv_co_readv = bdrv_co_readv_em;
323 bdrv->bdrv_co_writev = bdrv_co_writev_em;
324
f8c35c1d
SH
325 /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
326 * the block driver lacks aio we need to emulate that too.
327 */
f9f05dc5
KW
328 if (!bdrv->bdrv_aio_readv) {
329 /* add AIO emulation layer */
330 bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
331 bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
f9f05dc5 332 }
83f64091 333 }
b2e12bc6 334
8a22f02a 335 QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
ea2384d3 336}
b338082b 337
/* A device or node name is valid iff qemu_opts_id_wellformed() accepts it. */
static bool bdrv_is_valid_name(const char *name)
{
    const bool wellformed = qemu_opts_id_wellformed(name);

    return wellformed;
}
342
b338082b 343/* create a new block device (by default it is empty) */
98522f63 344BlockDriverState *bdrv_new(const char *device_name, Error **errp)
b338082b 345{
1b7bdbc1 346 BlockDriverState *bs;
fbe40ff7 347 int i;
b338082b 348
9aebf3b8
KW
349 if (*device_name && !bdrv_is_valid_name(device_name)) {
350 error_setg(errp, "Invalid device name");
351 return NULL;
352 }
353
f2d953ec
KW
354 if (bdrv_find(device_name)) {
355 error_setg(errp, "Device with id '%s' already exists",
356 device_name);
357 return NULL;
358 }
359 if (bdrv_find_node(device_name)) {
d224469d
MA
360 error_setg(errp,
361 "Device name '%s' conflicts with an existing node name",
f2d953ec
KW
362 device_name);
363 return NULL;
364 }
365
5839e53b 366 bs = g_new0(BlockDriverState, 1);
e4654d2d 367 QLIST_INIT(&bs->dirty_bitmaps);
b338082b 368 pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
ea2384d3 369 if (device_name[0] != '\0') {
dc364f4c 370 QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
ea2384d3 371 }
fbe40ff7
FZ
372 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
373 QLIST_INIT(&bs->op_blockers[i]);
374 }
28a7282a 375 bdrv_iostatus_disable(bs);
d7d512f6 376 notifier_list_init(&bs->close_notifiers);
d616b224 377 notifier_with_return_list_init(&bs->before_write_notifiers);
cc0681c4
BC
378 qemu_co_queue_init(&bs->throttled_reqs[0]);
379 qemu_co_queue_init(&bs->throttled_reqs[1]);
9fcb0251 380 bs->refcnt = 1;
dcd04228 381 bs->aio_context = qemu_get_aio_context();
d7d512f6 382
b338082b
FB
383 return bs;
384}
385
d7d512f6
PB
386void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
387{
388 notifier_list_add(&bs->close_notifiers, notify);
389}
390
ea2384d3
FB
391BlockDriver *bdrv_find_format(const char *format_name)
392{
393 BlockDriver *drv1;
8a22f02a
SH
394 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
395 if (!strcmp(drv1->format_name, format_name)) {
ea2384d3 396 return drv1;
8a22f02a 397 }
ea2384d3
FB
398 }
399 return NULL;
400}
401
b64ec4e4 402static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
eb852011 403{
b64ec4e4
FZ
404 static const char *whitelist_rw[] = {
405 CONFIG_BDRV_RW_WHITELIST
406 };
407 static const char *whitelist_ro[] = {
408 CONFIG_BDRV_RO_WHITELIST
eb852011
MA
409 };
410 const char **p;
411
b64ec4e4 412 if (!whitelist_rw[0] && !whitelist_ro[0]) {
eb852011 413 return 1; /* no whitelist, anything goes */
b64ec4e4 414 }
eb852011 415
b64ec4e4 416 for (p = whitelist_rw; *p; p++) {
eb852011
MA
417 if (!strcmp(drv->format_name, *p)) {
418 return 1;
419 }
420 }
b64ec4e4
FZ
421 if (read_only) {
422 for (p = whitelist_ro; *p; p++) {
423 if (!strcmp(drv->format_name, *p)) {
424 return 1;
425 }
426 }
427 }
eb852011
MA
428 return 0;
429}
430
b64ec4e4
FZ
431BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
432 bool read_only)
eb852011
MA
433{
434 BlockDriver *drv = bdrv_find_format(format_name);
b64ec4e4 435 return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
eb852011
MA
436}
437
5b7e1542
ZYW
438typedef struct CreateCo {
439 BlockDriver *drv;
440 char *filename;
83d0521a 441 QemuOpts *opts;
5b7e1542 442 int ret;
cc84d90f 443 Error *err;
5b7e1542
ZYW
444} CreateCo;
445
446static void coroutine_fn bdrv_create_co_entry(void *opaque)
447{
cc84d90f
HR
448 Error *local_err = NULL;
449 int ret;
450
5b7e1542
ZYW
451 CreateCo *cco = opaque;
452 assert(cco->drv);
453
c282e1fd 454 ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err);
84d18f06 455 if (local_err) {
cc84d90f
HR
456 error_propagate(&cco->err, local_err);
457 }
458 cco->ret = ret;
5b7e1542
ZYW
459}
460
0e7e1989 461int bdrv_create(BlockDriver *drv, const char* filename,
83d0521a 462 QemuOpts *opts, Error **errp)
ea2384d3 463{
5b7e1542
ZYW
464 int ret;
465
466 Coroutine *co;
467 CreateCo cco = {
468 .drv = drv,
469 .filename = g_strdup(filename),
83d0521a 470 .opts = opts,
5b7e1542 471 .ret = NOT_DONE,
cc84d90f 472 .err = NULL,
5b7e1542
ZYW
473 };
474
c282e1fd 475 if (!drv->bdrv_create) {
cc84d90f 476 error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
80168bff
LC
477 ret = -ENOTSUP;
478 goto out;
5b7e1542
ZYW
479 }
480
481 if (qemu_in_coroutine()) {
482 /* Fast-path if already in coroutine context */
483 bdrv_create_co_entry(&cco);
484 } else {
485 co = qemu_coroutine_create(bdrv_create_co_entry);
486 qemu_coroutine_enter(co, &cco);
487 while (cco.ret == NOT_DONE) {
b47ec2c4 488 aio_poll(qemu_get_aio_context(), true);
5b7e1542
ZYW
489 }
490 }
491
492 ret = cco.ret;
cc84d90f 493 if (ret < 0) {
84d18f06 494 if (cco.err) {
cc84d90f
HR
495 error_propagate(errp, cco.err);
496 } else {
497 error_setg_errno(errp, -ret, "Could not create image");
498 }
499 }
0e7e1989 500
80168bff
LC
501out:
502 g_free(cco.filename);
5b7e1542 503 return ret;
ea2384d3
FB
504}
505
c282e1fd 506int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
84a12e66
CH
507{
508 BlockDriver *drv;
cc84d90f
HR
509 Error *local_err = NULL;
510 int ret;
84a12e66 511
98289620 512 drv = bdrv_find_protocol(filename, true);
84a12e66 513 if (drv == NULL) {
cc84d90f 514 error_setg(errp, "Could not find protocol for file '%s'", filename);
16905d71 515 return -ENOENT;
84a12e66
CH
516 }
517
c282e1fd 518 ret = bdrv_create(drv, filename, opts, &local_err);
84d18f06 519 if (local_err) {
cc84d90f
HR
520 error_propagate(errp, local_err);
521 }
522 return ret;
84a12e66
CH
523}
524
3baca891 525void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
d34682cd
KW
526{
527 BlockDriver *drv = bs->drv;
3baca891 528 Error *local_err = NULL;
d34682cd
KW
529
530 memset(&bs->bl, 0, sizeof(bs->bl));
531
466ad822 532 if (!drv) {
3baca891 533 return;
466ad822
KW
534 }
535
536 /* Take some limits from the children as a default */
537 if (bs->file) {
3baca891
KW
538 bdrv_refresh_limits(bs->file, &local_err);
539 if (local_err) {
540 error_propagate(errp, local_err);
541 return;
542 }
466ad822 543 bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
339064d5
KW
544 bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
545 } else {
546 bs->bl.opt_mem_alignment = 512;
466ad822
KW
547 }
548
549 if (bs->backing_hd) {
3baca891
KW
550 bdrv_refresh_limits(bs->backing_hd, &local_err);
551 if (local_err) {
552 error_propagate(errp, local_err);
553 return;
554 }
466ad822
KW
555 bs->bl.opt_transfer_length =
556 MAX(bs->bl.opt_transfer_length,
557 bs->backing_hd->bl.opt_transfer_length);
339064d5
KW
558 bs->bl.opt_mem_alignment =
559 MAX(bs->bl.opt_mem_alignment,
560 bs->backing_hd->bl.opt_mem_alignment);
466ad822
KW
561 }
562
563 /* Then let the driver override it */
564 if (drv->bdrv_refresh_limits) {
3baca891 565 drv->bdrv_refresh_limits(bs, errp);
d34682cd 566 }
d34682cd
KW
567}
568
/*
 * Create a uniquely-named empty temporary file.
 *
 * @filename: buffer that receives the name of the created file
 * @size:     capacity of @filename in bytes (on Windows it must be at least
 *            MAX_PATH)
 *
 * On POSIX hosts the file is created in $TMPDIR, or in /var/tmp if TMPDIR is
 * not set.
 *
 * Return 0 upon success, otherwise a negative errno value (on Windows the
 * negated GetLastError() code).  -EOVERFLOW is returned when the name does
 * not fit into @filename.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    int close_errno;
    const char *tmpdir;

    /* A non-positive size would reach snprintf() as a huge size_t. */
    if (size <= 0) {
        return -EOVERFLOW;
    }

    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        /* Save errno first: unlink() may overwrite it, and the caller must
         * see the reason close() failed. */
        close_errno = errno;
        unlink(filename);
        return -close_errno;
    }
    return 0;
#endif
}
fc01f7e7 604
84a12e66
CH
605/*
606 * Detect host devices. By convention, /dev/cdrom[N] is always
607 * recognized as a host CDROM.
608 */
609static BlockDriver *find_hdev_driver(const char *filename)
610{
611 int score_max = 0, score;
612 BlockDriver *drv = NULL, *d;
613
614 QLIST_FOREACH(d, &bdrv_drivers, list) {
615 if (d->bdrv_probe_device) {
616 score = d->bdrv_probe_device(filename);
617 if (score > score_max) {
618 score_max = score;
619 drv = d;
620 }
621 }
622 }
623
624 return drv;
625}
626
98289620
KW
627BlockDriver *bdrv_find_protocol(const char *filename,
628 bool allow_protocol_prefix)
83f64091
FB
629{
630 BlockDriver *drv1;
631 char protocol[128];
1cec71e3 632 int len;
83f64091 633 const char *p;
19cb3738 634
66f82cee
KW
635 /* TODO Drivers without bdrv_file_open must be specified explicitly */
636
39508e7a
CH
637 /*
638 * XXX(hch): we really should not let host device detection
639 * override an explicit protocol specification, but moving this
640 * later breaks access to device names with colons in them.
641 * Thanks to the brain-dead persistent naming schemes on udev-
642 * based Linux systems those actually are quite common.
643 */
644 drv1 = find_hdev_driver(filename);
645 if (drv1) {
646 return drv1;
647 }
648
98289620 649 if (!path_has_protocol(filename) || !allow_protocol_prefix) {
39508e7a 650 return bdrv_find_format("file");
84a12e66 651 }
98289620 652
9e0b22f4
SH
653 p = strchr(filename, ':');
654 assert(p != NULL);
1cec71e3
AL
655 len = p - filename;
656 if (len > sizeof(protocol) - 1)
657 len = sizeof(protocol) - 1;
658 memcpy(protocol, filename, len);
659 protocol[len] = '\0';
8a22f02a 660 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
5fafdf24 661 if (drv1->protocol_name &&
8a22f02a 662 !strcmp(drv1->protocol_name, protocol)) {
83f64091 663 return drv1;
8a22f02a 664 }
83f64091
FB
665 }
666 return NULL;
667}
668
f500a6d3 669static int find_image_format(BlockDriverState *bs, const char *filename,
34b5d2c6 670 BlockDriver **pdrv, Error **errp)
f3a5d3f8 671{
f500a6d3 672 int score, score_max;
f3a5d3f8
CH
673 BlockDriver *drv1, *drv;
674 uint8_t buf[2048];
f500a6d3 675 int ret = 0;
f8ea0b00 676
08a00559 677 /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
8e895599 678 if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
c98ac35d
SW
679 drv = bdrv_find_format("raw");
680 if (!drv) {
34b5d2c6 681 error_setg(errp, "Could not find raw image format");
c98ac35d
SW
682 ret = -ENOENT;
683 }
684 *pdrv = drv;
685 return ret;
1a396859 686 }
f8ea0b00 687
83f64091 688 ret = bdrv_pread(bs, 0, buf, sizeof(buf));
83f64091 689 if (ret < 0) {
34b5d2c6
HR
690 error_setg_errno(errp, -ret, "Could not read image for determining its "
691 "format");
c98ac35d
SW
692 *pdrv = NULL;
693 return ret;
83f64091
FB
694 }
695
ea2384d3 696 score_max = 0;
84a12e66 697 drv = NULL;
8a22f02a 698 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
83f64091
FB
699 if (drv1->bdrv_probe) {
700 score = drv1->bdrv_probe(buf, ret, filename);
701 if (score > score_max) {
702 score_max = score;
703 drv = drv1;
704 }
0849bf08 705 }
fc01f7e7 706 }
c98ac35d 707 if (!drv) {
34b5d2c6
HR
708 error_setg(errp, "Could not determine image format: No compatible "
709 "driver found");
c98ac35d
SW
710 ret = -ENOENT;
711 }
712 *pdrv = drv;
713 return ret;
ea2384d3
FB
714}
715
51762288
SH
716/**
717 * Set the current 'total_sectors' value
65a9bb25 718 * Return 0 on success, -errno on error.
51762288
SH
719 */
720static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
721{
722 BlockDriver *drv = bs->drv;
723
396759ad
NB
724 /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
725 if (bs->sg)
726 return 0;
727
51762288
SH
728 /* query actual device if possible, otherwise just trust the hint */
729 if (drv->bdrv_getlength) {
730 int64_t length = drv->bdrv_getlength(bs);
731 if (length < 0) {
732 return length;
733 }
7e382003 734 hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
51762288
SH
735 }
736
737 bs->total_sectors = hint;
738 return 0;
739}
740
9e8f1835
PB
741/**
742 * Set open flags for a given discard mode
743 *
744 * Return 0 on success, -1 if the discard mode was invalid.
745 */
746int bdrv_parse_discard_flags(const char *mode, int *flags)
747{
748 *flags &= ~BDRV_O_UNMAP;
749
750 if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
751 /* do nothing */
752 } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
753 *flags |= BDRV_O_UNMAP;
754 } else {
755 return -1;
756 }
757
758 return 0;
759}
760
c3993cdc
SH
761/**
762 * Set open flags for a given cache mode
763 *
764 * Return 0 on success, -1 if the cache mode was invalid.
765 */
766int bdrv_parse_cache_flags(const char *mode, int *flags)
767{
768 *flags &= ~BDRV_O_CACHE_MASK;
769
770 if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
771 *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
92196b2f
SH
772 } else if (!strcmp(mode, "directsync")) {
773 *flags |= BDRV_O_NOCACHE;
c3993cdc
SH
774 } else if (!strcmp(mode, "writeback")) {
775 *flags |= BDRV_O_CACHE_WB;
776 } else if (!strcmp(mode, "unsafe")) {
777 *flags |= BDRV_O_CACHE_WB;
778 *flags |= BDRV_O_NO_FLUSH;
779 } else if (!strcmp(mode, "writethrough")) {
780 /* this is the default */
781 } else {
782 return -1;
783 }
784
785 return 0;
786}
787
53fec9d3
SH
788/**
789 * The copy-on-read flag is actually a reference count so multiple users may
790 * use the feature without worrying about clobbering its previous state.
791 * Copy-on-read stays enabled until all users have called to disable it.
792 */
793void bdrv_enable_copy_on_read(BlockDriverState *bs)
794{
795 bs->copy_on_read++;
796}
797
798void bdrv_disable_copy_on_read(BlockDriverState *bs)
799{
800 assert(bs->copy_on_read > 0);
801 bs->copy_on_read--;
802}
803
b1e6fc08
KW
804/*
805 * Returns the flags that a temporary snapshot should get, based on the
806 * originally requested flags (the originally requested image will have flags
807 * like a backing file)
808 */
809static int bdrv_temp_snapshot_flags(int flags)
810{
811 return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
812}
813
0b50cc88
KW
814/*
815 * Returns the flags that bs->file should get, based on the given flags for
816 * the parent BDS
817 */
818static int bdrv_inherited_flags(int flags)
819{
820 /* Enable protocol handling, disable format probing for bs->file */
821 flags |= BDRV_O_PROTOCOL;
822
823 /* Our block drivers take care to send flushes and respect unmap policy,
824 * so we can enable both unconditionally on lower layers. */
825 flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;
826
0b50cc88 827 /* Clear flags that only apply to the top layer */
5669b44d 828 flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);
0b50cc88
KW
829
830 return flags;
831}
832
317fc44e
KW
833/*
834 * Returns the flags that bs->backing_hd should get, based on the given flags
835 * for the parent BDS
836 */
837static int bdrv_backing_flags(int flags)
838{
839 /* backing files always opened read-only */
840 flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);
841
842 /* snapshot=on is handled on the top layer */
8bfea15d 843 flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);
317fc44e
KW
844
845 return flags;
846}
847
7b272452
KW
848static int bdrv_open_flags(BlockDriverState *bs, int flags)
849{
850 int open_flags = flags | BDRV_O_CACHE_WB;
851
852 /*
853 * Clear flags that are internal to the block layer before opening the
854 * image.
855 */
20cca275 856 open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);
7b272452
KW
857
858 /*
859 * Snapshots should be writable.
860 */
8bfea15d 861 if (flags & BDRV_O_TEMPORARY) {
7b272452
KW
862 open_flags |= BDRV_O_RDWR;
863 }
864
865 return open_flags;
866}
867
636ea370
KW
868static void bdrv_assign_node_name(BlockDriverState *bs,
869 const char *node_name,
870 Error **errp)
6913c0c2
BC
871{
872 if (!node_name) {
636ea370 873 return;
6913c0c2
BC
874 }
875
9aebf3b8
KW
876 /* Check for empty string or invalid characters */
877 if (!bdrv_is_valid_name(node_name)) {
878 error_setg(errp, "Invalid node name");
636ea370 879 return;
6913c0c2
BC
880 }
881
0c5e94ee
BC
882 /* takes care of avoiding namespaces collisions */
883 if (bdrv_find(node_name)) {
884 error_setg(errp, "node-name=%s is conflicting with a device id",
885 node_name);
636ea370 886 return;
0c5e94ee
BC
887 }
888
6913c0c2
BC
889 /* takes care of avoiding duplicates node names */
890 if (bdrv_find_node(node_name)) {
891 error_setg(errp, "Duplicate node name");
636ea370 892 return;
6913c0c2
BC
893 }
894
895 /* copy node name into the bs and insert it into the graph list */
896 pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
897 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
6913c0c2
BC
898}
899
57915332
KW
900/*
901 * Common part for opening disk images and files
b6ad491a
KW
902 *
903 * Removes all processed options from *options.
57915332 904 */
f500a6d3 905static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
34b5d2c6 906 QDict *options, int flags, BlockDriver *drv, Error **errp)
57915332
KW
907{
908 int ret, open_flags;
035fccdf 909 const char *filename;
6913c0c2 910 const char *node_name = NULL;
34b5d2c6 911 Error *local_err = NULL;
57915332
KW
912
913 assert(drv != NULL);
6405875c 914 assert(bs->file == NULL);
707ff828 915 assert(options != NULL && bs->options != options);
57915332 916
45673671
KW
917 if (file != NULL) {
918 filename = file->filename;
919 } else {
920 filename = qdict_get_try_str(options, "filename");
921 }
922
765003db
KW
923 if (drv->bdrv_needs_filename && !filename) {
924 error_setg(errp, "The '%s' block driver requires a file name",
925 drv->format_name);
926 return -EINVAL;
927 }
928
45673671 929 trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);
28dcee10 930
6913c0c2 931 node_name = qdict_get_try_str(options, "node-name");
636ea370 932 bdrv_assign_node_name(bs, node_name, &local_err);
0fb6395c 933 if (local_err) {
636ea370
KW
934 error_propagate(errp, local_err);
935 return -EINVAL;
6913c0c2
BC
936 }
937 qdict_del(options, "node-name");
938
5d186eb0
KW
939 /* bdrv_open() with directly using a protocol as drv. This layer is already
940 * opened, so assign it to bs (while file becomes a closed BlockDriverState)
941 * and return immediately. */
942 if (file != NULL && drv->bdrv_file_open) {
943 bdrv_swap(file, bs);
944 return 0;
945 }
946
57915332 947 bs->open_flags = flags;
1b7fd729 948 bs->guest_block_size = 512;
c25f53b0 949 bs->request_alignment = 512;
0d51b4de 950 bs->zero_beyond_eof = true;
b64ec4e4
FZ
951 open_flags = bdrv_open_flags(bs, flags);
952 bs->read_only = !(open_flags & BDRV_O_RDWR);
20cca275 953 bs->growable = !!(flags & BDRV_O_PROTOCOL);
b64ec4e4
FZ
954
955 if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
8f94a6e4
KW
956 error_setg(errp,
957 !bs->read_only && bdrv_is_whitelisted(drv, true)
958 ? "Driver '%s' can only be used for read-only devices"
959 : "Driver '%s' is not whitelisted",
960 drv->format_name);
b64ec4e4
FZ
961 return -ENOTSUP;
962 }
57915332 963
53fec9d3 964 assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
0ebd24e0
KW
965 if (flags & BDRV_O_COPY_ON_READ) {
966 if (!bs->read_only) {
967 bdrv_enable_copy_on_read(bs);
968 } else {
969 error_setg(errp, "Can't use copy-on-read on read-only device");
970 return -EINVAL;
971 }
53fec9d3
SH
972 }
973
c2ad1b0c
KW
974 if (filename != NULL) {
975 pstrcpy(bs->filename, sizeof(bs->filename), filename);
976 } else {
977 bs->filename[0] = '\0';
978 }
91af7014 979 pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);
57915332 980
57915332 981 bs->drv = drv;
7267c094 982 bs->opaque = g_malloc0(drv->instance_size);
57915332 983
03f541bd 984 bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
e7c63796 985
66f82cee
KW
986 /* Open the image, either directly or using a protocol */
987 if (drv->bdrv_file_open) {
5d186eb0 988 assert(file == NULL);
030be321 989 assert(!drv->bdrv_needs_filename || filename != NULL);
34b5d2c6 990 ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
f500a6d3 991 } else {
2af5ef70 992 if (file == NULL) {
34b5d2c6
HR
993 error_setg(errp, "Can't use '%s' as a block driver for the "
994 "protocol level", drv->format_name);
2af5ef70
KW
995 ret = -EINVAL;
996 goto free_and_fail;
997 }
f500a6d3 998 bs->file = file;
34b5d2c6 999 ret = drv->bdrv_open(bs, options, open_flags, &local_err);
66f82cee
KW
1000 }
1001
57915332 1002 if (ret < 0) {
84d18f06 1003 if (local_err) {
34b5d2c6 1004 error_propagate(errp, local_err);
2fa9aa59
DH
1005 } else if (bs->filename[0]) {
1006 error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
34b5d2c6
HR
1007 } else {
1008 error_setg_errno(errp, -ret, "Could not open image");
1009 }
57915332
KW
1010 goto free_and_fail;
1011 }
1012
51762288
SH
1013 ret = refresh_total_sectors(bs, bs->total_sectors);
1014 if (ret < 0) {
34b5d2c6 1015 error_setg_errno(errp, -ret, "Could not refresh total sector count");
51762288 1016 goto free_and_fail;
57915332 1017 }
51762288 1018
3baca891
KW
1019 bdrv_refresh_limits(bs, &local_err);
1020 if (local_err) {
1021 error_propagate(errp, local_err);
1022 ret = -EINVAL;
1023 goto free_and_fail;
1024 }
1025
c25f53b0 1026 assert(bdrv_opt_mem_align(bs) != 0);
47ea2de2 1027 assert((bs->request_alignment != 0) || bs->sg);
57915332
KW
1028 return 0;
1029
1030free_and_fail:
f500a6d3 1031 bs->file = NULL;
7267c094 1032 g_free(bs->opaque);
57915332
KW
1033 bs->opaque = NULL;
1034 bs->drv = NULL;
1035 return ret;
1036}
1037
5e5c4f63
KW
1038static QDict *parse_json_filename(const char *filename, Error **errp)
1039{
1040 QObject *options_obj;
1041 QDict *options;
1042 int ret;
1043
1044 ret = strstart(filename, "json:", &filename);
1045 assert(ret);
1046
1047 options_obj = qobject_from_json(filename);
1048 if (!options_obj) {
1049 error_setg(errp, "Could not parse the JSON options");
1050 return NULL;
1051 }
1052
1053 if (qobject_type(options_obj) != QTYPE_QDICT) {
1054 qobject_decref(options_obj);
1055 error_setg(errp, "Invalid JSON object given");
1056 return NULL;
1057 }
1058
1059 options = qobject_to_qdict(options_obj);
1060 qdict_flatten(options);
1061
1062 return options;
1063}
1064
b6ce07aa 1065/*
f54120ff
KW
1066 * Fills in default options for opening images and converts the legacy
1067 * filename/flags pair to option QDict entries.
b6ce07aa 1068 */
5e5c4f63 1069static int bdrv_fill_options(QDict **options, const char **pfilename, int flags,
17b005f1 1070 BlockDriver *drv, Error **errp)
ea2384d3 1071{
5e5c4f63 1072 const char *filename = *pfilename;
c2ad1b0c 1073 const char *drvname;
462f5bcf 1074 bool protocol = flags & BDRV_O_PROTOCOL;
e3fa4bfa 1075 bool parse_filename = false;
34b5d2c6 1076 Error *local_err = NULL;
83f64091 1077
5e5c4f63
KW
1078 /* Parse json: pseudo-protocol */
1079 if (filename && g_str_has_prefix(filename, "json:")) {
1080 QDict *json_options = parse_json_filename(filename, &local_err);
1081 if (local_err) {
1082 error_propagate(errp, local_err);
1083 return -EINVAL;
1084 }
1085
1086 /* Options given in the filename have lower priority than options
1087 * specified directly */
1088 qdict_join(*options, json_options, false);
1089 QDECREF(json_options);
1090 *pfilename = filename = NULL;
1091 }
1092
035fccdf 1093 /* Fetch the file name from the options QDict if necessary */
17b005f1 1094 if (protocol && filename) {
f54120ff
KW
1095 if (!qdict_haskey(*options, "filename")) {
1096 qdict_put(*options, "filename", qstring_from_str(filename));
1097 parse_filename = true;
1098 } else {
1099 error_setg(errp, "Can't specify 'file' and 'filename' options at "
1100 "the same time");
1101 return -EINVAL;
1102 }
035fccdf
KW
1103 }
1104
c2ad1b0c 1105 /* Find the right block driver */
f54120ff 1106 filename = qdict_get_try_str(*options, "filename");
5acd9d81 1107 drvname = qdict_get_try_str(*options, "driver");
f54120ff 1108
17b005f1
KW
1109 if (drv) {
1110 if (drvname) {
1111 error_setg(errp, "Driver specified twice");
1112 return -EINVAL;
1113 }
1114 drvname = drv->format_name;
1115 qdict_put(*options, "driver", qstring_from_str(drvname));
1116 } else {
1117 if (!drvname && protocol) {
1118 if (filename) {
1119 drv = bdrv_find_protocol(filename, parse_filename);
1120 if (!drv) {
1121 error_setg(errp, "Unknown protocol");
1122 return -EINVAL;
1123 }
1124
1125 drvname = drv->format_name;
1126 qdict_put(*options, "driver", qstring_from_str(drvname));
1127 } else {
1128 error_setg(errp, "Must specify either driver or file");
f54120ff
KW
1129 return -EINVAL;
1130 }
17b005f1
KW
1131 } else if (drvname) {
1132 drv = bdrv_find_format(drvname);
1133 if (!drv) {
1134 error_setg(errp, "Unknown driver '%s'", drvname);
1135 return -ENOENT;
1136 }
98289620 1137 }
c2ad1b0c
KW
1138 }
1139
17b005f1 1140 assert(drv || !protocol);
c2ad1b0c 1141
f54120ff 1142 /* Driver-specific filename parsing */
17b005f1 1143 if (drv && drv->bdrv_parse_filename && parse_filename) {
5acd9d81 1144 drv->bdrv_parse_filename(filename, *options, &local_err);
84d18f06 1145 if (local_err) {
34b5d2c6 1146 error_propagate(errp, local_err);
f54120ff 1147 return -EINVAL;
6963a30d 1148 }
cd5d031e
HR
1149
1150 if (!drv->bdrv_needs_filename) {
1151 qdict_del(*options, "filename");
cd5d031e 1152 }
6963a30d
KW
1153 }
1154
f54120ff
KW
1155 return 0;
1156}
1157
8d24cce1
FZ
1158void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
1159{
1160
826b6ca0
FZ
1161 if (bs->backing_hd) {
1162 assert(bs->backing_blocker);
1163 bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
1164 } else if (backing_hd) {
1165 error_setg(&bs->backing_blocker,
1166 "device is used as backing hd of '%s'",
1167 bs->device_name);
1168 }
1169
8d24cce1
FZ
1170 bs->backing_hd = backing_hd;
1171 if (!backing_hd) {
826b6ca0
FZ
1172 error_free(bs->backing_blocker);
1173 bs->backing_blocker = NULL;
8d24cce1
FZ
1174 goto out;
1175 }
1176 bs->open_flags &= ~BDRV_O_NO_BACKING;
1177 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
1178 pstrcpy(bs->backing_format, sizeof(bs->backing_format),
1179 backing_hd->drv ? backing_hd->drv->format_name : "");
826b6ca0
FZ
1180
1181 bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
1182 /* Otherwise we won't be able to commit due to check in bdrv_commit */
1183 bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT,
1184 bs->backing_blocker);
8d24cce1 1185out:
3baca891 1186 bdrv_refresh_limits(bs, NULL);
8d24cce1
FZ
1187}
1188
31ca6d07
KW
1189/*
1190 * Opens the backing file for a BlockDriverState if not yet open
1191 *
1192 * options is a QDict of options to pass to the block drivers, or NULL for an
1193 * empty set of options. The reference to the QDict is transferred to this
1194 * function (even on failure), so if the caller intends to reuse the dictionary,
1195 * it needs to use QINCREF() before calling bdrv_file_open.
1196 */
34b5d2c6 1197int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
9156df12 1198{
1ba4b6a5 1199 char *backing_filename = g_malloc0(PATH_MAX);
317fc44e 1200 int ret = 0;
9156df12 1201 BlockDriver *back_drv = NULL;
8d24cce1 1202 BlockDriverState *backing_hd;
34b5d2c6 1203 Error *local_err = NULL;
9156df12
PB
1204
1205 if (bs->backing_hd != NULL) {
31ca6d07 1206 QDECREF(options);
1ba4b6a5 1207 goto free_exit;
9156df12
PB
1208 }
1209
31ca6d07
KW
1210 /* NULL means an empty set of options */
1211 if (options == NULL) {
1212 options = qdict_new();
1213 }
1214
9156df12 1215 bs->open_flags &= ~BDRV_O_NO_BACKING;
1cb6f506
KW
1216 if (qdict_haskey(options, "file.filename")) {
1217 backing_filename[0] = '\0';
1218 } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
31ca6d07 1219 QDECREF(options);
1ba4b6a5 1220 goto free_exit;
dbecebdd 1221 } else {
1ba4b6a5 1222 bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX);
9156df12
PB
1223 }
1224
8ee79e70
KW
1225 if (!bs->drv || !bs->drv->supports_backing) {
1226 ret = -EINVAL;
1227 error_setg(errp, "Driver doesn't support backing files");
1228 QDECREF(options);
1229 goto free_exit;
1230 }
1231
8d24cce1
FZ
1232 backing_hd = bdrv_new("", errp);
1233
9156df12
PB
1234 if (bs->backing_format[0] != '\0') {
1235 back_drv = bdrv_find_format(bs->backing_format);
1236 }
1237
f67503e5 1238 assert(bs->backing_hd == NULL);
8d24cce1 1239 ret = bdrv_open(&backing_hd,
ddf5636d 1240 *backing_filename ? backing_filename : NULL, NULL, options,
317fc44e 1241 bdrv_backing_flags(bs->open_flags), back_drv, &local_err);
9156df12 1242 if (ret < 0) {
8d24cce1
FZ
1243 bdrv_unref(backing_hd);
1244 backing_hd = NULL;
9156df12 1245 bs->open_flags |= BDRV_O_NO_BACKING;
b04b6b6e
FZ
1246 error_setg(errp, "Could not open backing file: %s",
1247 error_get_pretty(local_err));
1248 error_free(local_err);
1ba4b6a5 1249 goto free_exit;
9156df12 1250 }
8d24cce1 1251 bdrv_set_backing_hd(bs, backing_hd);
d80ac658 1252
1ba4b6a5
BC
1253free_exit:
1254 g_free(backing_filename);
1255 return ret;
9156df12
PB
1256}
1257
da557aac
HR
1258/*
1259 * Opens a disk image whose options are given as BlockdevRef in another block
1260 * device's options.
1261 *
da557aac
HR
1262 * If allow_none is true, no image will be opened if filename is false and no
1263 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
1264 *
1265 * bdrev_key specifies the key for the image's BlockdevRef in the options QDict.
1266 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
1267 * itself, all options starting with "${bdref_key}." are considered part of the
1268 * BlockdevRef.
1269 *
1270 * The BlockdevRef will be removed from the options QDict.
f67503e5
HR
1271 *
1272 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
da557aac
HR
1273 */
1274int bdrv_open_image(BlockDriverState **pbs, const char *filename,
1275 QDict *options, const char *bdref_key, int flags,
f7d9fd8c 1276 bool allow_none, Error **errp)
da557aac
HR
1277{
1278 QDict *image_options;
1279 int ret;
1280 char *bdref_key_dot;
1281 const char *reference;
1282
f67503e5
HR
1283 assert(pbs);
1284 assert(*pbs == NULL);
1285
da557aac
HR
1286 bdref_key_dot = g_strdup_printf("%s.", bdref_key);
1287 qdict_extract_subqdict(options, &image_options, bdref_key_dot);
1288 g_free(bdref_key_dot);
1289
1290 reference = qdict_get_try_str(options, bdref_key);
1291 if (!filename && !reference && !qdict_size(image_options)) {
1292 if (allow_none) {
1293 ret = 0;
1294 } else {
1295 error_setg(errp, "A block device must be specified for \"%s\"",
1296 bdref_key);
1297 ret = -EINVAL;
1298 }
b20e61e0 1299 QDECREF(image_options);
da557aac
HR
1300 goto done;
1301 }
1302
f7d9fd8c 1303 ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);
da557aac
HR
1304
1305done:
1306 qdict_del(options, bdref_key);
1307 return ret;
1308}
1309
6b8aeca5 1310int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
b998875d
KW
1311{
1312 /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
1ba4b6a5 1313 char *tmp_filename = g_malloc0(PATH_MAX + 1);
b998875d
KW
1314 int64_t total_size;
1315 BlockDriver *bdrv_qcow2;
83d0521a 1316 QemuOpts *opts = NULL;
b998875d
KW
1317 QDict *snapshot_options;
1318 BlockDriverState *bs_snapshot;
1319 Error *local_err;
1320 int ret;
1321
1322 /* if snapshot, we create a temporary backing file and open it
1323 instead of opening 'filename' directly */
1324
1325 /* Get the required size from the image */
f187743a
KW
1326 total_size = bdrv_getlength(bs);
1327 if (total_size < 0) {
6b8aeca5 1328 ret = total_size;
f187743a 1329 error_setg_errno(errp, -total_size, "Could not get image size");
1ba4b6a5 1330 goto out;
f187743a 1331 }
b998875d
KW
1332
1333 /* Create the temporary image */
1ba4b6a5 1334 ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
b998875d
KW
1335 if (ret < 0) {
1336 error_setg_errno(errp, -ret, "Could not get temporary filename");
1ba4b6a5 1337 goto out;
b998875d
KW
1338 }
1339
1340 bdrv_qcow2 = bdrv_find_format("qcow2");
c282e1fd
CL
1341 opts = qemu_opts_create(bdrv_qcow2->create_opts, NULL, 0,
1342 &error_abort);
83d0521a 1343 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size);
c282e1fd 1344 ret = bdrv_create(bdrv_qcow2, tmp_filename, opts, &local_err);
83d0521a 1345 qemu_opts_del(opts);
b998875d
KW
1346 if (ret < 0) {
1347 error_setg_errno(errp, -ret, "Could not create temporary overlay "
1348 "'%s': %s", tmp_filename,
1349 error_get_pretty(local_err));
1350 error_free(local_err);
1ba4b6a5 1351 goto out;
b998875d
KW
1352 }
1353
1354 /* Prepare a new options QDict for the temporary file */
1355 snapshot_options = qdict_new();
1356 qdict_put(snapshot_options, "file.driver",
1357 qstring_from_str("file"));
1358 qdict_put(snapshot_options, "file.filename",
1359 qstring_from_str(tmp_filename));
1360
98522f63 1361 bs_snapshot = bdrv_new("", &error_abort);
b998875d
KW
1362
1363 ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
b1e6fc08 1364 flags, bdrv_qcow2, &local_err);
b998875d
KW
1365 if (ret < 0) {
1366 error_propagate(errp, local_err);
1ba4b6a5 1367 goto out;
b998875d
KW
1368 }
1369
1370 bdrv_append(bs_snapshot, bs);
1ba4b6a5
BC
1371
1372out:
1373 g_free(tmp_filename);
6b8aeca5 1374 return ret;
b998875d
KW
1375}
1376
b6ce07aa
KW
1377/*
1378 * Opens a disk image (raw, qcow2, vmdk, ...)
de9c0cec
KW
1379 *
1380 * options is a QDict of options to pass to the block drivers, or NULL for an
1381 * empty set of options. The reference to the QDict belongs to the block layer
1382 * after the call (even on failure), so if the caller intends to reuse the
1383 * dictionary, it needs to use QINCREF() before calling bdrv_open.
f67503e5
HR
1384 *
1385 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
1386 * If it is not NULL, the referenced BDS will be reused.
ddf5636d
HR
1387 *
1388 * The reference parameter may be used to specify an existing block device which
1389 * should be opened. If specified, neither options nor a filename may be given,
1390 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
b6ce07aa 1391 */
ddf5636d
HR
1392int bdrv_open(BlockDriverState **pbs, const char *filename,
1393 const char *reference, QDict *options, int flags,
1394 BlockDriver *drv, Error **errp)
ea2384d3 1395{
b6ce07aa 1396 int ret;
f67503e5 1397 BlockDriverState *file = NULL, *bs;
74fe54f2 1398 const char *drvname;
34b5d2c6 1399 Error *local_err = NULL;
b1e6fc08 1400 int snapshot_flags = 0;
712e7874 1401
f67503e5
HR
1402 assert(pbs);
1403
ddf5636d
HR
1404 if (reference) {
1405 bool options_non_empty = options ? qdict_size(options) : false;
1406 QDECREF(options);
1407
1408 if (*pbs) {
1409 error_setg(errp, "Cannot reuse an existing BDS when referencing "
1410 "another block device");
1411 return -EINVAL;
1412 }
1413
1414 if (filename || options_non_empty) {
1415 error_setg(errp, "Cannot reference an existing block device with "
1416 "additional options or a new filename");
1417 return -EINVAL;
1418 }
1419
1420 bs = bdrv_lookup_bs(reference, reference, errp);
1421 if (!bs) {
1422 return -ENODEV;
1423 }
1424 bdrv_ref(bs);
1425 *pbs = bs;
1426 return 0;
1427 }
1428
f67503e5
HR
1429 if (*pbs) {
1430 bs = *pbs;
1431 } else {
98522f63 1432 bs = bdrv_new("", &error_abort);
f67503e5
HR
1433 }
1434
de9c0cec
KW
1435 /* NULL means an empty set of options */
1436 if (options == NULL) {
1437 options = qdict_new();
1438 }
1439
17b005f1 1440 ret = bdrv_fill_options(&options, &filename, flags, drv, &local_err);
462f5bcf
KW
1441 if (local_err) {
1442 goto fail;
1443 }
1444
76c591b0
KW
1445 /* Find the right image format driver */
1446 drv = NULL;
1447 drvname = qdict_get_try_str(options, "driver");
1448 if (drvname) {
1449 drv = bdrv_find_format(drvname);
1450 qdict_del(options, "driver");
1451 if (!drv) {
1452 error_setg(errp, "Unknown driver: '%s'", drvname);
1453 ret = -EINVAL;
1454 goto fail;
1455 }
1456 }
1457
1458 assert(drvname || !(flags & BDRV_O_PROTOCOL));
1459 if (drv && !drv->bdrv_file_open) {
1460 /* If the user explicitly wants a format driver here, we'll need to add
1461 * another layer for the protocol in bs->file */
1462 flags &= ~BDRV_O_PROTOCOL;
1463 }
1464
de9c0cec 1465 bs->options = options;
b6ad491a 1466 options = qdict_clone_shallow(options);
de9c0cec 1467
f500a6d3 1468 /* Open image file without format layer */
f4788adc
KW
1469 if ((flags & BDRV_O_PROTOCOL) == 0) {
1470 if (flags & BDRV_O_RDWR) {
1471 flags |= BDRV_O_ALLOW_RDWR;
1472 }
1473 if (flags & BDRV_O_SNAPSHOT) {
1474 snapshot_flags = bdrv_temp_snapshot_flags(flags);
1475 flags = bdrv_backing_flags(flags);
1476 }
f500a6d3 1477
f4788adc
KW
1478 assert(file == NULL);
1479 ret = bdrv_open_image(&file, filename, options, "file",
1480 bdrv_inherited_flags(flags),
1481 true, &local_err);
1482 if (ret < 0) {
1483 goto fail;
1484 }
f500a6d3
KW
1485 }
1486
76c591b0
KW
1487 /* Image format probing */
1488 if (!drv && file) {
17b005f1
KW
1489 ret = find_image_format(file, filename, &drv, &local_err);
1490 if (ret < 0) {
8bfea15d 1491 goto fail;
2a05cbe4 1492 }
76c591b0 1493 } else if (!drv) {
17b005f1
KW
1494 error_setg(errp, "Must specify either driver or file");
1495 ret = -EINVAL;
8bfea15d 1496 goto fail;
ea2384d3 1497 }
b6ce07aa
KW
1498
1499 /* Open the image */
34b5d2c6 1500 ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
b6ce07aa 1501 if (ret < 0) {
8bfea15d 1502 goto fail;
6987307c
CH
1503 }
1504
2a05cbe4 1505 if (file && (bs->file != file)) {
4f6fd349 1506 bdrv_unref(file);
f500a6d3
KW
1507 file = NULL;
1508 }
1509
b6ce07aa 1510 /* If there is a backing file, use it */
9156df12 1511 if ((flags & BDRV_O_NO_BACKING) == 0) {
31ca6d07
KW
1512 QDict *backing_options;
1513
5726d872 1514 qdict_extract_subqdict(options, &backing_options, "backing.");
34b5d2c6 1515 ret = bdrv_open_backing_file(bs, backing_options, &local_err);
b6ce07aa 1516 if (ret < 0) {
b6ad491a 1517 goto close_and_fail;
b6ce07aa 1518 }
b6ce07aa
KW
1519 }
1520
91af7014
HR
1521 bdrv_refresh_filename(bs);
1522
b998875d
KW
1523 /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
1524 * temporary snapshot afterwards. */
b1e6fc08 1525 if (snapshot_flags) {
6b8aeca5 1526 ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
b998875d 1527 if (local_err) {
b998875d
KW
1528 goto close_and_fail;
1529 }
1530 }
1531
b6ad491a 1532 /* Check if any unknown options were used */
5acd9d81 1533 if (options && (qdict_size(options) != 0)) {
b6ad491a 1534 const QDictEntry *entry = qdict_first(options);
5acd9d81
HR
1535 if (flags & BDRV_O_PROTOCOL) {
1536 error_setg(errp, "Block protocol '%s' doesn't support the option "
1537 "'%s'", drv->format_name, entry->key);
1538 } else {
1539 error_setg(errp, "Block format '%s' used by device '%s' doesn't "
1540 "support the option '%s'", drv->format_name,
1541 bs->device_name, entry->key);
1542 }
b6ad491a
KW
1543
1544 ret = -EINVAL;
1545 goto close_and_fail;
1546 }
b6ad491a 1547
b6ce07aa 1548 if (!bdrv_key_required(bs)) {
7d4b4ba5 1549 bdrv_dev_change_media_cb(bs, true);
c3adb58f
MA
1550 } else if (!runstate_check(RUN_STATE_PRELAUNCH)
1551 && !runstate_check(RUN_STATE_INMIGRATE)
1552 && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
1553 error_setg(errp,
1554 "Guest must be stopped for opening of encrypted image");
1555 ret = -EBUSY;
1556 goto close_and_fail;
b6ce07aa
KW
1557 }
1558
c3adb58f 1559 QDECREF(options);
f67503e5 1560 *pbs = bs;
b6ce07aa
KW
1561 return 0;
1562
8bfea15d 1563fail:
f500a6d3 1564 if (file != NULL) {
4f6fd349 1565 bdrv_unref(file);
f500a6d3 1566 }
de9c0cec 1567 QDECREF(bs->options);
b6ad491a 1568 QDECREF(options);
de9c0cec 1569 bs->options = NULL;
f67503e5
HR
1570 if (!*pbs) {
1571 /* If *pbs is NULL, a new BDS has been created in this function and
1572 needs to be freed now. Otherwise, it does not need to be closed,
1573 since it has not really been opened yet. */
1574 bdrv_unref(bs);
1575 }
84d18f06 1576 if (local_err) {
34b5d2c6
HR
1577 error_propagate(errp, local_err);
1578 }
b6ad491a 1579 return ret;
de9c0cec 1580
b6ad491a 1581close_and_fail:
f67503e5
HR
1582 /* See fail path, but now the BDS has to be always closed */
1583 if (*pbs) {
1584 bdrv_close(bs);
1585 } else {
1586 bdrv_unref(bs);
1587 }
b6ad491a 1588 QDECREF(options);
84d18f06 1589 if (local_err) {
34b5d2c6
HR
1590 error_propagate(errp, local_err);
1591 }
b6ce07aa
KW
1592 return ret;
1593}
1594
e971aa12
JC
1595typedef struct BlockReopenQueueEntry {
1596 bool prepared;
1597 BDRVReopenState state;
1598 QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
1599} BlockReopenQueueEntry;
1600
1601/*
1602 * Adds a BlockDriverState to a simple queue for an atomic, transactional
1603 * reopen of multiple devices.
1604 *
1605 * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLE_INIT
1606 * already performed, or alternatively may be NULL a new BlockReopenQueue will
1607 * be created and initialized. This newly created BlockReopenQueue should be
1608 * passed back in for subsequent calls that are intended to be of the same
1609 * atomic 'set'.
1610 *
1611 * bs is the BlockDriverState to add to the reopen queue.
1612 *
1613 * flags contains the open flags for the associated bs
1614 *
1615 * returns a pointer to bs_queue, which is either the newly allocated
1616 * bs_queue, or the existing bs_queue being used.
1617 *
1618 */
1619BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
1620 BlockDriverState *bs, int flags)
1621{
1622 assert(bs != NULL);
1623
1624 BlockReopenQueueEntry *bs_entry;
1625 if (bs_queue == NULL) {
1626 bs_queue = g_new0(BlockReopenQueue, 1);
1627 QSIMPLEQ_INIT(bs_queue);
1628 }
1629
f1f25a2e
KW
1630 /* bdrv_open() masks this flag out */
1631 flags &= ~BDRV_O_PROTOCOL;
1632
e971aa12 1633 if (bs->file) {
f1f25a2e 1634 bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
e971aa12
JC
1635 }
1636
1637 bs_entry = g_new0(BlockReopenQueueEntry, 1);
1638 QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
1639
1640 bs_entry->state.bs = bs;
1641 bs_entry->state.flags = flags;
1642
1643 return bs_queue;
1644}
1645
1646/*
1647 * Reopen multiple BlockDriverStates atomically & transactionally.
1648 *
1649 * The queue passed in (bs_queue) must have been built up previous
1650 * via bdrv_reopen_queue().
1651 *
1652 * Reopens all BDS specified in the queue, with the appropriate
1653 * flags. All devices are prepared for reopen, and failure of any
1654 * device will cause all device changes to be abandonded, and intermediate
1655 * data cleaned up.
1656 *
1657 * If all devices prepare successfully, then the changes are committed
1658 * to all devices.
1659 *
1660 */
1661int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
1662{
1663 int ret = -1;
1664 BlockReopenQueueEntry *bs_entry, *next;
1665 Error *local_err = NULL;
1666
1667 assert(bs_queue != NULL);
1668
1669 bdrv_drain_all();
1670
1671 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1672 if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
1673 error_propagate(errp, local_err);
1674 goto cleanup;
1675 }
1676 bs_entry->prepared = true;
1677 }
1678
1679 /* If we reach this point, we have success and just need to apply the
1680 * changes
1681 */
1682 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1683 bdrv_reopen_commit(&bs_entry->state);
1684 }
1685
1686 ret = 0;
1687
1688cleanup:
1689 QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
1690 if (ret && bs_entry->prepared) {
1691 bdrv_reopen_abort(&bs_entry->state);
1692 }
1693 g_free(bs_entry);
1694 }
1695 g_free(bs_queue);
1696 return ret;
1697}
1698
1699
1700/* Reopen a single BlockDriverState with the specified flags. */
1701int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1702{
1703 int ret = -1;
1704 Error *local_err = NULL;
1705 BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1706
1707 ret = bdrv_reopen_multiple(queue, &local_err);
1708 if (local_err != NULL) {
1709 error_propagate(errp, local_err);
1710 }
1711 return ret;
1712}
1713
1714
1715/*
1716 * Prepares a BlockDriverState for reopen. All changes are staged in the
1717 * 'opaque' field of the BDRVReopenState, which is used and allocated by
1718 * the block driver layer .bdrv_reopen_prepare()
1719 *
1720 * bs is the BlockDriverState to reopen
1721 * flags are the new open flags
1722 * queue is the reopen queue
1723 *
1724 * Returns 0 on success, non-zero on error. On error errp will be set
1725 * as well.
1726 *
1727 * On failure, bdrv_reopen_abort() will be called to clean up any data.
1728 * It is the responsibility of the caller to then call the abort() or
1729 * commit() for any other BDS that have been left in a prepare() state
1730 *
1731 */
1732int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1733 Error **errp)
1734{
1735 int ret = -1;
1736 Error *local_err = NULL;
1737 BlockDriver *drv;
1738
1739 assert(reopen_state != NULL);
1740 assert(reopen_state->bs->drv != NULL);
1741 drv = reopen_state->bs->drv;
1742
1743 /* if we are to stay read-only, do not allow permission change
1744 * to r/w */
1745 if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1746 reopen_state->flags & BDRV_O_RDWR) {
1747 error_set(errp, QERR_DEVICE_IS_READ_ONLY,
1748 reopen_state->bs->device_name);
1749 goto error;
1750 }
1751
1752
1753 ret = bdrv_flush(reopen_state->bs);
1754 if (ret) {
1755 error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1756 strerror(-ret));
1757 goto error;
1758 }
1759
1760 if (drv->bdrv_reopen_prepare) {
1761 ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1762 if (ret) {
1763 if (local_err != NULL) {
1764 error_propagate(errp, local_err);
1765 } else {
d8b6895f
LC
1766 error_setg(errp, "failed while preparing to reopen image '%s'",
1767 reopen_state->bs->filename);
e971aa12
JC
1768 }
1769 goto error;
1770 }
1771 } else {
1772 /* It is currently mandatory to have a bdrv_reopen_prepare()
1773 * handler for each supported drv. */
1774 error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
1775 drv->format_name, reopen_state->bs->device_name,
1776 "reopening of file");
1777 ret = -1;
1778 goto error;
1779 }
1780
1781 ret = 0;
1782
1783error:
1784 return ret;
1785}
1786
1787/*
1788 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1789 * makes them final by swapping the staging BlockDriverState contents into
1790 * the active BlockDriverState contents.
1791 */
1792void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1793{
1794 BlockDriver *drv;
1795
1796 assert(reopen_state != NULL);
1797 drv = reopen_state->bs->drv;
1798 assert(drv != NULL);
1799
1800 /* If there are any driver level actions to take */
1801 if (drv->bdrv_reopen_commit) {
1802 drv->bdrv_reopen_commit(reopen_state);
1803 }
1804
1805 /* set BDS specific flags now */
1806 reopen_state->bs->open_flags = reopen_state->flags;
1807 reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1808 BDRV_O_CACHE_WB);
1809 reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
355ef4ac 1810
3baca891 1811 bdrv_refresh_limits(reopen_state->bs, NULL);
e971aa12
JC
1812}
1813
1814/*
1815 * Abort the reopen, and delete and free the staged changes in
1816 * reopen_state
1817 */
1818void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1819{
1820 BlockDriver *drv;
1821
1822 assert(reopen_state != NULL);
1823 drv = reopen_state->bs->drv;
1824 assert(drv != NULL);
1825
1826 if (drv->bdrv_reopen_abort) {
1827 drv->bdrv_reopen_abort(reopen_state);
1828 }
1829}
1830
1831
fc01f7e7
FB
1832void bdrv_close(BlockDriverState *bs)
1833{
33384421
HR
1834 BdrvAioNotifier *ban, *ban_next;
1835
3cbc002c
PB
1836 if (bs->job) {
1837 block_job_cancel_sync(bs->job);
1838 }
58fda173
SH
1839 bdrv_drain_all(); /* complete I/O */
1840 bdrv_flush(bs);
1841 bdrv_drain_all(); /* in case flush left pending I/O */
d7d512f6 1842 notifier_list_notify(&bs->close_notifiers, bs);
7094f12f 1843
3cbc002c 1844 if (bs->drv) {
557df6ac 1845 if (bs->backing_hd) {
826b6ca0
FZ
1846 BlockDriverState *backing_hd = bs->backing_hd;
1847 bdrv_set_backing_hd(bs, NULL);
1848 bdrv_unref(backing_hd);
557df6ac 1849 }
ea2384d3 1850 bs->drv->bdrv_close(bs);
7267c094 1851 g_free(bs->opaque);
ea2384d3
FB
1852 bs->opaque = NULL;
1853 bs->drv = NULL;
53fec9d3 1854 bs->copy_on_read = 0;
a275fa42
PB
1855 bs->backing_file[0] = '\0';
1856 bs->backing_format[0] = '\0';
6405875c
PB
1857 bs->total_sectors = 0;
1858 bs->encrypted = 0;
1859 bs->valid_key = 0;
1860 bs->sg = 0;
1861 bs->growable = 0;
0d51b4de 1862 bs->zero_beyond_eof = false;
de9c0cec
KW
1863 QDECREF(bs->options);
1864 bs->options = NULL;
91af7014
HR
1865 QDECREF(bs->full_open_options);
1866 bs->full_open_options = NULL;
b338082b 1867
66f82cee 1868 if (bs->file != NULL) {
4f6fd349 1869 bdrv_unref(bs->file);
0ac9377d 1870 bs->file = NULL;
66f82cee 1871 }
b338082b 1872 }
98f90dba 1873
9ca11154
PH
1874 bdrv_dev_change_media_cb(bs, false);
1875
98f90dba
ZYW
1876 /*throttling disk I/O limits*/
1877 if (bs->io_limits_enabled) {
1878 bdrv_io_limits_disable(bs);
1879 }
33384421
HR
1880
1881 QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
1882 g_free(ban);
1883 }
1884 QLIST_INIT(&bs->aio_notifiers);
b338082b
FB
1885}
1886
2bc93fed
MK
1887void bdrv_close_all(void)
1888{
1889 BlockDriverState *bs;
1890
dc364f4c 1891 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
ed78cda3
SH
1892 AioContext *aio_context = bdrv_get_aio_context(bs);
1893
1894 aio_context_acquire(aio_context);
2bc93fed 1895 bdrv_close(bs);
ed78cda3 1896 aio_context_release(aio_context);
2bc93fed
MK
1897 }
1898}
1899
88266f5a
SH
1900/* Check if any requests are in-flight (including throttled requests) */
1901static bool bdrv_requests_pending(BlockDriverState *bs)
1902{
1903 if (!QLIST_EMPTY(&bs->tracked_requests)) {
1904 return true;
1905 }
cc0681c4
BC
1906 if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1907 return true;
1908 }
1909 if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
88266f5a
SH
1910 return true;
1911 }
1912 if (bs->file && bdrv_requests_pending(bs->file)) {
1913 return true;
1914 }
1915 if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1916 return true;
1917 }
1918 return false;
1919}
1920
922453bc
SH
1921/*
1922 * Wait for pending requests to complete across all BlockDriverStates
1923 *
1924 * This function does not flush data to disk, use bdrv_flush_all() for that
1925 * after calling this function.
4c355d53
ZYW
1926 *
1927 * Note that completion of an asynchronous I/O operation can trigger any
1928 * number of other I/O operations on other devices---for example a coroutine
1929 * can be arbitrarily complex and a constant flow of I/O can come until the
1930 * coroutine is complete. Because of this, it is not possible to have a
1931 * function to drain a single device's I/O queue.
922453bc
SH
1932 */
1933void bdrv_drain_all(void)
1934{
88266f5a
SH
1935 /* Always run first iteration so any pending completion BHs run */
1936 bool busy = true;
922453bc
SH
1937 BlockDriverState *bs;
1938
88266f5a 1939 while (busy) {
9b536adc
SH
1940 busy = false;
1941
dc364f4c 1942 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
9b536adc
SH
1943 AioContext *aio_context = bdrv_get_aio_context(bs);
1944 bool bs_busy;
1945
1946 aio_context_acquire(aio_context);
448ad91d 1947 bdrv_flush_io_queue(bs);
0b06ef3b 1948 bdrv_start_throttled_reqs(bs);
9b536adc
SH
1949 bs_busy = bdrv_requests_pending(bs);
1950 bs_busy |= aio_poll(aio_context, bs_busy);
1951 aio_context_release(aio_context);
922453bc 1952
9b536adc
SH
1953 busy |= bs_busy;
1954 }
922453bc
SH
1955 }
1956}
1957
dc364f4c
BC
1958/* make a BlockDriverState anonymous by removing from bdrv_state and
1959 * graph_bdrv_state list.
d22b2f41
RH
1960 Also, NULL terminate the device_name to prevent double remove */
1961void bdrv_make_anon(BlockDriverState *bs)
1962{
1963 if (bs->device_name[0] != '\0') {
dc364f4c 1964 QTAILQ_REMOVE(&bdrv_states, bs, device_list);
d22b2f41
RH
1965 }
1966 bs->device_name[0] = '\0';
dc364f4c
BC
1967 if (bs->node_name[0] != '\0') {
1968 QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
1969 }
1970 bs->node_name[0] = '\0';
d22b2f41
RH
1971}
1972
e023b2e2
PB
1973static void bdrv_rebind(BlockDriverState *bs)
1974{
1975 if (bs->drv && bs->drv->bdrv_rebind) {
1976 bs->drv->bdrv_rebind(bs);
1977 }
1978}
1979
4ddc07ca
PB
1980static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
1981 BlockDriverState *bs_src)
8802d1fd 1982{
4ddc07ca 1983 /* move some fields that need to stay attached to the device */
8802d1fd
JC
1984
1985 /* dev info */
4ddc07ca
PB
1986 bs_dest->dev_ops = bs_src->dev_ops;
1987 bs_dest->dev_opaque = bs_src->dev_opaque;
1988 bs_dest->dev = bs_src->dev;
1b7fd729 1989 bs_dest->guest_block_size = bs_src->guest_block_size;
4ddc07ca 1990 bs_dest->copy_on_read = bs_src->copy_on_read;
8802d1fd 1991
4ddc07ca 1992 bs_dest->enable_write_cache = bs_src->enable_write_cache;
c4a248a1 1993
cc0681c4
BC
1994 /* i/o throttled req */
1995 memcpy(&bs_dest->throttle_state,
1996 &bs_src->throttle_state,
1997 sizeof(ThrottleState));
1998 bs_dest->throttled_reqs[0] = bs_src->throttled_reqs[0];
1999 bs_dest->throttled_reqs[1] = bs_src->throttled_reqs[1];
4ddc07ca 2000 bs_dest->io_limits_enabled = bs_src->io_limits_enabled;
8802d1fd 2001
8802d1fd 2002 /* r/w error */
4ddc07ca
PB
2003 bs_dest->on_read_error = bs_src->on_read_error;
2004 bs_dest->on_write_error = bs_src->on_write_error;
8802d1fd
JC
2005
2006 /* i/o status */
4ddc07ca
PB
2007 bs_dest->iostatus_enabled = bs_src->iostatus_enabled;
2008 bs_dest->iostatus = bs_src->iostatus;
8802d1fd 2009
a9fc4408 2010 /* dirty bitmap */
e4654d2d 2011 bs_dest->dirty_bitmaps = bs_src->dirty_bitmaps;
a9fc4408 2012
9fcb0251
FZ
2013 /* reference count */
2014 bs_dest->refcnt = bs_src->refcnt;
2015
a9fc4408 2016 /* job */
4ddc07ca 2017 bs_dest->job = bs_src->job;
a9fc4408 2018
8802d1fd 2019 /* keep the same entry in bdrv_states */
4ddc07ca
PB
2020 pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
2021 bs_src->device_name);
dc364f4c 2022 bs_dest->device_list = bs_src->device_list;
fbe40ff7
FZ
2023 memcpy(bs_dest->op_blockers, bs_src->op_blockers,
2024 sizeof(bs_dest->op_blockers));
4ddc07ca 2025}
8802d1fd 2026
4ddc07ca
PB
2027/*
2028 * Swap bs contents for two image chains while they are live,
2029 * while keeping required fields on the BlockDriverState that is
2030 * actually attached to a device.
2031 *
2032 * This will modify the BlockDriverState fields, and swap contents
2033 * between bs_new and bs_old. Both bs_new and bs_old are modified.
2034 *
2035 * bs_new is required to be anonymous.
2036 *
2037 * This function does not create any image files.
2038 */
2039void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
2040{
2041 BlockDriverState tmp;
f6801b83 2042
90ce8a06
BC
2043 /* The code needs to swap the node_name but simply swapping node_list won't
2044 * work so first remove the nodes from the graph list, do the swap then
2045 * insert them back if needed.
2046 */
2047 if (bs_new->node_name[0] != '\0') {
2048 QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
2049 }
2050 if (bs_old->node_name[0] != '\0') {
2051 QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
2052 }
2053
4ddc07ca
PB
2054 /* bs_new must be anonymous and shouldn't have anything fancy enabled */
2055 assert(bs_new->device_name[0] == '\0');
e4654d2d 2056 assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
4ddc07ca
PB
2057 assert(bs_new->job == NULL);
2058 assert(bs_new->dev == NULL);
4ddc07ca 2059 assert(bs_new->io_limits_enabled == false);
cc0681c4 2060 assert(!throttle_have_timer(&bs_new->throttle_state));
8802d1fd 2061
4ddc07ca
PB
2062 tmp = *bs_new;
2063 *bs_new = *bs_old;
2064 *bs_old = tmp;
a9fc4408 2065
4ddc07ca
PB
2066 /* there are some fields that should not be swapped, move them back */
2067 bdrv_move_feature_fields(&tmp, bs_old);
2068 bdrv_move_feature_fields(bs_old, bs_new);
2069 bdrv_move_feature_fields(bs_new, &tmp);
8802d1fd 2070
4ddc07ca
PB
2071 /* bs_new shouldn't be in bdrv_states even after the swap! */
2072 assert(bs_new->device_name[0] == '\0');
2073
2074 /* Check a few fields that should remain attached to the device */
2075 assert(bs_new->dev == NULL);
2076 assert(bs_new->job == NULL);
4ddc07ca 2077 assert(bs_new->io_limits_enabled == false);
cc0681c4 2078 assert(!throttle_have_timer(&bs_new->throttle_state));
e023b2e2 2079
90ce8a06
BC
2080 /* insert the nodes back into the graph node list if needed */
2081 if (bs_new->node_name[0] != '\0') {
2082 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
2083 }
2084 if (bs_old->node_name[0] != '\0') {
2085 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
2086 }
2087
e023b2e2 2088 bdrv_rebind(bs_new);
4ddc07ca
PB
2089 bdrv_rebind(bs_old);
2090}
2091
2092/*
2093 * Add new bs contents at the top of an image chain while the chain is
2094 * live, while keeping required fields on the top layer.
2095 *
2096 * This will modify the BlockDriverState fields, and swap contents
2097 * between bs_new and bs_top. Both bs_new and bs_top are modified.
2098 *
2099 * bs_new is required to be anonymous.
2100 *
2101 * This function does not create any image files.
2102 */
2103void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
2104{
2105 bdrv_swap(bs_new, bs_top);
2106
2107 /* The contents of 'tmp' will become bs_top, as we are
2108 * swapping bs_new and bs_top contents. */
8d24cce1 2109 bdrv_set_backing_hd(bs_top, bs_new);
8802d1fd
JC
2110}
2111
4f6fd349 2112static void bdrv_delete(BlockDriverState *bs)
b338082b 2113{
fa879d62 2114 assert(!bs->dev);
3e914655 2115 assert(!bs->job);
3718d8ab 2116 assert(bdrv_op_blocker_is_empty(bs));
4f6fd349 2117 assert(!bs->refcnt);
e4654d2d 2118 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
18846dee 2119
e1b5c52e
SH
2120 bdrv_close(bs);
2121
1b7bdbc1 2122 /* remove from list, if necessary */
d22b2f41 2123 bdrv_make_anon(bs);
34c6f050 2124
3ae59580 2125 drive_info_del(drive_get_by_blockdev(bs));
7267c094 2126 g_free(bs);
fc01f7e7
FB
2127}
2128
fa879d62
MA
2129int bdrv_attach_dev(BlockDriverState *bs, void *dev)
2130/* TODO change to DeviceState *dev when all users are qdevified */
18846dee 2131{
fa879d62 2132 if (bs->dev) {
18846dee
MA
2133 return -EBUSY;
2134 }
fa879d62 2135 bs->dev = dev;
28a7282a 2136 bdrv_iostatus_reset(bs);
2a87151f
SH
2137
2138 /* We're expecting I/O from the device so bump up coroutine pool size */
2139 qemu_coroutine_adjust_pool_size(COROUTINE_POOL_RESERVATION);
18846dee
MA
2140 return 0;
2141}
2142
fa879d62
MA
2143/* TODO qdevified devices don't use this, remove when devices are qdevified */
2144void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
18846dee 2145{
fa879d62
MA
2146 if (bdrv_attach_dev(bs, dev) < 0) {
2147 abort();
2148 }
2149}
2150
2151void bdrv_detach_dev(BlockDriverState *bs, void *dev)
2152/* TODO change to DeviceState *dev when all users are qdevified */
2153{
2154 assert(bs->dev == dev);
2155 bs->dev = NULL;
0e49de52
MA
2156 bs->dev_ops = NULL;
2157 bs->dev_opaque = NULL;
1b7fd729 2158 bs->guest_block_size = 512;
2a87151f 2159 qemu_coroutine_adjust_pool_size(-COROUTINE_POOL_RESERVATION);
18846dee
MA
2160}
2161
fa879d62
MA
2162/* TODO change to return DeviceState * when all users are qdevified */
2163void *bdrv_get_attached_dev(BlockDriverState *bs)
18846dee 2164{
fa879d62 2165 return bs->dev;
18846dee
MA
2166}
2167
0e49de52
MA
2168void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
2169 void *opaque)
2170{
2171 bs->dev_ops = ops;
2172 bs->dev_opaque = opaque;
2173}
2174
7d4b4ba5 2175static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
0e49de52 2176{
145feb17 2177 if (bs->dev_ops && bs->dev_ops->change_media_cb) {
6f382ed2 2178 bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
7d4b4ba5 2179 bs->dev_ops->change_media_cb(bs->dev_opaque, load);
6f382ed2
LC
2180 if (tray_was_closed) {
2181 /* tray open */
a5ee7bd4
WX
2182 qapi_event_send_device_tray_moved(bdrv_get_device_name(bs),
2183 true, &error_abort);
6f382ed2
LC
2184 }
2185 if (load) {
2186 /* tray close */
a5ee7bd4
WX
2187 qapi_event_send_device_tray_moved(bdrv_get_device_name(bs),
2188 false, &error_abort);
6f382ed2 2189 }
145feb17
MA
2190 }
2191}
2192
2c6942fa
MA
2193bool bdrv_dev_has_removable_media(BlockDriverState *bs)
2194{
2195 return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
2196}
2197
025ccaa7
PB
2198void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
2199{
2200 if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
2201 bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
2202 }
2203}
2204
e4def80b
MA
2205bool bdrv_dev_is_tray_open(BlockDriverState *bs)
2206{
2207 if (bs->dev_ops && bs->dev_ops->is_tray_open) {
2208 return bs->dev_ops->is_tray_open(bs->dev_opaque);
2209 }
2210 return false;
2211}
2212
145feb17
MA
2213static void bdrv_dev_resize_cb(BlockDriverState *bs)
2214{
2215 if (bs->dev_ops && bs->dev_ops->resize_cb) {
2216 bs->dev_ops->resize_cb(bs->dev_opaque);
0e49de52
MA
2217 }
2218}
2219
f107639a
MA
2220bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
2221{
2222 if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
2223 return bs->dev_ops->is_medium_locked(bs->dev_opaque);
2224 }
2225 return false;
2226}
2227
e97fc193
AL
2228/*
2229 * Run consistency checks on an image
2230 *
e076f338 2231 * Returns 0 if the check could be completed (it doesn't mean that the image is
a1c7273b 2232 * free of errors) or -errno when an internal error occurred. The results of the
e076f338 2233 * check are stored in res.
e97fc193 2234 */
4534ff54 2235int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
e97fc193 2236{
908bcd54
HR
2237 if (bs->drv == NULL) {
2238 return -ENOMEDIUM;
2239 }
e97fc193
AL
2240 if (bs->drv->bdrv_check == NULL) {
2241 return -ENOTSUP;
2242 }
2243
e076f338 2244 memset(res, 0, sizeof(*res));
4534ff54 2245 return bs->drv->bdrv_check(bs, res, fix);
e97fc193
AL
2246}
2247
8a426614
KW
2248#define COMMIT_BUF_SECTORS 2048
2249
33e3963e
FB
2250/* commit COW file into the raw image */
2251int bdrv_commit(BlockDriverState *bs)
2252{
19cb3738 2253 BlockDriver *drv = bs->drv;
72706ea4 2254 int64_t sector, total_sectors, length, backing_length;
8a426614 2255 int n, ro, open_flags;
0bce597d 2256 int ret = 0;
72706ea4 2257 uint8_t *buf = NULL;
c2cba3d9 2258 char filename[PATH_MAX];
33e3963e 2259
19cb3738
FB
2260 if (!drv)
2261 return -ENOMEDIUM;
6bb45158 2262
4dca4b63
NS
2263 if (!bs->backing_hd) {
2264 return -ENOTSUP;
33e3963e
FB
2265 }
2266
3718d8ab
FZ
2267 if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT, NULL) ||
2268 bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT, NULL)) {
2d3735d3
SH
2269 return -EBUSY;
2270 }
2271
4dca4b63 2272 ro = bs->backing_hd->read_only;
c2cba3d9
JM
2273 /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2274 pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
4dca4b63
NS
2275 open_flags = bs->backing_hd->open_flags;
2276
2277 if (ro) {
0bce597d
JC
2278 if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2279 return -EACCES;
4dca4b63 2280 }
ea2384d3 2281 }
33e3963e 2282
72706ea4
JC
2283 length = bdrv_getlength(bs);
2284 if (length < 0) {
2285 ret = length;
2286 goto ro_cleanup;
2287 }
2288
2289 backing_length = bdrv_getlength(bs->backing_hd);
2290 if (backing_length < 0) {
2291 ret = backing_length;
2292 goto ro_cleanup;
2293 }
2294
2295 /* If our top snapshot is larger than the backing file image,
2296 * grow the backing file image if possible. If not possible,
2297 * we must return an error */
2298 if (length > backing_length) {
2299 ret = bdrv_truncate(bs->backing_hd, length);
2300 if (ret < 0) {
2301 goto ro_cleanup;
2302 }
2303 }
2304
2305 total_sectors = length >> BDRV_SECTOR_BITS;
857d4f46
KW
2306
2307 /* qemu_try_blockalign() for bs will choose an alignment that works for
2308 * bs->backing_hd as well, so no need to compare the alignment manually. */
2309 buf = qemu_try_blockalign(bs, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2310 if (buf == NULL) {
2311 ret = -ENOMEM;
2312 goto ro_cleanup;
2313 }
8a426614
KW
2314
2315 for (sector = 0; sector < total_sectors; sector += n) {
d663640c
PB
2316 ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2317 if (ret < 0) {
2318 goto ro_cleanup;
2319 }
2320 if (ret) {
dabfa6cc
KW
2321 ret = bdrv_read(bs, sector, buf, n);
2322 if (ret < 0) {
8a426614
KW
2323 goto ro_cleanup;
2324 }
2325
dabfa6cc
KW
2326 ret = bdrv_write(bs->backing_hd, sector, buf, n);
2327 if (ret < 0) {
8a426614
KW
2328 goto ro_cleanup;
2329 }
ea2384d3 2330 }
33e3963e 2331 }
95389c86 2332
1d44952f
CH
2333 if (drv->bdrv_make_empty) {
2334 ret = drv->bdrv_make_empty(bs);
dabfa6cc
KW
2335 if (ret < 0) {
2336 goto ro_cleanup;
2337 }
1d44952f
CH
2338 bdrv_flush(bs);
2339 }
95389c86 2340
3f5075ae
CH
2341 /*
2342 * Make sure all data we wrote to the backing device is actually
2343 * stable on disk.
2344 */
dabfa6cc 2345 if (bs->backing_hd) {
3f5075ae 2346 bdrv_flush(bs->backing_hd);
dabfa6cc 2347 }
4dca4b63 2348
dabfa6cc 2349 ret = 0;
4dca4b63 2350ro_cleanup:
857d4f46 2351 qemu_vfree(buf);
4dca4b63
NS
2352
2353 if (ro) {
0bce597d
JC
2354 /* ignoring error return here */
2355 bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
4dca4b63
NS
2356 }
2357
1d44952f 2358 return ret;
33e3963e
FB
2359}
2360
e8877497 2361int bdrv_commit_all(void)
6ab4b5ab
MA
2362{
2363 BlockDriverState *bs;
2364
dc364f4c 2365 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
ed78cda3
SH
2366 AioContext *aio_context = bdrv_get_aio_context(bs);
2367
2368 aio_context_acquire(aio_context);
272d2d8e
JC
2369 if (bs->drv && bs->backing_hd) {
2370 int ret = bdrv_commit(bs);
2371 if (ret < 0) {
ed78cda3 2372 aio_context_release(aio_context);
272d2d8e
JC
2373 return ret;
2374 }
e8877497 2375 }
ed78cda3 2376 aio_context_release(aio_context);
6ab4b5ab 2377 }
e8877497 2378 return 0;
6ab4b5ab
MA
2379}
2380
dbffbdcf
SH
2381/**
2382 * Remove an active request from the tracked requests list
2383 *
2384 * This function should be called when a tracked request is completing.
2385 */
2386static void tracked_request_end(BdrvTrackedRequest *req)
2387{
2dbafdc0
KW
2388 if (req->serialising) {
2389 req->bs->serialising_in_flight--;
2390 }
2391
dbffbdcf 2392 QLIST_REMOVE(req, list);
f4658285 2393 qemu_co_queue_restart_all(&req->wait_queue);
dbffbdcf
SH
2394}
2395
2396/**
2397 * Add an active request to the tracked requests list
2398 */
2399static void tracked_request_begin(BdrvTrackedRequest *req,
2400 BlockDriverState *bs,
793ed47a
KW
2401 int64_t offset,
2402 unsigned int bytes, bool is_write)
dbffbdcf
SH
2403{
2404 *req = (BdrvTrackedRequest){
2405 .bs = bs,
2dbafdc0
KW
2406 .offset = offset,
2407 .bytes = bytes,
2408 .is_write = is_write,
2409 .co = qemu_coroutine_self(),
2410 .serialising = false,
7327145f
KW
2411 .overlap_offset = offset,
2412 .overlap_bytes = bytes,
dbffbdcf
SH
2413 };
2414
f4658285
SH
2415 qemu_co_queue_init(&req->wait_queue);
2416
dbffbdcf
SH
2417 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2418}
2419
e96126ff 2420static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2dbafdc0 2421{
7327145f 2422 int64_t overlap_offset = req->offset & ~(align - 1);
e96126ff
KW
2423 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2424 - overlap_offset;
7327145f 2425
2dbafdc0
KW
2426 if (!req->serialising) {
2427 req->bs->serialising_in_flight++;
2428 req->serialising = true;
2429 }
7327145f
KW
2430
2431 req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2432 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
2dbafdc0
KW
2433}
2434
d83947ac
SH
2435/**
2436 * Round a region to cluster boundaries
2437 */
343bded4
PB
2438void bdrv_round_to_clusters(BlockDriverState *bs,
2439 int64_t sector_num, int nb_sectors,
2440 int64_t *cluster_sector_num,
2441 int *cluster_nb_sectors)
d83947ac
SH
2442{
2443 BlockDriverInfo bdi;
2444
2445 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2446 *cluster_sector_num = sector_num;
2447 *cluster_nb_sectors = nb_sectors;
2448 } else {
2449 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2450 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2451 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2452 nb_sectors, c);
2453 }
2454}
2455
7327145f 2456static int bdrv_get_cluster_size(BlockDriverState *bs)
793ed47a
KW
2457{
2458 BlockDriverInfo bdi;
7327145f 2459 int ret;
793ed47a 2460
7327145f
KW
2461 ret = bdrv_get_info(bs, &bdi);
2462 if (ret < 0 || bdi.cluster_size == 0) {
2463 return bs->request_alignment;
793ed47a 2464 } else {
7327145f 2465 return bdi.cluster_size;
793ed47a
KW
2466 }
2467}
2468
f4658285 2469static bool tracked_request_overlaps(BdrvTrackedRequest *req,
793ed47a
KW
2470 int64_t offset, unsigned int bytes)
2471{
d83947ac 2472 /* aaaa bbbb */
7327145f 2473 if (offset >= req->overlap_offset + req->overlap_bytes) {
d83947ac
SH
2474 return false;
2475 }
2476 /* bbbb aaaa */
7327145f 2477 if (req->overlap_offset >= offset + bytes) {
d83947ac
SH
2478 return false;
2479 }
2480 return true;
f4658285
SH
2481}
2482
28de2dcd 2483static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
f4658285 2484{
2dbafdc0 2485 BlockDriverState *bs = self->bs;
f4658285
SH
2486 BdrvTrackedRequest *req;
2487 bool retry;
28de2dcd 2488 bool waited = false;
f4658285 2489
2dbafdc0 2490 if (!bs->serialising_in_flight) {
28de2dcd 2491 return false;
2dbafdc0
KW
2492 }
2493
f4658285
SH
2494 do {
2495 retry = false;
2496 QLIST_FOREACH(req, &bs->tracked_requests, list) {
2dbafdc0 2497 if (req == self || (!req->serialising && !self->serialising)) {
65afd211
KW
2498 continue;
2499 }
7327145f
KW
2500 if (tracked_request_overlaps(req, self->overlap_offset,
2501 self->overlap_bytes))
2502 {
5f8b6491
SH
2503 /* Hitting this means there was a reentrant request, for
2504 * example, a block driver issuing nested requests. This must
2505 * never happen since it means deadlock.
2506 */
2507 assert(qemu_coroutine_self() != req->co);
2508
6460440f
KW
2509 /* If the request is already (indirectly) waiting for us, or
2510 * will wait for us as soon as it wakes up, then just go on
2511 * (instead of producing a deadlock in the former case). */
2512 if (!req->waiting_for) {
2513 self->waiting_for = req;
2514 qemu_co_queue_wait(&req->wait_queue);
2515 self->waiting_for = NULL;
2516 retry = true;
28de2dcd 2517 waited = true;
6460440f
KW
2518 break;
2519 }
f4658285
SH
2520 }
2521 }
2522 } while (retry);
28de2dcd
KW
2523
2524 return waited;
f4658285
SH
2525}
2526
756e6736
KW
2527/*
2528 * Return values:
2529 * 0 - success
2530 * -EINVAL - backing format specified, but no file
2531 * -ENOSPC - can't update the backing file because no space is left in the
2532 * image file header
2533 * -ENOTSUP - format driver doesn't support changing the backing file
2534 */
2535int bdrv_change_backing_file(BlockDriverState *bs,
2536 const char *backing_file, const char *backing_fmt)
2537{
2538 BlockDriver *drv = bs->drv;
469ef350 2539 int ret;
756e6736 2540
5f377794
PB
2541 /* Backing file format doesn't make sense without a backing file */
2542 if (backing_fmt && !backing_file) {
2543 return -EINVAL;
2544 }
2545
756e6736 2546 if (drv->bdrv_change_backing_file != NULL) {
469ef350 2547 ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
756e6736 2548 } else {
469ef350 2549 ret = -ENOTSUP;
756e6736 2550 }
469ef350
PB
2551
2552 if (ret == 0) {
2553 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2554 pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2555 }
2556 return ret;
756e6736
KW
2557}
2558
6ebdcee2
JC
2559/*
2560 * Finds the image layer in the chain that has 'bs' as its backing file.
2561 *
2562 * active is the current topmost image.
2563 *
2564 * Returns NULL if bs is not found in active's image chain,
2565 * or if active == bs.
4caf0fcd
JC
2566 *
2567 * Returns the bottommost base image if bs == NULL.
6ebdcee2
JC
2568 */
2569BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2570 BlockDriverState *bs)
2571{
4caf0fcd
JC
2572 while (active && bs != active->backing_hd) {
2573 active = active->backing_hd;
6ebdcee2
JC
2574 }
2575
4caf0fcd
JC
2576 return active;
2577}
6ebdcee2 2578
4caf0fcd
JC
2579/* Given a BDS, searches for the base layer. */
2580BlockDriverState *bdrv_find_base(BlockDriverState *bs)
2581{
2582 return bdrv_find_overlay(bs, NULL);
6ebdcee2
JC
2583}
2584
2585typedef struct BlkIntermediateStates {
2586 BlockDriverState *bs;
2587 QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2588} BlkIntermediateStates;
2589
2590
2591/*
2592 * Drops images above 'base' up to and including 'top', and sets the image
2593 * above 'top' to have base as its backing file.
2594 *
2595 * Requires that the overlay to 'top' is opened r/w, so that the backing file
2596 * information in 'bs' can be properly updated.
2597 *
2598 * E.g., this will convert the following chain:
2599 * bottom <- base <- intermediate <- top <- active
2600 *
2601 * to
2602 *
2603 * bottom <- base <- active
2604 *
2605 * It is allowed for bottom==base, in which case it converts:
2606 *
2607 * base <- intermediate <- top <- active
2608 *
2609 * to
2610 *
2611 * base <- active
2612 *
54e26900
JC
2613 * If backing_file_str is non-NULL, it will be used when modifying top's
2614 * overlay image metadata.
2615 *
6ebdcee2
JC
2616 * Error conditions:
2617 * if active == top, that is considered an error
2618 *
2619 */
2620int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
54e26900 2621 BlockDriverState *base, const char *backing_file_str)
6ebdcee2
JC
2622{
2623 BlockDriverState *intermediate;
2624 BlockDriverState *base_bs = NULL;
2625 BlockDriverState *new_top_bs = NULL;
2626 BlkIntermediateStates *intermediate_state, *next;
2627 int ret = -EIO;
2628
2629 QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2630 QSIMPLEQ_INIT(&states_to_delete);
2631
2632 if (!top->drv || !base->drv) {
2633 goto exit;
2634 }
2635
2636 new_top_bs = bdrv_find_overlay(active, top);
2637
2638 if (new_top_bs == NULL) {
2639 /* we could not find the image above 'top', this is an error */
2640 goto exit;
2641 }
2642
2643 /* special case of new_top_bs->backing_hd already pointing to base - nothing
2644 * to do, no intermediate images */
2645 if (new_top_bs->backing_hd == base) {
2646 ret = 0;
2647 goto exit;
2648 }
2649
2650 intermediate = top;
2651
2652 /* now we will go down through the list, and add each BDS we find
2653 * into our deletion queue, until we hit the 'base'
2654 */
2655 while (intermediate) {
5839e53b 2656 intermediate_state = g_new0(BlkIntermediateStates, 1);
6ebdcee2
JC
2657 intermediate_state->bs = intermediate;
2658 QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2659
2660 if (intermediate->backing_hd == base) {
2661 base_bs = intermediate->backing_hd;
2662 break;
2663 }
2664 intermediate = intermediate->backing_hd;
2665 }
2666 if (base_bs == NULL) {
2667 /* something went wrong, we did not end at the base. safely
2668 * unravel everything, and exit with error */
2669 goto exit;
2670 }
2671
2672 /* success - we can delete the intermediate states, and link top->base */
54e26900
JC
2673 backing_file_str = backing_file_str ? backing_file_str : base_bs->filename;
2674 ret = bdrv_change_backing_file(new_top_bs, backing_file_str,
6ebdcee2
JC
2675 base_bs->drv ? base_bs->drv->format_name : "");
2676 if (ret) {
2677 goto exit;
2678 }
920beae1 2679 bdrv_set_backing_hd(new_top_bs, base_bs);
6ebdcee2
JC
2680
2681 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2682 /* so that bdrv_close() does not recursively close the chain */
920beae1 2683 bdrv_set_backing_hd(intermediate_state->bs, NULL);
4f6fd349 2684 bdrv_unref(intermediate_state->bs);
6ebdcee2
JC
2685 }
2686 ret = 0;
2687
2688exit:
2689 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2690 g_free(intermediate_state);
2691 }
2692 return ret;
2693}
2694
2695
71d0770c
AL
2696static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2697 size_t size)
2698{
2699 int64_t len;
2700
1dd3a447
KW
2701 if (size > INT_MAX) {
2702 return -EIO;
2703 }
2704
71d0770c
AL
2705 if (!bdrv_is_inserted(bs))
2706 return -ENOMEDIUM;
2707
2708 if (bs->growable)
2709 return 0;
2710
2711 len = bdrv_getlength(bs);
2712
fbb7b4e0
KW
2713 if (offset < 0)
2714 return -EIO;
2715
2716 if ((offset > len) || (len - offset < size))
71d0770c
AL
2717 return -EIO;
2718
2719 return 0;
2720}
2721
2722static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2723 int nb_sectors)
2724{
54db38a4 2725 if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
8f4754ed
KW
2726 return -EIO;
2727 }
2728
eb5a3165
JS
2729 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2730 nb_sectors * BDRV_SECTOR_SIZE);
71d0770c
AL
2731}
2732
1c9805a3
SH
2733typedef struct RwCo {
2734 BlockDriverState *bs;
775aa8b6 2735 int64_t offset;
1c9805a3
SH
2736 QEMUIOVector *qiov;
2737 bool is_write;
2738 int ret;
4105eaaa 2739 BdrvRequestFlags flags;
1c9805a3
SH
2740} RwCo;
2741
2742static void coroutine_fn bdrv_rw_co_entry(void *opaque)
fc01f7e7 2743{
1c9805a3 2744 RwCo *rwco = opaque;
ea2384d3 2745
1c9805a3 2746 if (!rwco->is_write) {
775aa8b6
KW
2747 rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2748 rwco->qiov->size, rwco->qiov,
4105eaaa 2749 rwco->flags);
775aa8b6
KW
2750 } else {
2751 rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2752 rwco->qiov->size, rwco->qiov,
2753 rwco->flags);
1c9805a3
SH
2754 }
2755}
e7a8a783 2756
1c9805a3 2757/*
8d3b1a2d 2758 * Process a vectored synchronous request using coroutines
1c9805a3 2759 */
775aa8b6
KW
2760static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2761 QEMUIOVector *qiov, bool is_write,
2762 BdrvRequestFlags flags)
1c9805a3 2763{
1c9805a3
SH
2764 Coroutine *co;
2765 RwCo rwco = {
2766 .bs = bs,
775aa8b6 2767 .offset = offset,
8d3b1a2d 2768 .qiov = qiov,
1c9805a3
SH
2769 .is_write = is_write,
2770 .ret = NOT_DONE,
4105eaaa 2771 .flags = flags,
1c9805a3 2772 };
e7a8a783 2773
498e386c
ZYW
2774 /**
2775 * In sync call context, when the vcpu is blocked, this throttling timer
2776 * will not fire; so the I/O throttling function has to be disabled here
2777 * if it has been enabled.
2778 */
2779 if (bs->io_limits_enabled) {
2780 fprintf(stderr, "Disabling I/O throttling on '%s' due "
2781 "to synchronous I/O.\n", bdrv_get_device_name(bs));
2782 bdrv_io_limits_disable(bs);
2783 }
2784
1c9805a3
SH
2785 if (qemu_in_coroutine()) {
2786 /* Fast-path if already in coroutine context */
2787 bdrv_rw_co_entry(&rwco);
2788 } else {
2572b37a
SH
2789 AioContext *aio_context = bdrv_get_aio_context(bs);
2790
1c9805a3
SH
2791 co = qemu_coroutine_create(bdrv_rw_co_entry);
2792 qemu_coroutine_enter(co, &rwco);
2793 while (rwco.ret == NOT_DONE) {
2572b37a 2794 aio_poll(aio_context, true);
1c9805a3
SH
2795 }
2796 }
2797 return rwco.ret;
2798}
b338082b 2799
8d3b1a2d
KW
2800/*
2801 * Process a synchronous request using coroutines
2802 */
2803static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
4105eaaa 2804 int nb_sectors, bool is_write, BdrvRequestFlags flags)
8d3b1a2d
KW
2805{
2806 QEMUIOVector qiov;
2807 struct iovec iov = {
2808 .iov_base = (void *)buf,
2809 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2810 };
2811
da15ee51
KW
2812 if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2813 return -EINVAL;
2814 }
2815
8d3b1a2d 2816 qemu_iovec_init_external(&qiov, &iov, 1);
775aa8b6
KW
2817 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2818 &qiov, is_write, flags);
8d3b1a2d
KW
2819}
2820
1c9805a3
SH
2821/* return < 0 if error. See bdrv_write() for the return codes */
2822int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2823 uint8_t *buf, int nb_sectors)
2824{
4105eaaa 2825 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
fc01f7e7
FB
2826}
2827
07d27a44
MA
2828/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2829int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2830 uint8_t *buf, int nb_sectors)
2831{
2832 bool enabled;
2833 int ret;
2834
2835 enabled = bs->io_limits_enabled;
2836 bs->io_limits_enabled = false;
4e7395e8 2837 ret = bdrv_read(bs, sector_num, buf, nb_sectors);
07d27a44
MA
2838 bs->io_limits_enabled = enabled;
2839 return ret;
2840}
2841
5fafdf24 2842/* Return < 0 if error. Important errors are:
19cb3738
FB
2843 -EIO generic I/O error (may happen for all errors)
2844 -ENOMEDIUM No media inserted.
2845 -EINVAL Invalid sector number or nb_sectors
2846 -EACCES Trying to write a read-only device
2847*/
5fafdf24 2848int bdrv_write(BlockDriverState *bs, int64_t sector_num,
fc01f7e7
FB
2849 const uint8_t *buf, int nb_sectors)
2850{
4105eaaa 2851 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
83f64091
FB
2852}
2853
aa7bfbff
PL
2854int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2855 int nb_sectors, BdrvRequestFlags flags)
4105eaaa
PL
2856{
2857 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
aa7bfbff 2858 BDRV_REQ_ZERO_WRITE | flags);
8d3b1a2d
KW
2859}
2860
d75cbb5e
PL
2861/*
2862 * Completely zero out a block device with the help of bdrv_write_zeroes.
2863 * The operation is sped up by checking the block status and only writing
2864 * zeroes to the device if they currently do not return zeroes. Optional
2865 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2866 *
2867 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2868 */
2869int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2870{
d32f7c10 2871 int64_t target_sectors, ret, nb_sectors, sector_num = 0;
d75cbb5e
PL
2872 int n;
2873
d32f7c10
MA
2874 target_sectors = bdrv_nb_sectors(bs);
2875 if (target_sectors < 0) {
2876 return target_sectors;
9ce10c0b 2877 }
9ce10c0b 2878
d75cbb5e 2879 for (;;) {
d32f7c10 2880 nb_sectors = target_sectors - sector_num;
d75cbb5e
PL
2881 if (nb_sectors <= 0) {
2882 return 0;
2883 }
2884 if (nb_sectors > INT_MAX) {
2885 nb_sectors = INT_MAX;
2886 }
2887 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
3d94ce60
PL
2888 if (ret < 0) {
2889 error_report("error getting block status at sector %" PRId64 ": %s",
2890 sector_num, strerror(-ret));
2891 return ret;
2892 }
d75cbb5e
PL
2893 if (ret & BDRV_BLOCK_ZERO) {
2894 sector_num += n;
2895 continue;
2896 }
2897 ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2898 if (ret < 0) {
2899 error_report("error writing zeroes at sector %" PRId64 ": %s",
2900 sector_num, strerror(-ret));
2901 return ret;
2902 }
2903 sector_num += n;
2904 }
2905}
2906
a3ef6571 2907int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
83f64091 2908{
a3ef6571
KW
2909 QEMUIOVector qiov;
2910 struct iovec iov = {
2911 .iov_base = (void *)buf,
2912 .iov_len = bytes,
2913 };
9a8c4cce 2914 int ret;
83f64091 2915
a3ef6571
KW
2916 if (bytes < 0) {
2917 return -EINVAL;
83f64091
FB
2918 }
2919
a3ef6571
KW
2920 qemu_iovec_init_external(&qiov, &iov, 1);
2921 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2922 if (ret < 0) {
2923 return ret;
83f64091 2924 }
a3ef6571
KW
2925
2926 return bytes;
83f64091
FB
2927}
2928
8d3b1a2d 2929int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
83f64091 2930{
9a8c4cce 2931 int ret;
83f64091 2932
8407d5d7
KW
2933 ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2934 if (ret < 0) {
2935 return ret;
83f64091
FB
2936 }
2937
8d3b1a2d
KW
2938 return qiov->size;
2939}
2940
2941int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
8407d5d7 2942 const void *buf, int bytes)
8d3b1a2d
KW
2943{
2944 QEMUIOVector qiov;
2945 struct iovec iov = {
2946 .iov_base = (void *) buf,
8407d5d7 2947 .iov_len = bytes,
8d3b1a2d
KW
2948 };
2949
8407d5d7
KW
2950 if (bytes < 0) {
2951 return -EINVAL;
2952 }
2953
8d3b1a2d
KW
2954 qemu_iovec_init_external(&qiov, &iov, 1);
2955 return bdrv_pwritev(bs, offset, &qiov);
83f64091 2956}
83f64091 2957
f08145fe
KW
2958/*
2959 * Writes to the file and ensures that no writes are reordered across this
2960 * request (acts as a barrier)
2961 *
2962 * Returns 0 on success, -errno in error cases.
2963 */
2964int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2965 const void *buf, int count)
2966{
2967 int ret;
2968
2969 ret = bdrv_pwrite(bs, offset, buf, count);
2970 if (ret < 0) {
2971 return ret;
2972 }
2973
f05fa4ad
PB
2974 /* No flush needed for cache modes that already do it */
2975 if (bs->enable_write_cache) {
f08145fe
KW
2976 bdrv_flush(bs);
2977 }
2978
2979 return 0;
2980}
2981
470c0504 2982static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
ab185921
SH
2983 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2984{
2985 /* Perform I/O through a temporary buffer so that users who scribble over
2986 * their read buffer while the operation is in progress do not end up
2987 * modifying the image file. This is critical for zero-copy guest I/O
2988 * where anything might happen inside guest memory.
2989 */
2990 void *bounce_buffer;
2991
79c053bd 2992 BlockDriver *drv = bs->drv;
ab185921
SH
2993 struct iovec iov;
2994 QEMUIOVector bounce_qiov;
2995 int64_t cluster_sector_num;
2996 int cluster_nb_sectors;
2997 size_t skip_bytes;
2998 int ret;
2999
3000 /* Cover entire cluster so no additional backing file I/O is required when
3001 * allocating cluster in the image file.
3002 */
343bded4
PB
3003 bdrv_round_to_clusters(bs, sector_num, nb_sectors,
3004 &cluster_sector_num, &cluster_nb_sectors);
ab185921 3005
470c0504
SH
3006 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
3007 cluster_sector_num, cluster_nb_sectors);
ab185921
SH
3008
3009 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
857d4f46
KW
3010 iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
3011 if (bounce_buffer == NULL) {
3012 ret = -ENOMEM;
3013 goto err;
3014 }
3015
ab185921
SH
3016 qemu_iovec_init_external(&bounce_qiov, &iov, 1);
3017
79c053bd
SH
3018 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
3019 &bounce_qiov);
ab185921
SH
3020 if (ret < 0) {
3021 goto err;
3022 }
3023
79c053bd
SH
3024 if (drv->bdrv_co_write_zeroes &&
3025 buffer_is_zero(bounce_buffer, iov.iov_len)) {
621f0589 3026 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
aa7bfbff 3027 cluster_nb_sectors, 0);
79c053bd 3028 } else {
f05fa4ad
PB
3029 /* This does not change the data on the disk, it is not necessary
3030 * to flush even in cache=writethrough mode.
3031 */
79c053bd 3032 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
ab185921 3033 &bounce_qiov);
79c053bd
SH
3034 }
3035
ab185921
SH
3036 if (ret < 0) {
3037 /* It might be okay to ignore write errors for guest requests. If this
3038 * is a deliberate copy-on-read then we don't want to ignore the error.
3039 * Simply report it in all cases.
3040 */
3041 goto err;
3042 }
3043
3044 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
03396148
MT
3045 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
3046 nb_sectors * BDRV_SECTOR_SIZE);
ab185921
SH
3047
3048err:
3049 qemu_vfree(bounce_buffer);
3050 return ret;
3051}
3052
c5fbe571 3053/*
d0c7f642
KW
3054 * Forwards an already correctly aligned request to the BlockDriver. This
3055 * handles copy on read and zeroing after EOF; any other features must be
3056 * implemented by the caller.
c5fbe571 3057 */
d0c7f642 3058static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
65afd211 3059 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
ec746e10 3060 int64_t align, QEMUIOVector *qiov, int flags)
da1fa91d
KW
3061{
3062 BlockDriver *drv = bs->drv;
dbffbdcf 3063 int ret;
da1fa91d 3064
d0c7f642
KW
3065 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3066 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
da1fa91d 3067
d0c7f642
KW
3068 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3069 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
8eb029c2 3070 assert(!qiov || bytes == qiov->size);
d0c7f642
KW
3071
3072 /* Handle Copy on Read and associated serialisation */
470c0504 3073 if (flags & BDRV_REQ_COPY_ON_READ) {
7327145f
KW
3074 /* If we touch the same cluster it counts as an overlap. This
3075 * guarantees that allocating writes will be serialized and not race
3076 * with each other for the same cluster. For example, in copy-on-read
3077 * it ensures that the CoR read and write operations are atomic and
3078 * guest writes cannot interleave between them. */
3079 mark_request_serialising(req, bdrv_get_cluster_size(bs));
470c0504
SH
3080 }
3081
2dbafdc0 3082 wait_serialising_requests(req);
f4658285 3083
470c0504 3084 if (flags & BDRV_REQ_COPY_ON_READ) {
ab185921
SH
3085 int pnum;
3086
bdad13b9 3087 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
ab185921
SH
3088 if (ret < 0) {
3089 goto out;
3090 }
3091
3092 if (!ret || pnum != nb_sectors) {
470c0504 3093 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
ab185921
SH
3094 goto out;
3095 }
3096 }
3097
d0c7f642 3098 /* Forward the request to the BlockDriver */
893a8f62
MK
3099 if (!(bs->zero_beyond_eof && bs->growable)) {
3100 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3101 } else {
3102 /* Read zeros after EOF of growable BDSes */
4049082c 3103 int64_t total_sectors, max_nb_sectors;
893a8f62 3104
4049082c
MA
3105 total_sectors = bdrv_nb_sectors(bs);
3106 if (total_sectors < 0) {
3107 ret = total_sectors;
893a8f62
MK
3108 goto out;
3109 }
3110
5f5bcd80
KW
3111 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
3112 align >> BDRV_SECTOR_BITS);
893a8f62 3113 if (max_nb_sectors > 0) {
33f461e0
KW
3114 QEMUIOVector local_qiov;
3115 size_t local_sectors;
3116
3117 max_nb_sectors = MIN(max_nb_sectors, SIZE_MAX / BDRV_SECTOR_BITS);
3118 local_sectors = MIN(max_nb_sectors, nb_sectors);
3119
3120 qemu_iovec_init(&local_qiov, qiov->niov);
3121 qemu_iovec_concat(&local_qiov, qiov, 0,
3122 local_sectors * BDRV_SECTOR_SIZE);
3123
3124 ret = drv->bdrv_co_readv(bs, sector_num, local_sectors,
3125 &local_qiov);
3126
3127 qemu_iovec_destroy(&local_qiov);
893a8f62
MK
3128 } else {
3129 ret = 0;
3130 }
3131
3132 /* Reading beyond end of file is supposed to produce zeroes */
3133 if (ret == 0 && total_sectors < sector_num + nb_sectors) {
3134 uint64_t offset = MAX(0, total_sectors - sector_num);
3135 uint64_t bytes = (sector_num + nb_sectors - offset) *
3136 BDRV_SECTOR_SIZE;
3137 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
3138 }
3139 }
ab185921
SH
3140
3141out:
dbffbdcf 3142 return ret;
da1fa91d
KW
3143}
3144
d0c7f642
KW
3145/*
3146 * Handle a read request in coroutine context
3147 */
1b0288ae
KW
3148static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
3149 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
d0c7f642
KW
3150 BdrvRequestFlags flags)
3151{
3152 BlockDriver *drv = bs->drv;
65afd211
KW
3153 BdrvTrackedRequest req;
3154
1b0288ae
KW
3155 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3156 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3157 uint8_t *head_buf = NULL;
3158 uint8_t *tail_buf = NULL;
3159 QEMUIOVector local_qiov;
3160 bool use_local_qiov = false;
d0c7f642
KW
3161 int ret;
3162
3163 if (!drv) {
3164 return -ENOMEDIUM;
3165 }
1b0288ae 3166 if (bdrv_check_byte_request(bs, offset, bytes)) {
d0c7f642
KW
3167 return -EIO;
3168 }
3169
3170 if (bs->copy_on_read) {
3171 flags |= BDRV_REQ_COPY_ON_READ;
3172 }
3173
3174 /* throttling disk I/O */
3175 if (bs->io_limits_enabled) {
d5103588 3176 bdrv_io_limits_intercept(bs, bytes, false);
1b0288ae
KW
3177 }
3178
3179 /* Align read if necessary by padding qiov */
3180 if (offset & (align - 1)) {
3181 head_buf = qemu_blockalign(bs, align);
3182 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3183 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3184 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3185 use_local_qiov = true;
3186
3187 bytes += offset & (align - 1);
3188 offset = offset & ~(align - 1);
3189 }
3190
3191 if ((offset + bytes) & (align - 1)) {
3192 if (!use_local_qiov) {
3193 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3194 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3195 use_local_qiov = true;
3196 }
3197 tail_buf = qemu_blockalign(bs, align);
3198 qemu_iovec_add(&local_qiov, tail_buf,
3199 align - ((offset + bytes) & (align - 1)));
3200
3201 bytes = ROUND_UP(bytes, align);
3202 }
3203
65afd211 3204 tracked_request_begin(&req, bs, offset, bytes, false);
ec746e10 3205 ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
1b0288ae
KW
3206 use_local_qiov ? &local_qiov : qiov,
3207 flags);
65afd211 3208 tracked_request_end(&req);
1b0288ae
KW
3209
3210 if (use_local_qiov) {
3211 qemu_iovec_destroy(&local_qiov);
3212 qemu_vfree(head_buf);
3213 qemu_vfree(tail_buf);
d0c7f642
KW
3214 }
3215
d0c7f642
KW
3216 return ret;
3217}
3218
1b0288ae
KW
3219static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3220 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3221 BdrvRequestFlags flags)
3222{
3223 if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
3224 return -EINVAL;
3225 }
3226
3227 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3228 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3229}
3230
c5fbe571 3231int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
da1fa91d
KW
3232 int nb_sectors, QEMUIOVector *qiov)
3233{
c5fbe571 3234 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
da1fa91d 3235
470c0504
SH
3236 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3237}
3238
3239int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3240 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3241{
3242 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3243
3244 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3245 BDRV_REQ_COPY_ON_READ);
c5fbe571
SH
3246}
3247
c31cb707
PL
3248/* if no limit is specified in the BlockLimits use a default
3249 * of 32768 512-byte sectors (16 MiB) per request.
3250 */
3251#define MAX_WRITE_ZEROES_DEFAULT 32768
3252
f08f2dda 3253static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
aa7bfbff 3254 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
f08f2dda
SH
3255{
3256 BlockDriver *drv = bs->drv;
3257 QEMUIOVector qiov;
c31cb707
PL
3258 struct iovec iov = {0};
3259 int ret = 0;
f08f2dda 3260
c31cb707
PL
3261 int max_write_zeroes = bs->bl.max_write_zeroes ?
3262 bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
621f0589 3263
c31cb707
PL
3264 while (nb_sectors > 0 && !ret) {
3265 int num = nb_sectors;
3266
b8d71c09
PB
3267 /* Align request. Block drivers can expect the "bulk" of the request
3268 * to be aligned.
3269 */
3270 if (bs->bl.write_zeroes_alignment
3271 && num > bs->bl.write_zeroes_alignment) {
3272 if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3273 /* Make a small request up to the first aligned sector. */
c31cb707 3274 num = bs->bl.write_zeroes_alignment;
b8d71c09
PB
3275 num -= sector_num % bs->bl.write_zeroes_alignment;
3276 } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3277 /* Shorten the request to the last aligned sector. num cannot
3278 * underflow because num > bs->bl.write_zeroes_alignment.
3279 */
3280 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
c31cb707 3281 }
621f0589 3282 }
f08f2dda 3283
c31cb707
PL
3284 /* limit request size */
3285 if (num > max_write_zeroes) {
3286 num = max_write_zeroes;
3287 }
3288
3289 ret = -ENOTSUP;
3290 /* First try the efficient write zeroes operation */
3291 if (drv->bdrv_co_write_zeroes) {
3292 ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3293 }
3294
3295 if (ret == -ENOTSUP) {
3296 /* Fall back to bounce buffer if write zeroes is unsupported */
3297 iov.iov_len = num * BDRV_SECTOR_SIZE;
3298 if (iov.iov_base == NULL) {
857d4f46
KW
3299 iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
3300 if (iov.iov_base == NULL) {
3301 ret = -ENOMEM;
3302 goto fail;
3303 }
b8d71c09 3304 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
c31cb707
PL
3305 }
3306 qemu_iovec_init_external(&qiov, &iov, 1);
f08f2dda 3307
c31cb707 3308 ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
b8d71c09
PB
3309
3310 /* Keep bounce buffer around if it is big enough for all
3311 * all future requests.
3312 */
3313 if (num < max_write_zeroes) {
3314 qemu_vfree(iov.iov_base);
3315 iov.iov_base = NULL;
3316 }
c31cb707
PL
3317 }
3318
3319 sector_num += num;
3320 nb_sectors -= num;
3321 }
f08f2dda 3322
857d4f46 3323fail:
f08f2dda
SH
3324 qemu_vfree(iov.iov_base);
3325 return ret;
3326}
3327
c5fbe571 3328/*
b404f720 3329 * Forwards an already correctly aligned write request to the BlockDriver.
c5fbe571 3330 */
b404f720 3331static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
65afd211
KW
3332 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3333 QEMUIOVector *qiov, int flags)
c5fbe571
SH
3334{
3335 BlockDriver *drv = bs->drv;
28de2dcd 3336 bool waited;
6b7cb247 3337 int ret;
da1fa91d 3338
b404f720
KW
3339 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3340 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
f4658285 3341
b404f720
KW
3342 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3343 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
8eb029c2 3344 assert(!qiov || bytes == qiov->size);
cc0681c4 3345
28de2dcd
KW
3346 waited = wait_serialising_requests(req);
3347 assert(!waited || !req->serialising);
af91f9a7
KW
3348 assert(req->overlap_offset <= offset);
3349 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
244eadef 3350
65afd211 3351 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
d616b224 3352
465bee1d
PL
3353 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
3354 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
3355 qemu_iovec_is_zero(qiov)) {
3356 flags |= BDRV_REQ_ZERO_WRITE;
3357 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
3358 flags |= BDRV_REQ_MAY_UNMAP;
3359 }
3360 }
3361
d616b224
SH
3362 if (ret < 0) {
3363 /* Do nothing, write notifier decided to fail this request */
3364 } else if (flags & BDRV_REQ_ZERO_WRITE) {
9e1cb96d 3365 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
aa7bfbff 3366 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
f08f2dda 3367 } else {
9e1cb96d 3368 BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
f08f2dda
SH
3369 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3370 }
9e1cb96d 3371 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
6b7cb247 3372
f05fa4ad
PB
3373 if (ret == 0 && !bs->enable_write_cache) {
3374 ret = bdrv_co_flush(bs);
3375 }
3376
e4654d2d 3377 bdrv_set_dirty(bs, sector_num, nb_sectors);
da1fa91d 3378
5366d0c8 3379 block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);
5e5a94b6 3380
df2a6f29
PB
3381 if (bs->growable && ret >= 0) {
3382 bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3383 }
da1fa91d 3384
6b7cb247 3385 return ret;
da1fa91d
KW
3386}
3387
b404f720
KW
3388/*
3389 * Handle a write request in coroutine context
3390 */
6601553e
KW
3391static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3392 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
b404f720
KW
3393 BdrvRequestFlags flags)
3394{
65afd211 3395 BdrvTrackedRequest req;
3b8242e0
KW
3396 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3397 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3398 uint8_t *head_buf = NULL;
3399 uint8_t *tail_buf = NULL;
3400 QEMUIOVector local_qiov;
3401 bool use_local_qiov = false;
b404f720
KW
3402 int ret;
3403
3404 if (!bs->drv) {
3405 return -ENOMEDIUM;
3406 }
3407 if (bs->read_only) {
3408 return -EACCES;
3409 }
6601553e 3410 if (bdrv_check_byte_request(bs, offset, bytes)) {
b404f720
KW
3411 return -EIO;
3412 }
3413
b404f720
KW
3414 /* throttling disk I/O */
3415 if (bs->io_limits_enabled) {
d5103588 3416 bdrv_io_limits_intercept(bs, bytes, true);
b404f720
KW
3417 }
3418
3b8242e0
KW
3419 /*
3420 * Align write if necessary by performing a read-modify-write cycle.
3421 * Pad qiov with the read parts and be sure to have a tracked request not
3422 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3423 */
65afd211 3424 tracked_request_begin(&req, bs, offset, bytes, true);
3b8242e0
KW
3425
3426 if (offset & (align - 1)) {
3427 QEMUIOVector head_qiov;
3428 struct iovec head_iov;
3429
3430 mark_request_serialising(&req, align);
3431 wait_serialising_requests(&req);
3432
3433 head_buf = qemu_blockalign(bs, align);
3434 head_iov = (struct iovec) {
3435 .iov_base = head_buf,
3436 .iov_len = align,
3437 };
3438 qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3439
9e1cb96d 3440 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3b8242e0
KW
3441 ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3442 align, &head_qiov, 0);
3443 if (ret < 0) {
3444 goto fail;
3445 }
9e1cb96d 3446 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3b8242e0
KW
3447
3448 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3449 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3450 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3451 use_local_qiov = true;
3452
3453 bytes += offset & (align - 1);
3454 offset = offset & ~(align - 1);
3455 }
3456
3457 if ((offset + bytes) & (align - 1)) {
3458 QEMUIOVector tail_qiov;
3459 struct iovec tail_iov;
3460 size_t tail_bytes;
28de2dcd 3461 bool waited;
3b8242e0
KW
3462
3463 mark_request_serialising(&req, align);
28de2dcd
KW
3464 waited = wait_serialising_requests(&req);
3465 assert(!waited || !use_local_qiov);
3b8242e0
KW
3466
3467 tail_buf = qemu_blockalign(bs, align);
3468 tail_iov = (struct iovec) {
3469 .iov_base = tail_buf,
3470 .iov_len = align,
3471 };
3472 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3473
9e1cb96d 3474 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3b8242e0
KW
3475 ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3476 align, &tail_qiov, 0);
3477 if (ret < 0) {
3478 goto fail;
3479 }
9e1cb96d 3480 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3b8242e0
KW
3481
3482 if (!use_local_qiov) {
3483 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3484 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3485 use_local_qiov = true;
3486 }
3487
3488 tail_bytes = (offset + bytes) & (align - 1);
3489 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3490
3491 bytes = ROUND_UP(bytes, align);
3492 }
3493
3494 ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3495 use_local_qiov ? &local_qiov : qiov,
3496 flags);
3497
3498fail:
65afd211 3499 tracked_request_end(&req);
b404f720 3500
3b8242e0
KW
3501 if (use_local_qiov) {
3502 qemu_iovec_destroy(&local_qiov);
3b8242e0 3503 }
99c4a85c
KW
3504 qemu_vfree(head_buf);
3505 qemu_vfree(tail_buf);
3b8242e0 3506
b404f720
KW
3507 return ret;
3508}
3509
6601553e
KW
3510static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3511 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3512 BdrvRequestFlags flags)
3513{
3514 if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3515 return -EINVAL;
3516 }
3517
3518 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3519 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3520}
3521
c5fbe571
SH
3522int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3523 int nb_sectors, QEMUIOVector *qiov)
3524{
3525 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3526
f08f2dda
SH
3527 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3528}
3529
3530int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
aa7bfbff
PL
3531 int64_t sector_num, int nb_sectors,
3532 BdrvRequestFlags flags)
f08f2dda 3533{
94d6ff21 3534 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
f08f2dda 3535
d32f35cb
PL
3536 if (!(bs->open_flags & BDRV_O_UNMAP)) {
3537 flags &= ~BDRV_REQ_MAY_UNMAP;
3538 }
3539
f08f2dda 3540 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
aa7bfbff 3541 BDRV_REQ_ZERO_WRITE | flags);
c5fbe571
SH
3542}
3543
83f64091
FB
3544/**
3545 * Truncate file to 'offset' bytes (needed only for file protocols)
3546 */
3547int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3548{
3549 BlockDriver *drv = bs->drv;
51762288 3550 int ret;
83f64091 3551 if (!drv)
19cb3738 3552 return -ENOMEDIUM;
83f64091
FB
3553 if (!drv->bdrv_truncate)
3554 return -ENOTSUP;
59f2689d
NS
3555 if (bs->read_only)
3556 return -EACCES;
9c75e168 3557
51762288
SH
3558 ret = drv->bdrv_truncate(bs, offset);
3559 if (ret == 0) {
3560 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
145feb17 3561 bdrv_dev_resize_cb(bs);
51762288
SH
3562 }
3563 return ret;
83f64091
FB
3564}
3565
4a1d5e1f
FZ
3566/**
3567 * Length of a allocated file in bytes. Sparse files are counted by actual
3568 * allocated space. Return < 0 if error or unknown.
3569 */
3570int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3571{
3572 BlockDriver *drv = bs->drv;
3573 if (!drv) {
3574 return -ENOMEDIUM;
3575 }
3576 if (drv->bdrv_get_allocated_file_size) {
3577 return drv->bdrv_get_allocated_file_size(bs);
3578 }
3579 if (bs->file) {
3580 return bdrv_get_allocated_file_size(bs->file);
3581 }
3582 return -ENOTSUP;
3583}
3584
83f64091 3585/**
65a9bb25 3586 * Return number of sectors on success, -errno on error.
83f64091 3587 */
65a9bb25 3588int64_t bdrv_nb_sectors(BlockDriverState *bs)
83f64091
FB
3589{
3590 BlockDriver *drv = bs->drv;
65a9bb25 3591
83f64091 3592 if (!drv)
19cb3738 3593 return -ENOMEDIUM;
51762288 3594
b94a2610
KW
3595 if (drv->has_variable_length) {
3596 int ret = refresh_total_sectors(bs, bs->total_sectors);
3597 if (ret < 0) {
3598 return ret;
46a4e4e6 3599 }
83f64091 3600 }
65a9bb25
MA
3601 return bs->total_sectors;
3602}
3603
3604/**
3605 * Return length in bytes on success, -errno on error.
3606 * The length is always a multiple of BDRV_SECTOR_SIZE.
3607 */
3608int64_t bdrv_getlength(BlockDriverState *bs)
3609{
3610 int64_t ret = bdrv_nb_sectors(bs);
3611
3612 return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE;
fc01f7e7
FB
3613}
3614
19cb3738 3615/* return 0 as number of sectors if no device present or error */
96b8f136 3616void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
fc01f7e7 3617{
65a9bb25
MA
3618 int64_t nb_sectors = bdrv_nb_sectors(bs);
3619
3620 *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
fc01f7e7 3621}
cf98951b 3622
ff06f5f3
PB
3623void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3624 BlockdevOnError on_write_error)
abd7f68d
MA
3625{
3626 bs->on_read_error = on_read_error;
3627 bs->on_write_error = on_write_error;
3628}
3629
1ceee0d5 3630BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
abd7f68d
MA
3631{
3632 return is_read ? bs->on_read_error : bs->on_write_error;
3633}
3634
3e1caa5f
PB
3635BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3636{
3637 BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3638
3639 switch (on_err) {
3640 case BLOCKDEV_ON_ERROR_ENOSPC:
a589569f
WX
3641 return (error == ENOSPC) ?
3642 BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
3e1caa5f 3643 case BLOCKDEV_ON_ERROR_STOP:
a589569f 3644 return BLOCK_ERROR_ACTION_STOP;
3e1caa5f 3645 case BLOCKDEV_ON_ERROR_REPORT:
a589569f 3646 return BLOCK_ERROR_ACTION_REPORT;
3e1caa5f 3647 case BLOCKDEV_ON_ERROR_IGNORE:
a589569f 3648 return BLOCK_ERROR_ACTION_IGNORE;
3e1caa5f
PB
3649 default:
3650 abort();
3651 }
3652}
3653
c7c2ff0c
LC
3654static void send_qmp_error_event(BlockDriverState *bs,
3655 BlockErrorAction action,
3656 bool is_read, int error)
3657{
3658 BlockErrorAction ac;
3659
3660 ac = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
3661 qapi_event_send_block_io_error(bdrv_get_device_name(bs), ac, action,
3662 bdrv_iostatus_is_enabled(bs),
624ff573
LC
3663 error == ENOSPC, strerror(error),
3664 &error_abort);
c7c2ff0c
LC
3665}
3666
3e1caa5f
PB
3667/* This is done by device models because, while the block layer knows
3668 * about the error, it does not know whether an operation comes from
3669 * the device or the block layer (from a job, for example).
3670 */
3671void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3672 bool is_read, int error)
3673{
3674 assert(error >= 0);
2bd3bce8 3675
a589569f 3676 if (action == BLOCK_ERROR_ACTION_STOP) {
2bd3bce8
PB
3677 /* First set the iostatus, so that "info block" returns an iostatus
3678 * that matches the events raised so far (an additional error iostatus
3679 * is fine, but not a lost one).
3680 */
3e1caa5f 3681 bdrv_iostatus_set_err(bs, error);
2bd3bce8
PB
3682
3683 /* Then raise the request to stop the VM and the event.
3684 * qemu_system_vmstop_request_prepare has two effects. First,
3685 * it ensures that the STOP event always comes after the
3686 * BLOCK_IO_ERROR event. Second, it ensures that even if management
3687 * can observe the STOP event and do a "cont" before the STOP
3688 * event is issued, the VM will not stop. In this case, vm_start()
3689 * also ensures that the STOP/RESUME pair of events is emitted.
3690 */
3691 qemu_system_vmstop_request_prepare();
c7c2ff0c 3692 send_qmp_error_event(bs, action, is_read, error);
2bd3bce8
PB
3693 qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
3694 } else {
c7c2ff0c 3695 send_qmp_error_event(bs, action, is_read, error);
3e1caa5f
PB
3696 }
3697}
3698
b338082b
FB
3699int bdrv_is_read_only(BlockDriverState *bs)
3700{
3701 return bs->read_only;
3702}
3703
985a03b0
TS
3704int bdrv_is_sg(BlockDriverState *bs)
3705{
3706 return bs->sg;
3707}
3708
e900a7b7
CH
3709int bdrv_enable_write_cache(BlockDriverState *bs)
3710{
3711 return bs->enable_write_cache;
3712}
3713
425b0148
PB
3714void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3715{
3716 bs->enable_write_cache = wce;
55b110f2
JC
3717
3718 /* so a reopen() will preserve wce */
3719 if (wce) {
3720 bs->open_flags |= BDRV_O_CACHE_WB;
3721 } else {
3722 bs->open_flags &= ~BDRV_O_CACHE_WB;
3723 }
425b0148
PB
3724}
3725
ea2384d3
FB
3726int bdrv_is_encrypted(BlockDriverState *bs)
3727{
3728 if (bs->backing_hd && bs->backing_hd->encrypted)
3729 return 1;
3730 return bs->encrypted;
3731}
3732
c0f4ce77
AL
3733int bdrv_key_required(BlockDriverState *bs)
3734{
3735 BlockDriverState *backing_hd = bs->backing_hd;
3736
3737 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3738 return 1;
3739 return (bs->encrypted && !bs->valid_key);
3740}
3741
ea2384d3
FB
3742int bdrv_set_key(BlockDriverState *bs, const char *key)
3743{
3744 int ret;
3745 if (bs->backing_hd && bs->backing_hd->encrypted) {
3746 ret = bdrv_set_key(bs->backing_hd, key);
3747 if (ret < 0)
3748 return ret;
3749 if (!bs->encrypted)
3750 return 0;
3751 }
fd04a2ae
SH
3752 if (!bs->encrypted) {
3753 return -EINVAL;
3754 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3755 return -ENOMEDIUM;
3756 }
c0f4ce77 3757 ret = bs->drv->bdrv_set_key(bs, key);
bb5fc20f
AL
3758 if (ret < 0) {
3759 bs->valid_key = 0;
3760 } else if (!bs->valid_key) {
3761 bs->valid_key = 1;
3762 /* call the change callback now, we skipped it on open */
7d4b4ba5 3763 bdrv_dev_change_media_cb(bs, true);
bb5fc20f 3764 }
c0f4ce77 3765 return ret;
ea2384d3
FB
3766}
3767
f8d6bba1 3768const char *bdrv_get_format_name(BlockDriverState *bs)
ea2384d3 3769{
f8d6bba1 3770 return bs->drv ? bs->drv->format_name : NULL;
ea2384d3
FB
3771}
3772
ada42401
SH
/*
 * qsort() comparator for an array of C strings.
 *
 * qsort() hands the comparator pointers to the array elements, i.e.
 * pointers to 'const char *', so each argument has to be dereferenced once
 * before the strings themselves can be compared.
 *
 * Returns the strcmp() ordering of the two strings.
 */
static int qsort_strcmp(const void *a, const void *b)
{
    const char *const *lhs = a;
    const char *const *rhs = b;

    return strcmp(*lhs, *rhs);
}
3777
5fafdf24 3778void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
ea2384d3
FB
3779 void *opaque)
3780{
3781 BlockDriver *drv;
e855e4fb 3782 int count = 0;
ada42401 3783 int i;
e855e4fb 3784 const char **formats = NULL;
ea2384d3 3785
8a22f02a 3786 QLIST_FOREACH(drv, &bdrv_drivers, list) {
e855e4fb
JC
3787 if (drv->format_name) {
3788 bool found = false;
3789 int i = count;
3790 while (formats && i && !found) {
3791 found = !strcmp(formats[--i], drv->format_name);
3792 }
3793
3794 if (!found) {
5839e53b 3795 formats = g_renew(const char *, formats, count + 1);
e855e4fb 3796 formats[count++] = drv->format_name;
e855e4fb
JC
3797 }
3798 }
ea2384d3 3799 }
ada42401
SH
3800
3801 qsort(formats, count, sizeof(formats[0]), qsort_strcmp);
3802
3803 for (i = 0; i < count; i++) {
3804 it(opaque, formats[i]);
3805 }
3806
e855e4fb 3807 g_free(formats);
ea2384d3
FB
3808}
3809
dc364f4c 3810/* This function is to find block backend bs */
b338082b
FB
3811BlockDriverState *bdrv_find(const char *name)
3812{
3813 BlockDriverState *bs;
3814
dc364f4c 3815 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1b7bdbc1 3816 if (!strcmp(name, bs->device_name)) {
b338082b 3817 return bs;
1b7bdbc1 3818 }
b338082b
FB
3819 }
3820 return NULL;
3821}
3822
dc364f4c
BC
3823/* This function is to find a node in the bs graph */
3824BlockDriverState *bdrv_find_node(const char *node_name)
3825{
3826 BlockDriverState *bs;
3827
3828 assert(node_name);
3829
3830 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3831 if (!strcmp(node_name, bs->node_name)) {
3832 return bs;
3833 }
3834 }
3835 return NULL;
3836}
3837
c13163fb
BC
3838/* Put this QMP function here so it can access the static graph_bdrv_states. */
3839BlockDeviceInfoList *bdrv_named_nodes_list(void)
3840{
3841 BlockDeviceInfoList *list, *entry;
3842 BlockDriverState *bs;
3843
3844 list = NULL;
3845 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3846 entry = g_malloc0(sizeof(*entry));
3847 entry->value = bdrv_block_device_info(bs);
3848 entry->next = list;
3849 list = entry;
3850 }
3851
3852 return list;
3853}
3854
12d3ba82
BC
3855BlockDriverState *bdrv_lookup_bs(const char *device,
3856 const char *node_name,
3857 Error **errp)
3858{
3859 BlockDriverState *bs = NULL;
3860
12d3ba82
BC
3861 if (device) {
3862 bs = bdrv_find(device);
3863
dd67fa50
BC
3864 if (bs) {
3865 return bs;
12d3ba82 3866 }
12d3ba82
BC
3867 }
3868
dd67fa50
BC
3869 if (node_name) {
3870 bs = bdrv_find_node(node_name);
12d3ba82 3871
dd67fa50
BC
3872 if (bs) {
3873 return bs;
3874 }
12d3ba82
BC
3875 }
3876
dd67fa50
BC
3877 error_setg(errp, "Cannot find device=%s nor node_name=%s",
3878 device ? device : "",
3879 node_name ? node_name : "");
3880 return NULL;
12d3ba82
BC
3881}
3882
5a6684d2
JC
3883/* If 'base' is in the same chain as 'top', return true. Otherwise,
3884 * return false. If either argument is NULL, return false. */
3885bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
3886{
3887 while (top && top != base) {
3888 top = top->backing_hd;
3889 }
3890
3891 return top != NULL;
3892}
3893
2f399b0a
MA
3894BlockDriverState *bdrv_next(BlockDriverState *bs)
3895{
3896 if (!bs) {
3897 return QTAILQ_FIRST(&bdrv_states);
3898 }
dc364f4c 3899 return QTAILQ_NEXT(bs, device_list);
2f399b0a
MA
3900}
3901
51de9760 3902void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
81d0912d
FB
3903{
3904 BlockDriverState *bs;
3905
dc364f4c 3906 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
51de9760 3907 it(opaque, bs);
81d0912d
FB
3908 }
3909}
3910
ea2384d3
FB
3911const char *bdrv_get_device_name(BlockDriverState *bs)
3912{
3913 return bs->device_name;
3914}
3915
c8433287
MA
3916int bdrv_get_flags(BlockDriverState *bs)
3917{
3918 return bs->open_flags;
3919}
3920
f0f0fdfe 3921int bdrv_flush_all(void)
c6ca28d6
AL
3922{
3923 BlockDriverState *bs;
f0f0fdfe 3924 int result = 0;
c6ca28d6 3925
dc364f4c 3926 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
ed78cda3
SH
3927 AioContext *aio_context = bdrv_get_aio_context(bs);
3928 int ret;
3929
3930 aio_context_acquire(aio_context);
3931 ret = bdrv_flush(bs);
f0f0fdfe
KW
3932 if (ret < 0 && !result) {
3933 result = ret;
3934 }
ed78cda3 3935 aio_context_release(aio_context);
1b7bdbc1 3936 }
f0f0fdfe
KW
3937
3938 return result;
c6ca28d6
AL
3939}
3940
3ac21627
PL
3941int bdrv_has_zero_init_1(BlockDriverState *bs)
3942{
3943 return 1;
3944}
3945
f2feebbd
KW
3946int bdrv_has_zero_init(BlockDriverState *bs)
3947{
3948 assert(bs->drv);
3949
11212d8f
PB
3950 /* If BS is a copy on write image, it is initialized to
3951 the contents of the base image, which may not be zeroes. */
3952 if (bs->backing_hd) {
3953 return 0;
3954 }
336c1c12
KW
3955 if (bs->drv->bdrv_has_zero_init) {
3956 return bs->drv->bdrv_has_zero_init(bs);
f2feebbd
KW
3957 }
3958
3ac21627
PL
3959 /* safe default */
3960 return 0;
f2feebbd
KW
3961}
3962
4ce78691
PL
3963bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3964{
3965 BlockDriverInfo bdi;
3966
3967 if (bs->backing_hd) {
3968 return false;
3969 }
3970
3971 if (bdrv_get_info(bs, &bdi) == 0) {
3972 return bdi.unallocated_blocks_are_zero;
3973 }
3974
3975 return false;
3976}
3977
3978bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3979{
3980 BlockDriverInfo bdi;
3981
3982 if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3983 return false;
3984 }
3985
3986 if (bdrv_get_info(bs, &bdi) == 0) {
3987 return bdi.can_write_zeroes_with_unmap;
3988 }
3989
3990 return false;
3991}
3992
b6b8a333 3993typedef struct BdrvCoGetBlockStatusData {
376ae3f1 3994 BlockDriverState *bs;
b35b2bba 3995 BlockDriverState *base;
376ae3f1
SH
3996 int64_t sector_num;
3997 int nb_sectors;
3998 int *pnum;
b6b8a333 3999 int64_t ret;
376ae3f1 4000 bool done;
b6b8a333 4001} BdrvCoGetBlockStatusData;
376ae3f1 4002
f58c7b35
TS
4003/*
4004 * Returns true iff the specified sector is present in the disk image. Drivers
4005 * not implementing the functionality are assumed to not support backing files,
4006 * hence all their sectors are reported as allocated.
4007 *
bd9533e3
SH
4008 * If 'sector_num' is beyond the end of the disk image the return value is 0
4009 * and 'pnum' is set to 0.
4010 *
f58c7b35
TS
4011 * 'pnum' is set to the number of sectors (including and immediately following
4012 * the specified sector) that are known to be in the same
4013 * allocated/unallocated state.
4014 *
bd9533e3
SH
4015 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
4016 * beyond the end of the disk image it will be clamped.
f58c7b35 4017 */
b6b8a333
PB
4018static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
4019 int64_t sector_num,
4020 int nb_sectors, int *pnum)
f58c7b35 4021{
30a7f2fc 4022 int64_t total_sectors;
bd9533e3 4023 int64_t n;
5daa74a6 4024 int64_t ret, ret2;
bd9533e3 4025
30a7f2fc
MA
4026 total_sectors = bdrv_nb_sectors(bs);
4027 if (total_sectors < 0) {
4028 return total_sectors;
617ccb46
PB
4029 }
4030
30a7f2fc 4031 if (sector_num >= total_sectors) {
bd9533e3
SH
4032 *pnum = 0;
4033 return 0;
4034 }
4035
30a7f2fc 4036 n = total_sectors - sector_num;
bd9533e3
SH
4037 if (n < nb_sectors) {
4038 nb_sectors = n;
4039 }
4040
b6b8a333 4041 if (!bs->drv->bdrv_co_get_block_status) {
bd9533e3 4042 *pnum = nb_sectors;
e88ae226 4043 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
918e92d7
PB
4044 if (bs->drv->protocol_name) {
4045 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
4046 }
4047 return ret;
f58c7b35 4048 }
6aebab14 4049
415b5b01
PB
4050 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
4051 if (ret < 0) {
3e0a233d 4052 *pnum = 0;
415b5b01
PB
4053 return ret;
4054 }
4055
92bc50a5
PL
4056 if (ret & BDRV_BLOCK_RAW) {
4057 assert(ret & BDRV_BLOCK_OFFSET_VALID);
4058 return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
4059 *pnum, pnum);
4060 }
4061
e88ae226
KW
4062 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
4063 ret |= BDRV_BLOCK_ALLOCATED;
4064 }
4065
c3d86884
PL
4066 if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
4067 if (bdrv_unallocated_blocks_are_zero(bs)) {
f0ad5712 4068 ret |= BDRV_BLOCK_ZERO;
1f9db224 4069 } else if (bs->backing_hd) {
f0ad5712 4070 BlockDriverState *bs2 = bs->backing_hd;
30a7f2fc
MA
4071 int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
4072 if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
f0ad5712
PB
4073 ret |= BDRV_BLOCK_ZERO;
4074 }
4075 }
415b5b01 4076 }
5daa74a6
PB
4077
4078 if (bs->file &&
4079 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
4080 (ret & BDRV_BLOCK_OFFSET_VALID)) {
4081 ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
4082 *pnum, pnum);
4083 if (ret2 >= 0) {
4084 /* Ignore errors. This is just providing extra information, it
4085 * is useful but not necessary.
4086 */
4087 ret |= (ret2 & BDRV_BLOCK_ZERO);
4088 }
4089 }
4090
415b5b01 4091 return ret;
060f51c9
SH
4092}
4093
b6b8a333
PB
4094/* Coroutine wrapper for bdrv_get_block_status() */
4095static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
060f51c9 4096{
b6b8a333 4097 BdrvCoGetBlockStatusData *data = opaque;
060f51c9
SH
4098 BlockDriverState *bs = data->bs;
4099
b6b8a333
PB
4100 data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
4101 data->pnum);
060f51c9
SH
4102 data->done = true;
4103}
4104
4105/*
b6b8a333 4106 * Synchronous wrapper around bdrv_co_get_block_status().
060f51c9 4107 *
b6b8a333 4108 * See bdrv_co_get_block_status() for details.
060f51c9 4109 */
b6b8a333
PB
4110int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
4111 int nb_sectors, int *pnum)
060f51c9 4112{
6aebab14 4113 Coroutine *co;
b6b8a333 4114 BdrvCoGetBlockStatusData data = {
6aebab14
SH
4115 .bs = bs,
4116 .sector_num = sector_num,
4117 .nb_sectors = nb_sectors,
4118 .pnum = pnum,
4119 .done = false,
4120 };
4121
bdad13b9
PB
4122 if (qemu_in_coroutine()) {
4123 /* Fast-path if already in coroutine context */
b6b8a333 4124 bdrv_get_block_status_co_entry(&data);
bdad13b9 4125 } else {
2572b37a
SH
4126 AioContext *aio_context = bdrv_get_aio_context(bs);
4127
b6b8a333 4128 co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
bdad13b9
PB
4129 qemu_coroutine_enter(co, &data);
4130 while (!data.done) {
2572b37a 4131 aio_poll(aio_context, true);
bdad13b9 4132 }
6aebab14
SH
4133 }
4134 return data.ret;
f58c7b35
TS
4135}
4136
b6b8a333
PB
4137int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
4138 int nb_sectors, int *pnum)
4139{
4333bb71
PB
4140 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
4141 if (ret < 0) {
4142 return ret;
4143 }
01fb2705 4144 return !!(ret & BDRV_BLOCK_ALLOCATED);
b6b8a333
PB
4145}
4146
188a7bbf
PB
4147/*
4148 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
4149 *
4150 * Return true if the given sector is allocated in any image between
4151 * BASE and TOP (inclusive). BASE can be NULL to check if the given
4152 * sector is allocated in any image of the chain. Return false otherwise.
4153 *
4154 * 'pnum' is set to the number of sectors (including and immediately following
4155 * the specified sector) that are known to be in the same
4156 * allocated/unallocated state.
4157 *
4158 */
4f578637
PB
4159int bdrv_is_allocated_above(BlockDriverState *top,
4160 BlockDriverState *base,
4161 int64_t sector_num,
4162 int nb_sectors, int *pnum)
188a7bbf
PB
4163{
4164 BlockDriverState *intermediate;
4165 int ret, n = nb_sectors;
4166
4167 intermediate = top;
4168 while (intermediate && intermediate != base) {
4169 int pnum_inter;
bdad13b9
PB
4170 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
4171 &pnum_inter);
188a7bbf
PB
4172 if (ret < 0) {
4173 return ret;
4174 } else if (ret) {
4175 *pnum = pnum_inter;
4176 return 1;
4177 }
4178
4179 /*
4180 * [sector_num, nb_sectors] is unallocated on top but intermediate
4181 * might have
4182 *
4183 * [sector_num+x, nr_sectors] allocated.
4184 */
63ba17d3
VI
4185 if (n > pnum_inter &&
4186 (intermediate == top ||
4187 sector_num + pnum_inter < intermediate->total_sectors)) {
188a7bbf
PB
4188 n = pnum_inter;
4189 }
4190
4191 intermediate = intermediate->backing_hd;
4192 }
4193
4194 *pnum = n;
4195 return 0;
4196}
4197
045df330
AL
4198const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4199{
4200 if (bs->backing_hd && bs->backing_hd->encrypted)
4201 return bs->backing_file;
4202 else if (bs->encrypted)
4203 return bs->filename;
4204 else
4205 return NULL;
4206}
4207
5fafdf24 4208void bdrv_get_backing_filename(BlockDriverState *bs,
83f64091
FB
4209 char *filename, int filename_size)
4210{
3574c608 4211 pstrcpy(filename, filename_size, bs->backing_file);
83f64091
FB
4212}
4213
5fafdf24 4214int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
faea38e7
FB
4215 const uint8_t *buf, int nb_sectors)
4216{
4217 BlockDriver *drv = bs->drv;
4218 if (!drv)
19cb3738 4219 return -ENOMEDIUM;
faea38e7
FB
4220 if (!drv->bdrv_write_compressed)
4221 return -ENOTSUP;
fbb7b4e0
KW
4222 if (bdrv_check_request(bs, sector_num, nb_sectors))
4223 return -EIO;
a55eb92c 4224
e4654d2d 4225 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
a55eb92c 4226
faea38e7
FB
4227 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4228}
3b46e624 4229
faea38e7
FB
4230int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4231{
4232 BlockDriver *drv = bs->drv;
4233 if (!drv)
19cb3738 4234 return -ENOMEDIUM;
faea38e7
FB
4235 if (!drv->bdrv_get_info)
4236 return -ENOTSUP;
4237 memset(bdi, 0, sizeof(*bdi));
4238 return drv->bdrv_get_info(bs, bdi);
4239}
4240
eae041fe
HR
4241ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4242{
4243 BlockDriver *drv = bs->drv;
4244 if (drv && drv->bdrv_get_specific_info) {
4245 return drv->bdrv_get_specific_info(bs);
4246 }
4247 return NULL;
4248}
4249
45566e9c
CH
4250int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4251 int64_t pos, int size)
cf8074b3
KW
4252{
4253 QEMUIOVector qiov;
4254 struct iovec iov = {
4255 .iov_base = (void *) buf,
4256 .iov_len = size,
4257 };
4258
4259 qemu_iovec_init_external(&qiov, &iov, 1);
4260 return bdrv_writev_vmstate(bs, &qiov, pos);
4261}
4262
4263int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
178e08a5
AL
4264{
4265 BlockDriver *drv = bs->drv;
cf8074b3
KW
4266
4267 if (!drv) {
178e08a5 4268 return -ENOMEDIUM;
cf8074b3
KW
4269 } else if (drv->bdrv_save_vmstate) {
4270 return drv->bdrv_save_vmstate(bs, qiov, pos);
4271 } else if (bs->file) {
4272 return bdrv_writev_vmstate(bs->file, qiov, pos);
4273 }
4274
7cdb1f6d 4275 return -ENOTSUP;
178e08a5
AL
4276}
4277
45566e9c
CH
4278int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4279 int64_t pos, int size)
178e08a5
AL
4280{
4281 BlockDriver *drv = bs->drv;
4282 if (!drv)
4283 return -ENOMEDIUM;
7cdb1f6d
MK
4284 if (drv->bdrv_load_vmstate)
4285 return drv->bdrv_load_vmstate(bs, buf, pos, size);
4286 if (bs->file)
4287 return bdrv_load_vmstate(bs->file, buf, pos, size);
4288 return -ENOTSUP;
178e08a5
AL
4289}
4290
8b9b0cc2
KW
4291void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4292{
bf736fe3 4293 if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
8b9b0cc2
KW
4294 return;
4295 }
4296
bf736fe3 4297 bs->drv->bdrv_debug_event(bs, event);
41c695c7
KW
4298}
4299
4300int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4301 const char *tag)
4302{
4303 while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4304 bs = bs->file;
4305 }
4306
4307 if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4308 return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4309 }
4310
4311 return -ENOTSUP;
4312}
4313
4cc70e93
FZ
4314int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4315{
4316 while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4317 bs = bs->file;
4318 }
4319
4320 if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4321 return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4322 }
4323
4324 return -ENOTSUP;
4325}
4326
41c695c7
KW
4327int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4328{
938789ea 4329 while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
41c695c7
KW
4330 bs = bs->file;
4331 }
8b9b0cc2 4332
41c695c7
KW
4333 if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4334 return bs->drv->bdrv_debug_resume(bs, tag);
4335 }
4336
4337 return -ENOTSUP;
4338}
4339
4340bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4341{
4342 while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4343 bs = bs->file;
4344 }
4345
4346 if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4347 return bs->drv->bdrv_debug_is_suspended(bs, tag);
4348 }
4349
4350 return false;
8b9b0cc2
KW
4351}
4352
199630b6
BS
4353int bdrv_is_snapshot(BlockDriverState *bs)
4354{
4355 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4356}
4357
b1b1d783
JC
4358/* backing_file can either be relative, or absolute, or a protocol. If it is
4359 * relative, it must be relative to the chain. So, passing in bs->filename
4360 * from a BDS as backing_file should not be done, as that may be relative to
4361 * the CWD rather than the chain. */
e8a6bb9c
MT
4362BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4363 const char *backing_file)
4364{
b1b1d783
JC
4365 char *filename_full = NULL;
4366 char *backing_file_full = NULL;
4367 char *filename_tmp = NULL;
4368 int is_protocol = 0;
4369 BlockDriverState *curr_bs = NULL;
4370 BlockDriverState *retval = NULL;
4371
4372 if (!bs || !bs->drv || !backing_file) {
e8a6bb9c
MT
4373 return NULL;
4374 }
4375
b1b1d783
JC
4376 filename_full = g_malloc(PATH_MAX);
4377 backing_file_full = g_malloc(PATH_MAX);
4378 filename_tmp = g_malloc(PATH_MAX);
4379
4380 is_protocol = path_has_protocol(backing_file);
4381
4382 for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4383
4384 /* If either of the filename paths is actually a protocol, then
4385 * compare unmodified paths; otherwise make paths relative */
4386 if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4387 if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4388 retval = curr_bs->backing_hd;
4389 break;
4390 }
e8a6bb9c 4391 } else {
b1b1d783
JC
4392 /* If not an absolute filename path, make it relative to the current
4393 * image's filename path */
4394 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4395 backing_file);
4396
4397 /* We are going to compare absolute pathnames */
4398 if (!realpath(filename_tmp, filename_full)) {
4399 continue;
4400 }
4401
4402 /* We need to make sure the backing filename we are comparing against
4403 * is relative to the current image filename (or absolute) */
4404 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4405 curr_bs->backing_file);
4406
4407 if (!realpath(filename_tmp, backing_file_full)) {
4408 continue;
4409 }
4410
4411 if (strcmp(backing_file_full, filename_full) == 0) {
4412 retval = curr_bs->backing_hd;
4413 break;
4414 }
e8a6bb9c
MT
4415 }
4416 }
4417
b1b1d783
JC
4418 g_free(filename_full);
4419 g_free(backing_file_full);
4420 g_free(filename_tmp);
4421 return retval;
e8a6bb9c
MT
4422}
4423
f198fd1c
BC
4424int bdrv_get_backing_file_depth(BlockDriverState *bs)
4425{
4426 if (!bs->drv) {
4427 return 0;
4428 }
4429
4430 if (!bs->backing_hd) {
4431 return 0;
4432 }
4433
4434 return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4435}
4436
ea2384d3 4437/**************************************************************/
83f64091 4438/* async I/Os */
ea2384d3 4439
3b69e4b9 4440BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
f141eafe 4441 QEMUIOVector *qiov, int nb_sectors,
3b69e4b9 4442 BlockDriverCompletionFunc *cb, void *opaque)
83f64091 4443{
bbf0a440
SH
4444 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4445
d20d9b7c 4446 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
8c5873d6 4447 cb, opaque, false);
ea2384d3
FB
4448}
4449
f141eafe
AL
4450BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4451 QEMUIOVector *qiov, int nb_sectors,
4452 BlockDriverCompletionFunc *cb, void *opaque)
ea2384d3 4453{
bbf0a440
SH
4454 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4455
d20d9b7c 4456 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
8c5873d6 4457 cb, opaque, true);
83f64091
FB
4458}
4459
d5ef94d4
PB
4460BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4461 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4462 BlockDriverCompletionFunc *cb, void *opaque)
4463{
4464 trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4465
4466 return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4467 BDRV_REQ_ZERO_WRITE | flags,
4468 cb, opaque, true);
4469}
4470
40b4f539
KW
4471
4472typedef struct MultiwriteCB {
4473 int error;
4474 int num_requests;
4475 int num_callbacks;
4476 struct {
4477 BlockDriverCompletionFunc *cb;
4478 void *opaque;
4479 QEMUIOVector *free_qiov;
40b4f539
KW
4480 } callbacks[];
4481} MultiwriteCB;
4482
4483static void multiwrite_user_cb(MultiwriteCB *mcb)
4484{
4485 int i;
4486
4487 for (i = 0; i < mcb->num_callbacks; i++) {
4488 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
1e1ea48d
SH
4489 if (mcb->callbacks[i].free_qiov) {
4490 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4491 }
7267c094 4492 g_free(mcb->callbacks[i].free_qiov);
40b4f539
KW
4493 }
4494}
4495
4496static void multiwrite_cb(void *opaque, int ret)
4497{
4498 MultiwriteCB *mcb = opaque;
4499
6d519a5f
SH
4500 trace_multiwrite_cb(mcb, ret);
4501
cb6d3ca0 4502 if (ret < 0 && !mcb->error) {
40b4f539 4503 mcb->error = ret;
40b4f539
KW
4504 }
4505
4506 mcb->num_requests--;
4507 if (mcb->num_requests == 0) {
de189a1b 4508 multiwrite_user_cb(mcb);
7267c094 4509 g_free(mcb);
40b4f539
KW
4510 }
4511}
4512
4513static int multiwrite_req_compare(const void *a, const void *b)
4514{
77be4366
CH
4515 const BlockRequest *req1 = a, *req2 = b;
4516
4517 /*
4518 * Note that we can't simply subtract req2->sector from req1->sector
4519 * here as that could overflow the return value.
4520 */
4521 if (req1->sector > req2->sector) {
4522 return 1;
4523 } else if (req1->sector < req2->sector) {
4524 return -1;
4525 } else {
4526 return 0;
4527 }
40b4f539
KW
4528}
4529
4530/*
4531 * Takes a bunch of requests and tries to merge them. Returns the number of
4532 * requests that remain after merging.
4533 */
4534static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4535 int num_reqs, MultiwriteCB *mcb)
4536{
4537 int i, outidx;
4538
4539 // Sort requests by start sector
4540 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4541
4542 // Check if adjacent requests touch the same clusters. If so, combine them,
4543 // filling up gaps with zero sectors.
4544 outidx = 0;
4545 for (i = 1; i < num_reqs; i++) {
4546 int merge = 0;
4547 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4548
b6a127a1 4549 // Handle exactly sequential writes and overlapping writes.
40b4f539
KW
4550 if (reqs[i].sector <= oldreq_last) {
4551 merge = 1;
4552 }
4553
e2a305fb
CH
4554 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4555 merge = 0;
4556 }
4557
40b4f539
KW
4558 if (merge) {
4559 size_t size;
7267c094 4560 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
40b4f539
KW
4561 qemu_iovec_init(qiov,
4562 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4563
4564 // Add the first request to the merged one. If the requests are
4565 // overlapping, drop the last sectors of the first request.
4566 size = (reqs[i].sector - reqs[outidx].sector) << 9;
1b093c48 4567 qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
40b4f539 4568
b6a127a1
PB
4569 // We should need to add any zeros between the two requests
4570 assert (reqs[i].sector <= oldreq_last);
40b4f539
KW
4571
4572 // Add the second request
1b093c48 4573 qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
40b4f539 4574
391827eb
SH
4575 // Add tail of first request, if necessary
4576 if (qiov->size < reqs[outidx].qiov->size) {
4577 qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
4578 reqs[outidx].qiov->size - qiov->size);
4579 }
4580
cbf1dff2 4581 reqs[outidx].nb_sectors = qiov->size >> 9;
40b4f539
KW
4582 reqs[outidx].qiov = qiov;
4583
4584 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4585 } else {
4586 outidx++;
4587 reqs[outidx].sector = reqs[i].sector;
4588 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4589 reqs[outidx].qiov = reqs[i].qiov;
4590 }
4591 }
4592
4593 return outidx + 1;
4594}
4595
4596/*
4597 * Submit multiple AIO write requests at once.
4598 *
4599 * On success, the function returns 0 and all requests in the reqs array have
4600 * been submitted. In error case this function returns -1, and any of the
4601 * requests may or may not be submitted yet. In particular, this means that the
4602 * callback will be called for some of the requests, for others it won't. The
4603 * caller must check the error field of the BlockRequest to wait for the right
4604 * callbacks (if error != 0, no callback will be called).
4605 *
4606 * The implementation may modify the contents of the reqs array, e.g. to merge
4607 * requests. However, the fields opaque and error are left unmodified as they
4608 * are used to signal failure for a single request to the caller.
4609 */
4610int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4611{
40b4f539
KW
4612 MultiwriteCB *mcb;
4613 int i;
4614
301db7c2
RH
4615 /* don't submit writes if we don't have a medium */
4616 if (bs->drv == NULL) {
4617 for (i = 0; i < num_reqs; i++) {
4618 reqs[i].error = -ENOMEDIUM;
4619 }
4620 return -1;
4621 }
4622
40b4f539
KW
4623 if (num_reqs == 0) {
4624 return 0;
4625 }
4626
4627 // Create MultiwriteCB structure
7267c094 4628 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
40b4f539
KW
4629 mcb->num_requests = 0;
4630 mcb->num_callbacks = num_reqs;
4631
4632 for (i = 0; i < num_reqs; i++) {
4633 mcb->callbacks[i].cb = reqs[i].cb;
4634 mcb->callbacks[i].opaque = reqs[i].opaque;
4635 }
4636
4637 // Check for mergable requests
4638 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4639
6d519a5f
SH
4640 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4641
df9309fb
PB
4642 /* Run the aio requests. */
4643 mcb->num_requests = num_reqs;
40b4f539 4644 for (i = 0; i < num_reqs; i++) {
d20d9b7c
PB
4645 bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4646 reqs[i].nb_sectors, reqs[i].flags,
4647 multiwrite_cb, mcb,
4648 true);
40b4f539
KW
4649 }
4650
4651 return 0;
40b4f539
KW
4652}
4653
83f64091 4654void bdrv_aio_cancel(BlockDriverAIOCB *acb)
83f64091 4655{
ca5fd113
FZ
4656 qemu_aio_ref(acb);
4657 bdrv_aio_cancel_async(acb);
4658 while (acb->refcnt > 1) {
4659 if (acb->aiocb_info->get_aio_context) {
4660 aio_poll(acb->aiocb_info->get_aio_context(acb), true);
4661 } else if (acb->bs) {
4662 aio_poll(bdrv_get_aio_context(acb->bs), true);
4663 } else {
4664 abort();
02c50efe 4665 }
02c50efe 4666 }
8007429a 4667 qemu_aio_unref(acb);
02c50efe
FZ
4668}
4669
4670/* Async version of aio cancel. The caller is not blocked if the acb implements
4671 * cancel_async, otherwise we do nothing and let the request normally complete.
4672 * In either case the completion callback must be called. */
4673void bdrv_aio_cancel_async(BlockDriverAIOCB *acb)
4674{
4675 if (acb->aiocb_info->cancel_async) {
4676 acb->aiocb_info->cancel_async(acb);
4677 }
83f64091
FB
4678}
4679
4680/**************************************************************/
4681/* async block device emulation */
4682
c16b5a2c
CH
4683typedef struct BlockDriverAIOCBSync {
4684 BlockDriverAIOCB common;
4685 QEMUBH *bh;
4686 int ret;
4687 /* vector translation state */
4688 QEMUIOVector *qiov;
4689 uint8_t *bounce;
4690 int is_write;
4691} BlockDriverAIOCBSync;
4692
d7331bed 4693static const AIOCBInfo bdrv_em_aiocb_info = {
c16b5a2c 4694 .aiocb_size = sizeof(BlockDriverAIOCBSync),
c16b5a2c
CH
4695};
4696
ce1a14dc 4697static void bdrv_aio_bh_cb(void *opaque)
83f64091 4698{
ce1a14dc 4699 BlockDriverAIOCBSync *acb = opaque;
f141eafe 4700
857d4f46 4701 if (!acb->is_write && acb->ret >= 0) {
03396148 4702 qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
857d4f46 4703 }
ceb42de8 4704 qemu_vfree(acb->bounce);
ce1a14dc 4705 acb->common.cb(acb->common.opaque, acb->ret);
6a7ad299 4706 qemu_bh_delete(acb->bh);
36afc451 4707 acb->bh = NULL;
8007429a 4708 qemu_aio_unref(acb);
83f64091 4709}
beac80cd 4710
f141eafe
AL
4711static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4712 int64_t sector_num,
4713 QEMUIOVector *qiov,
4714 int nb_sectors,
4715 BlockDriverCompletionFunc *cb,
4716 void *opaque,
4717 int is_write)
4718
83f64091 4719{
ce1a14dc 4720 BlockDriverAIOCBSync *acb;
ce1a14dc 4721
d7331bed 4722 acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
f141eafe
AL
4723 acb->is_write = is_write;
4724 acb->qiov = qiov;
857d4f46 4725 acb->bounce = qemu_try_blockalign(bs, qiov->size);
2572b37a 4726 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
f141eafe 4727
857d4f46
KW
4728 if (acb->bounce == NULL) {
4729 acb->ret = -ENOMEM;
4730 } else if (is_write) {
d5e6b161 4731 qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
1ed20acf 4732 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
f141eafe 4733 } else {
1ed20acf 4734 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
f141eafe
AL
4735 }
4736
ce1a14dc 4737 qemu_bh_schedule(acb->bh);
f141eafe 4738
ce1a14dc 4739 return &acb->common;
beac80cd
FB
4740}
4741
f141eafe
AL
4742static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4743 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
ce1a14dc 4744 BlockDriverCompletionFunc *cb, void *opaque)
beac80cd 4745{
f141eafe
AL
4746 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4747}
83f64091 4748
f141eafe
AL
4749static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4750 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4751 BlockDriverCompletionFunc *cb, void *opaque)
4752{
4753 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
beac80cd 4754}
beac80cd 4755
68485420
KW
4756
4757typedef struct BlockDriverAIOCBCoroutine {
4758 BlockDriverAIOCB common;
4759 BlockRequest req;
4760 bool is_write;
d318aea9 4761 bool *done;
68485420
KW
4762 QEMUBH* bh;
4763} BlockDriverAIOCBCoroutine;
4764
d7331bed 4765static const AIOCBInfo bdrv_em_co_aiocb_info = {
68485420 4766 .aiocb_size = sizeof(BlockDriverAIOCBCoroutine),
68485420
KW
4767};
4768
35246a68 4769static void bdrv_co_em_bh(void *opaque)
68485420
KW
4770{
4771 BlockDriverAIOCBCoroutine *acb = opaque;
4772
4773 acb->common.cb(acb->common.opaque, acb->req.error);
d318aea9 4774
68485420 4775 qemu_bh_delete(acb->bh);
8007429a 4776 qemu_aio_unref(acb);
68485420
KW
4777}
4778
b2a61371
SH
4779/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4780static void coroutine_fn bdrv_co_do_rw(void *opaque)
4781{
4782 BlockDriverAIOCBCoroutine *acb = opaque;
4783 BlockDriverState *bs = acb->common.bs;
4784
4785 if (!acb->is_write) {
4786 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
d20d9b7c 4787 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
b2a61371
SH
4788 } else {
4789 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
d20d9b7c 4790 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
b2a61371
SH
4791 }
4792
2572b37a 4793 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
b2a61371
SH
4794 qemu_bh_schedule(acb->bh);
4795}
4796
68485420
KW
4797static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4798 int64_t sector_num,
4799 QEMUIOVector *qiov,
4800 int nb_sectors,
d20d9b7c 4801 BdrvRequestFlags flags,
68485420
KW
4802 BlockDriverCompletionFunc *cb,
4803 void *opaque,
8c5873d6 4804 bool is_write)
68485420
KW
4805{
4806 Coroutine *co;
4807 BlockDriverAIOCBCoroutine *acb;
4808
d7331bed 4809 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
68485420
KW
4810 acb->req.sector = sector_num;
4811 acb->req.nb_sectors = nb_sectors;
4812 acb->req.qiov = qiov;
d20d9b7c 4813 acb->req.flags = flags;
68485420
KW
4814 acb->is_write = is_write;
4815
8c5873d6 4816 co = qemu_coroutine_create(bdrv_co_do_rw);
68485420
KW
4817 qemu_coroutine_enter(co, acb);
4818
4819 return &acb->common;
4820}
4821
07f07615 4822static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
b2e12bc6 4823{
07f07615
PB
4824 BlockDriverAIOCBCoroutine *acb = opaque;
4825 BlockDriverState *bs = acb->common.bs;
b2e12bc6 4826
07f07615 4827 acb->req.error = bdrv_co_flush(bs);
2572b37a 4828 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
b2e12bc6 4829 qemu_bh_schedule(acb->bh);
b2e12bc6
CH
4830}
4831
07f07615 4832BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
016f5cf6
AG
4833 BlockDriverCompletionFunc *cb, void *opaque)
4834{
07f07615 4835 trace_bdrv_aio_flush(bs, opaque);
016f5cf6 4836
07f07615
PB
4837 Coroutine *co;
4838 BlockDriverAIOCBCoroutine *acb;
016f5cf6 4839
d7331bed 4840 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
d318aea9 4841
07f07615
PB
4842 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4843 qemu_coroutine_enter(co, acb);
016f5cf6 4844
016f5cf6
AG
4845 return &acb->common;
4846}
4847
4265d620
PB
4848static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4849{
4850 BlockDriverAIOCBCoroutine *acb = opaque;
4851 BlockDriverState *bs = acb->common.bs;
4852
4853 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
2572b37a 4854 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4265d620
PB
4855 qemu_bh_schedule(acb->bh);
4856}
4857
4858BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4859 int64_t sector_num, int nb_sectors,
4860 BlockDriverCompletionFunc *cb, void *opaque)
4861{
4862 Coroutine *co;
4863 BlockDriverAIOCBCoroutine *acb;
4864
4865 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4866
d7331bed 4867 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4265d620
PB
4868 acb->req.sector = sector_num;
4869 acb->req.nb_sectors = nb_sectors;
4870 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4871 qemu_coroutine_enter(co, acb);
4872
4873 return &acb->common;
4874}
4875
ea2384d3
FB
4876void bdrv_init(void)
4877{
5efa9d5a 4878 module_call_init(MODULE_INIT_BLOCK);
ea2384d3 4879}
ce1a14dc 4880
eb852011
MA
4881void bdrv_init_with_whitelist(void)
4882{
4883 use_bdrv_whitelist = 1;
4884 bdrv_init();
4885}
4886
d7331bed 4887void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
c16b5a2c 4888 BlockDriverCompletionFunc *cb, void *opaque)
ce1a14dc 4889{
ce1a14dc
PB
4890 BlockDriverAIOCB *acb;
4891
d7331bed
SH
4892 acb = g_slice_alloc(aiocb_info->aiocb_size);
4893 acb->aiocb_info = aiocb_info;
ce1a14dc
PB
4894 acb->bs = bs;
4895 acb->cb = cb;
4896 acb->opaque = opaque;
f197fe2b 4897 acb->refcnt = 1;
ce1a14dc
PB
4898 return acb;
4899}
4900
f197fe2b
FZ
4901void qemu_aio_ref(void *p)
4902{
4903 BlockDriverAIOCB *acb = p;
4904 acb->refcnt++;
4905}
4906
8007429a 4907void qemu_aio_unref(void *p)
ce1a14dc 4908{
d37c975f 4909 BlockDriverAIOCB *acb = p;
f197fe2b
FZ
4910 assert(acb->refcnt > 0);
4911 if (--acb->refcnt == 0) {
4912 g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4913 }
ce1a14dc 4914}
19cb3738 4915
f9f05dc5
KW
4916/**************************************************************/
4917/* Coroutine block device emulation */
4918
4919typedef struct CoroutineIOCompletion {
4920 Coroutine *coroutine;
4921 int ret;
4922} CoroutineIOCompletion;
4923
4924static void bdrv_co_io_em_complete(void *opaque, int ret)
4925{
4926 CoroutineIOCompletion *co = opaque;
4927
4928 co->ret = ret;
4929 qemu_coroutine_enter(co->coroutine, NULL);
4930}
4931
4932static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4933 int nb_sectors, QEMUIOVector *iov,
4934 bool is_write)
4935{
4936 CoroutineIOCompletion co = {
4937 .coroutine = qemu_coroutine_self(),
4938 };
4939 BlockDriverAIOCB *acb;
4940
4941 if (is_write) {
a652d160
SH
4942 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4943 bdrv_co_io_em_complete, &co);
f9f05dc5 4944 } else {
a652d160
SH
4945 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4946 bdrv_co_io_em_complete, &co);
f9f05dc5
KW
4947 }
4948
59370aaa 4949 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
f9f05dc5
KW
4950 if (!acb) {
4951 return -EIO;
4952 }
4953 qemu_coroutine_yield();
4954
4955 return co.ret;
4956}
4957
4958static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4959 int64_t sector_num, int nb_sectors,
4960 QEMUIOVector *iov)
4961{
4962 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4963}
4964
4965static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4966 int64_t sector_num, int nb_sectors,
4967 QEMUIOVector *iov)
4968{
4969 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4970}
4971
07f07615 4972static void coroutine_fn bdrv_flush_co_entry(void *opaque)
e7a8a783 4973{
07f07615
PB
4974 RwCo *rwco = opaque;
4975
4976 rwco->ret = bdrv_co_flush(rwco->bs);
4977}
4978
4979int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4980{
eb489bb1
KW
4981 int ret;
4982
29cdb251 4983 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
07f07615 4984 return 0;
eb489bb1
KW
4985 }
4986
ca716364 4987 /* Write back cached data to the OS even with cache=unsafe */
bf736fe3 4988 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
eb489bb1
KW
4989 if (bs->drv->bdrv_co_flush_to_os) {
4990 ret = bs->drv->bdrv_co_flush_to_os(bs);
4991 if (ret < 0) {
4992 return ret;
4993 }
4994 }
4995
ca716364
KW
4996 /* But don't actually force it to the disk with cache=unsafe */
4997 if (bs->open_flags & BDRV_O_NO_FLUSH) {
d4c82329 4998 goto flush_parent;
ca716364
KW
4999 }
5000
bf736fe3 5001 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
eb489bb1 5002 if (bs->drv->bdrv_co_flush_to_disk) {
29cdb251 5003 ret = bs->drv->bdrv_co_flush_to_disk(bs);
07f07615
PB
5004 } else if (bs->drv->bdrv_aio_flush) {
5005 BlockDriverAIOCB *acb;
5006 CoroutineIOCompletion co = {
5007 .coroutine = qemu_coroutine_self(),
5008 };
5009
5010 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
5011 if (acb == NULL) {
29cdb251 5012 ret = -EIO;
07f07615
PB
5013 } else {
5014 qemu_coroutine_yield();
29cdb251 5015 ret = co.ret;
07f07615 5016 }
07f07615
PB
5017 } else {
5018 /*
5019 * Some block drivers always operate in either writethrough or unsafe
5020 * mode and don't support bdrv_flush therefore. Usually qemu doesn't
5021 * know how the server works (because the behaviour is hardcoded or
5022 * depends on server-side configuration), so we can't ensure that
5023 * everything is safe on disk. Returning an error doesn't work because
5024 * that would break guests even if the server operates in writethrough
5025 * mode.
5026 *
5027 * Let's hope the user knows what he's doing.
5028 */
29cdb251 5029 ret = 0;
07f07615 5030 }
29cdb251
PB
5031 if (ret < 0) {
5032 return ret;
5033 }
5034
5035 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
5036 * in the case of cache=unsafe, so there are no useless flushes.
5037 */
d4c82329 5038flush_parent:
29cdb251 5039 return bdrv_co_flush(bs->file);
07f07615
PB
5040}
5041
5a8a30db 5042void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
0f15423c 5043{
5a8a30db
KW
5044 Error *local_err = NULL;
5045 int ret;
5046
3456a8d1
KW
5047 if (!bs->drv) {
5048 return;
5049 }
5050
5051 if (bs->drv->bdrv_invalidate_cache) {
5a8a30db 5052 bs->drv->bdrv_invalidate_cache(bs, &local_err);
3456a8d1 5053 } else if (bs->file) {
5a8a30db
KW
5054 bdrv_invalidate_cache(bs->file, &local_err);
5055 }
5056 if (local_err) {
5057 error_propagate(errp, local_err);
5058 return;
0f15423c 5059 }
3456a8d1 5060
5a8a30db
KW
5061 ret = refresh_total_sectors(bs, bs->total_sectors);
5062 if (ret < 0) {
5063 error_setg_errno(errp, -ret, "Could not refresh total sector count");
5064 return;
5065 }
0f15423c
AL
5066}
5067
5a8a30db 5068void bdrv_invalidate_cache_all(Error **errp)
0f15423c
AL
5069{
5070 BlockDriverState *bs;
5a8a30db 5071 Error *local_err = NULL;
0f15423c 5072
dc364f4c 5073 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
ed78cda3
SH
5074 AioContext *aio_context = bdrv_get_aio_context(bs);
5075
5076 aio_context_acquire(aio_context);
5a8a30db 5077 bdrv_invalidate_cache(bs, &local_err);
ed78cda3 5078 aio_context_release(aio_context);
5a8a30db
KW
5079 if (local_err) {
5080 error_propagate(errp, local_err);
5081 return;
5082 }
0f15423c
AL
5083 }
5084}
5085
07789269
BC
5086void bdrv_clear_incoming_migration_all(void)
5087{
5088 BlockDriverState *bs;
5089
dc364f4c 5090 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
ed78cda3
SH
5091 AioContext *aio_context = bdrv_get_aio_context(bs);
5092
5093 aio_context_acquire(aio_context);
07789269 5094 bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
ed78cda3 5095 aio_context_release(aio_context);
07789269
BC
5096 }
5097}
5098
07f07615
PB
5099int bdrv_flush(BlockDriverState *bs)
5100{
5101 Coroutine *co;
5102 RwCo rwco = {
5103 .bs = bs,
5104 .ret = NOT_DONE,
e7a8a783 5105 };
e7a8a783 5106
07f07615
PB
5107 if (qemu_in_coroutine()) {
5108 /* Fast-path if already in coroutine context */
5109 bdrv_flush_co_entry(&rwco);
5110 } else {
2572b37a
SH
5111 AioContext *aio_context = bdrv_get_aio_context(bs);
5112
07f07615
PB
5113 co = qemu_coroutine_create(bdrv_flush_co_entry);
5114 qemu_coroutine_enter(co, &rwco);
5115 while (rwco.ret == NOT_DONE) {
2572b37a 5116 aio_poll(aio_context, true);
07f07615 5117 }
e7a8a783 5118 }
07f07615
PB
5119
5120 return rwco.ret;
e7a8a783
KW
5121}
5122
775aa8b6
KW
5123typedef struct DiscardCo {
5124 BlockDriverState *bs;
5125 int64_t sector_num;
5126 int nb_sectors;
5127 int ret;
5128} DiscardCo;
4265d620
PB
5129static void coroutine_fn bdrv_discard_co_entry(void *opaque)
5130{
775aa8b6 5131 DiscardCo *rwco = opaque;
4265d620
PB
5132
5133 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
5134}
5135
6f14da52
PL
5136/* if no limit is specified in the BlockLimits use a default
5137 * of 32768 512-byte sectors (16 MiB) per request.
5138 */
5139#define MAX_DISCARD_DEFAULT 32768
5140
4265d620
PB
5141int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
5142 int nb_sectors)
5143{
d51e9fe5
PB
5144 int max_discard;
5145
4265d620
PB
5146 if (!bs->drv) {
5147 return -ENOMEDIUM;
5148 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
5149 return -EIO;
5150 } else if (bs->read_only) {
5151 return -EROFS;
df702c9b
PB
5152 }
5153
e4654d2d 5154 bdrv_reset_dirty(bs, sector_num, nb_sectors);
df702c9b 5155
9e8f1835
PB
5156 /* Do nothing if disabled. */
5157 if (!(bs->open_flags & BDRV_O_UNMAP)) {
5158 return 0;
5159 }
5160
d51e9fe5
PB
5161 if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
5162 return 0;
5163 }
6f14da52 5164
d51e9fe5
PB
5165 max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
5166 while (nb_sectors > 0) {
5167 int ret;
5168 int num = nb_sectors;
6f14da52 5169
d51e9fe5
PB
5170 /* align request */
5171 if (bs->bl.discard_alignment &&
5172 num >= bs->bl.discard_alignment &&
5173 sector_num % bs->bl.discard_alignment) {
5174 if (num > bs->bl.discard_alignment) {
5175 num = bs->bl.discard_alignment;
6f14da52 5176 }
d51e9fe5
PB
5177 num -= sector_num % bs->bl.discard_alignment;
5178 }
6f14da52 5179
d51e9fe5
PB
5180 /* limit request size */
5181 if (num > max_discard) {
5182 num = max_discard;
5183 }
6f14da52 5184
d51e9fe5 5185 if (bs->drv->bdrv_co_discard) {
6f14da52 5186 ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
d51e9fe5
PB
5187 } else {
5188 BlockDriverAIOCB *acb;
5189 CoroutineIOCompletion co = {
5190 .coroutine = qemu_coroutine_self(),
5191 };
5192
5193 acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
5194 bdrv_co_io_em_complete, &co);
5195 if (acb == NULL) {
5196 return -EIO;
5197 } else {
5198 qemu_coroutine_yield();
5199 ret = co.ret;
6f14da52 5200 }
6f14da52 5201 }
7ce21016 5202 if (ret && ret != -ENOTSUP) {
d51e9fe5 5203 return ret;
4265d620 5204 }
d51e9fe5
PB
5205
5206 sector_num += num;
5207 nb_sectors -= num;
4265d620 5208 }
d51e9fe5 5209 return 0;
4265d620
PB
5210}
5211
5212int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
5213{
5214 Coroutine *co;
775aa8b6 5215 DiscardCo rwco = {
4265d620
PB
5216 .bs = bs,
5217 .sector_num = sector_num,
5218 .nb_sectors = nb_sectors,
5219 .ret = NOT_DONE,
5220 };
5221
5222 if (qemu_in_coroutine()) {
5223 /* Fast-path if already in coroutine context */
5224 bdrv_discard_co_entry(&rwco);
5225 } else {
2572b37a
SH
5226 AioContext *aio_context = bdrv_get_aio_context(bs);
5227
4265d620
PB
5228 co = qemu_coroutine_create(bdrv_discard_co_entry);
5229 qemu_coroutine_enter(co, &rwco);
5230 while (rwco.ret == NOT_DONE) {
2572b37a 5231 aio_poll(aio_context, true);
4265d620
PB
5232 }
5233 }
5234
5235 return rwco.ret;
5236}
5237
19cb3738
FB
5238/**************************************************************/
5239/* removable device support */
5240
5241/**
5242 * Return TRUE if the media is present
5243 */
5244int bdrv_is_inserted(BlockDriverState *bs)
5245{
5246 BlockDriver *drv = bs->drv;
a1aff5bf 5247
19cb3738
FB
5248 if (!drv)
5249 return 0;
5250 if (!drv->bdrv_is_inserted)
a1aff5bf
MA
5251 return 1;
5252 return drv->bdrv_is_inserted(bs);
19cb3738
FB
5253}
5254
5255/**
8e49ca46
MA
5256 * Return whether the media changed since the last call to this
5257 * function, or -ENOTSUP if we don't know. Most drivers don't know.
19cb3738
FB
5258 */
5259int bdrv_media_changed(BlockDriverState *bs)
5260{
5261 BlockDriver *drv = bs->drv;
19cb3738 5262
8e49ca46
MA
5263 if (drv && drv->bdrv_media_changed) {
5264 return drv->bdrv_media_changed(bs);
5265 }
5266 return -ENOTSUP;
19cb3738
FB
5267}
5268
5269/**
5270 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
5271 */
f36f3949 5272void bdrv_eject(BlockDriverState *bs, bool eject_flag)
19cb3738
FB
5273{
5274 BlockDriver *drv = bs->drv;
19cb3738 5275
822e1cd1
MA
5276 if (drv && drv->bdrv_eject) {
5277 drv->bdrv_eject(bs, eject_flag);
19cb3738 5278 }
6f382ed2
LC
5279
5280 if (bs->device_name[0] != '\0') {
a5ee7bd4
WX
5281 qapi_event_send_device_tray_moved(bdrv_get_device_name(bs),
5282 eject_flag, &error_abort);
6f382ed2 5283 }
19cb3738
FB
5284}
5285
19cb3738
FB
5286/**
5287 * Lock or unlock the media (if it is locked, the user won't be able
5288 * to eject it manually).
5289 */
025e849a 5290void bdrv_lock_medium(BlockDriverState *bs, bool locked)
19cb3738
FB
5291{
5292 BlockDriver *drv = bs->drv;
5293
025e849a 5294 trace_bdrv_lock_medium(bs, locked);
b8c6d095 5295
025e849a
MA
5296 if (drv && drv->bdrv_lock_medium) {
5297 drv->bdrv_lock_medium(bs, locked);
19cb3738
FB
5298 }
5299}
985a03b0
TS
5300
5301/* needed for generic scsi interface */
5302
5303int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5304{
5305 BlockDriver *drv = bs->drv;
5306
5307 if (drv && drv->bdrv_ioctl)
5308 return drv->bdrv_ioctl(bs, req, buf);
5309 return -ENOTSUP;
5310}
7d780669 5311
221f715d
AL
5312BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5313 unsigned long int req, void *buf,
5314 BlockDriverCompletionFunc *cb, void *opaque)
7d780669 5315{
221f715d 5316 BlockDriver *drv = bs->drv;
7d780669 5317
221f715d
AL
5318 if (drv && drv->bdrv_aio_ioctl)
5319 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5320 return NULL;
7d780669 5321}
e268ca52 5322
1b7fd729 5323void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
7b6f9300 5324{
1b7fd729 5325 bs->guest_block_size = align;
7b6f9300 5326}
7cd1e32a 5327
e268ca52
AL
5328void *qemu_blockalign(BlockDriverState *bs, size_t size)
5329{
339064d5 5330 return qemu_memalign(bdrv_opt_mem_align(bs), size);
e268ca52 5331}
7cd1e32a 5332
7d2a35cc
KW
5333void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
5334{
5335 size_t align = bdrv_opt_mem_align(bs);
5336
5337 /* Ensure that NULL is never returned on success */
5338 assert(align > 0);
5339 if (size == 0) {
5340 size = align;
5341 }
5342
5343 return qemu_try_memalign(align, size);
5344}
5345
c53b1c51
SH
5346/*
5347 * Check if all memory in this vector is sector aligned.
5348 */
5349bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5350{
5351 int i;
339064d5 5352 size_t alignment = bdrv_opt_mem_align(bs);
c53b1c51
SH
5353
5354 for (i = 0; i < qiov->niov; i++) {
339064d5 5355 if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
c53b1c51 5356 return false;
1ff735bd 5357 }
339064d5 5358 if (qiov->iov[i].iov_len % alignment) {
1ff735bd 5359 return false;
c53b1c51
SH
5360 }
5361 }
5362
5363 return true;
5364}
5365
b8afb520
FZ
5366BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
5367 Error **errp)
7cd1e32a
LS
5368{
5369 int64_t bitmap_size;
e4654d2d 5370 BdrvDirtyBitmap *bitmap;
a55eb92c 5371
50717e94
PB
5372 assert((granularity & (granularity - 1)) == 0);
5373
e4654d2d
FZ
5374 granularity >>= BDRV_SECTOR_BITS;
5375 assert(granularity);
57322b78 5376 bitmap_size = bdrv_nb_sectors(bs);
b8afb520
FZ
5377 if (bitmap_size < 0) {
5378 error_setg_errno(errp, -bitmap_size, "could not get length of device");
5379 errno = -bitmap_size;
5380 return NULL;
5381 }
5839e53b 5382 bitmap = g_new0(BdrvDirtyBitmap, 1);
e4654d2d
FZ
5383 bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
5384 QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
5385 return bitmap;
5386}
5387
5388void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5389{
5390 BdrvDirtyBitmap *bm, *next;
5391 QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5392 if (bm == bitmap) {
5393 QLIST_REMOVE(bitmap, list);
5394 hbitmap_free(bitmap->bitmap);
5395 g_free(bitmap);
5396 return;
a55eb92c 5397 }
7cd1e32a
LS
5398 }
5399}
5400
21b56835
FZ
5401BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5402{
5403 BdrvDirtyBitmap *bm;
5404 BlockDirtyInfoList *list = NULL;
5405 BlockDirtyInfoList **plist = &list;
5406
5407 QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5839e53b
MA
5408 BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1);
5409 BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1);
21b56835
FZ
5410 info->count = bdrv_get_dirty_count(bs, bm);
5411 info->granularity =
5412 ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5413 entry->value = info;
5414 *plist = entry;
5415 plist = &entry->next;
5416 }
5417
5418 return list;
5419}
5420
e4654d2d 5421int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
7cd1e32a 5422{
e4654d2d
FZ
5423 if (bitmap) {
5424 return hbitmap_get(bitmap->bitmap, sector);
7cd1e32a
LS
5425 } else {
5426 return 0;
5427 }
5428}
5429
e4654d2d
FZ
5430void bdrv_dirty_iter_init(BlockDriverState *bs,
5431 BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
1755da16 5432{
e4654d2d 5433 hbitmap_iter_init(hbi, bitmap->bitmap, 0);
1755da16
PB
5434}
5435
5436void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5437 int nr_sectors)
5438{
e4654d2d
FZ
5439 BdrvDirtyBitmap *bitmap;
5440 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5441 hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5442 }
1755da16
PB
5443}
5444
e4654d2d 5445void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
7cd1e32a 5446{
e4654d2d
FZ
5447 BdrvDirtyBitmap *bitmap;
5448 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5449 hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5450 }
7cd1e32a 5451}
aaa0eb75 5452
e4654d2d 5453int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
aaa0eb75 5454{
e4654d2d 5455 return hbitmap_count(bitmap->bitmap);
aaa0eb75 5456}
f88e1a42 5457
9fcb0251
FZ
5458/* Get a reference to bs */
5459void bdrv_ref(BlockDriverState *bs)
5460{
5461 bs->refcnt++;
5462}
5463
5464/* Release a previously grabbed reference to bs.
5465 * If after releasing, reference count is zero, the BlockDriverState is
5466 * deleted. */
5467void bdrv_unref(BlockDriverState *bs)
5468{
9a4d5ca6
JC
5469 if (!bs) {
5470 return;
5471 }
9fcb0251
FZ
5472 assert(bs->refcnt > 0);
5473 if (--bs->refcnt == 0) {
5474 bdrv_delete(bs);
5475 }
5476}
5477
fbe40ff7
FZ
5478struct BdrvOpBlocker {
5479 Error *reason;
5480 QLIST_ENTRY(BdrvOpBlocker) list;
5481};
5482
5483bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
5484{
5485 BdrvOpBlocker *blocker;
5486 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5487 if (!QLIST_EMPTY(&bs->op_blockers[op])) {
5488 blocker = QLIST_FIRST(&bs->op_blockers[op]);
5489 if (errp) {
5490 error_setg(errp, "Device '%s' is busy: %s",
5491 bs->device_name, error_get_pretty(blocker->reason));
5492 }
5493 return true;
5494 }
5495 return false;
5496}
5497
5498void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
5499{
5500 BdrvOpBlocker *blocker;
5501 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5502
5839e53b 5503 blocker = g_new0(BdrvOpBlocker, 1);
fbe40ff7
FZ
5504 blocker->reason = reason;
5505 QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
5506}
5507
5508void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
5509{
5510 BdrvOpBlocker *blocker, *next;
5511 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5512 QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
5513 if (blocker->reason == reason) {
5514 QLIST_REMOVE(blocker, list);
5515 g_free(blocker);
5516 }
5517 }
5518}
5519
5520void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
5521{
5522 int i;
5523 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5524 bdrv_op_block(bs, i, reason);
5525 }
5526}
5527
5528void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
5529{
5530 int i;
5531 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5532 bdrv_op_unblock(bs, i, reason);
5533 }
5534}
5535
5536bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
5537{
5538 int i;
5539
5540 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5541 if (!QLIST_EMPTY(&bs->op_blockers[i])) {
5542 return false;
5543 }
5544 }
5545 return true;
5546}
5547
28a7282a
LC
5548void bdrv_iostatus_enable(BlockDriverState *bs)
5549{
d6bf279e 5550 bs->iostatus_enabled = true;
58e21ef5 5551 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
28a7282a
LC
5552}
5553
5554/* The I/O status is only enabled if the drive explicitly
5555 * enables it _and_ the VM is configured to stop on errors */
5556bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5557{
d6bf279e 5558 return (bs->iostatus_enabled &&
92aa5c6d
PB
5559 (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5560 bs->on_write_error == BLOCKDEV_ON_ERROR_STOP ||
5561 bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
28a7282a
LC
5562}
5563
5564void bdrv_iostatus_disable(BlockDriverState *bs)
5565{
d6bf279e 5566 bs->iostatus_enabled = false;
28a7282a
LC
5567}
5568
5569void bdrv_iostatus_reset(BlockDriverState *bs)
5570{
5571 if (bdrv_iostatus_is_enabled(bs)) {
58e21ef5 5572 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
3bd293c3
PB
5573 if (bs->job) {
5574 block_job_iostatus_reset(bs->job);
5575 }
28a7282a
LC
5576 }
5577}
5578
28a7282a
LC
5579void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5580{
3e1caa5f
PB
5581 assert(bdrv_iostatus_is_enabled(bs));
5582 if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
58e21ef5
LC
5583 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5584 BLOCK_DEVICE_IO_STATUS_FAILED;
28a7282a
LC
5585 }
5586}
5587
d92ada22
LC
5588void bdrv_img_create(const char *filename, const char *fmt,
5589 const char *base_filename, const char *base_fmt,
f382d43a
MR
5590 char *options, uint64_t img_size, int flags,
5591 Error **errp, bool quiet)
f88e1a42 5592{
83d0521a
CL
5593 QemuOptsList *create_opts = NULL;
5594 QemuOpts *opts = NULL;
5595 const char *backing_fmt, *backing_file;
5596 int64_t size;
f88e1a42 5597 BlockDriver *drv, *proto_drv;
96df67d1 5598 BlockDriver *backing_drv = NULL;
cc84d90f 5599 Error *local_err = NULL;
f88e1a42
JS
5600 int ret = 0;
5601
5602 /* Find driver and parse its options */
5603 drv = bdrv_find_format(fmt);
5604 if (!drv) {
71c79813 5605 error_setg(errp, "Unknown file format '%s'", fmt);
d92ada22 5606 return;
f88e1a42
JS
5607 }
5608
98289620 5609 proto_drv = bdrv_find_protocol(filename, true);
f88e1a42 5610 if (!proto_drv) {
71c79813 5611 error_setg(errp, "Unknown protocol '%s'", filename);
d92ada22 5612 return;
f88e1a42
JS
5613 }
5614
c282e1fd
CL
5615 create_opts = qemu_opts_append(create_opts, drv->create_opts);
5616 create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);
f88e1a42
JS
5617
5618 /* Create parameter list with default values */
83d0521a
CL
5619 opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
5620 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size);
f88e1a42
JS
5621
5622 /* Parse -o options */
5623 if (options) {
83d0521a
CL
5624 if (qemu_opts_do_parse(opts, options, NULL) != 0) {
5625 error_setg(errp, "Invalid options for file format '%s'", fmt);
f88e1a42
JS
5626 goto out;
5627 }
5628 }
5629
5630 if (base_filename) {
83d0521a 5631 if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename)) {
71c79813
LC
5632 error_setg(errp, "Backing file not supported for file format '%s'",
5633 fmt);
f88e1a42
JS
5634 goto out;
5635 }
5636 }
5637
5638 if (base_fmt) {
83d0521a 5639 if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt)) {
71c79813
LC
5640 error_setg(errp, "Backing file format not supported for file "
5641 "format '%s'", fmt);
f88e1a42
JS
5642 goto out;
5643 }
5644 }
5645
83d0521a
CL
5646 backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
5647 if (backing_file) {
5648 if (!strcmp(filename, backing_file)) {
71c79813
LC
5649 error_setg(errp, "Error: Trying to create an image with the "
5650 "same filename as the backing file");
792da93a
JS
5651 goto out;
5652 }
5653 }
5654
83d0521a
CL
5655 backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
5656 if (backing_fmt) {
5657 backing_drv = bdrv_find_format(backing_fmt);
96df67d1 5658 if (!backing_drv) {
71c79813 5659 error_setg(errp, "Unknown backing file format '%s'",
83d0521a 5660 backing_fmt);
f88e1a42
JS
5661 goto out;
5662 }
5663 }
5664
5665 // The size for the image must always be specified, with one exception:
5666 // If we are using a backing file, we can obtain the size from there
83d0521a
CL
5667 size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
5668 if (size == -1) {
5669 if (backing_file) {
66f6b814 5670 BlockDriverState *bs;
52bf1e72 5671 int64_t size;
63090dac
PB
5672 int back_flags;
5673
5674 /* backing files always opened read-only */
5675 back_flags =
5676 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
f88e1a42 5677
f67503e5 5678 bs = NULL;
83d0521a 5679 ret = bdrv_open(&bs, backing_file, NULL, NULL, back_flags,
cc84d90f 5680 backing_drv, &local_err);
f88e1a42 5681 if (ret < 0) {
cc84d90f 5682 error_setg_errno(errp, -ret, "Could not open '%s': %s",
83d0521a 5683 backing_file,
cc84d90f
HR
5684 error_get_pretty(local_err));
5685 error_free(local_err);
5686 local_err = NULL;
f88e1a42
JS
5687 goto out;
5688 }
52bf1e72
MA
5689 size = bdrv_getlength(bs);
5690 if (size < 0) {
5691 error_setg_errno(errp, -size, "Could not get size of '%s'",
5692 backing_file);
5693 bdrv_unref(bs);
5694 goto out;
5695 }
f88e1a42 5696
83d0521a 5697 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size);
66f6b814
HR
5698
5699 bdrv_unref(bs);
f88e1a42 5700 } else {
71c79813 5701 error_setg(errp, "Image creation needs a size parameter");
f88e1a42
JS
5702 goto out;
5703 }
5704 }
5705
f382d43a
MR
5706 if (!quiet) {
5707 printf("Formatting '%s', fmt=%s ", filename, fmt);
83d0521a 5708 qemu_opts_print(opts);
f382d43a
MR
5709 puts("");
5710 }
83d0521a 5711
c282e1fd 5712 ret = bdrv_create(drv, filename, opts, &local_err);
83d0521a 5713
cc84d90f
HR
5714 if (ret == -EFBIG) {
5715 /* This is generally a better message than whatever the driver would
5716 * deliver (especially because of the cluster_size_hint), since that
5717 * is most probably not much different from "image too large". */
5718 const char *cluster_size_hint = "";
83d0521a 5719 if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
cc84d90f 5720 cluster_size_hint = " (try using a larger cluster size)";
f88e1a42 5721 }
cc84d90f
HR
5722 error_setg(errp, "The image size is too large for file format '%s'"
5723 "%s", fmt, cluster_size_hint);
5724 error_free(local_err);
5725 local_err = NULL;
f88e1a42
JS
5726 }
5727
5728out:
83d0521a
CL
5729 qemu_opts_del(opts);
5730 qemu_opts_free(create_opts);
84d18f06 5731 if (local_err) {
cc84d90f
HR
5732 error_propagate(errp, local_err);
5733 }
f88e1a42 5734}
85d126f3
SH
5735
5736AioContext *bdrv_get_aio_context(BlockDriverState *bs)
5737{
dcd04228
SH
5738 return bs->aio_context;
5739}
5740
5741void bdrv_detach_aio_context(BlockDriverState *bs)
5742{
33384421
HR
5743 BdrvAioNotifier *baf;
5744
dcd04228
SH
5745 if (!bs->drv) {
5746 return;
5747 }
5748
33384421
HR
5749 QLIST_FOREACH(baf, &bs->aio_notifiers, list) {
5750 baf->detach_aio_context(baf->opaque);
5751 }
5752
13af91eb
SH
5753 if (bs->io_limits_enabled) {
5754 throttle_detach_aio_context(&bs->throttle_state);
5755 }
dcd04228
SH
5756 if (bs->drv->bdrv_detach_aio_context) {
5757 bs->drv->bdrv_detach_aio_context(bs);
5758 }
5759 if (bs->file) {
5760 bdrv_detach_aio_context(bs->file);
5761 }
5762 if (bs->backing_hd) {
5763 bdrv_detach_aio_context(bs->backing_hd);
5764 }
5765
5766 bs->aio_context = NULL;
5767}
5768
5769void bdrv_attach_aio_context(BlockDriverState *bs,
5770 AioContext *new_context)
5771{
33384421
HR
5772 BdrvAioNotifier *ban;
5773
dcd04228
SH
5774 if (!bs->drv) {
5775 return;
5776 }
5777
5778 bs->aio_context = new_context;
5779
5780 if (bs->backing_hd) {
5781 bdrv_attach_aio_context(bs->backing_hd, new_context);
5782 }
5783 if (bs->file) {
5784 bdrv_attach_aio_context(bs->file, new_context);
5785 }
5786 if (bs->drv->bdrv_attach_aio_context) {
5787 bs->drv->bdrv_attach_aio_context(bs, new_context);
5788 }
13af91eb
SH
5789 if (bs->io_limits_enabled) {
5790 throttle_attach_aio_context(&bs->throttle_state, new_context);
5791 }
33384421
HR
5792
5793 QLIST_FOREACH(ban, &bs->aio_notifiers, list) {
5794 ban->attached_aio_context(new_context, ban->opaque);
5795 }
dcd04228
SH
5796}
5797
5798void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
5799{
5800 bdrv_drain_all(); /* ensure there are no in-flight requests */
5801
5802 bdrv_detach_aio_context(bs);
5803
5804 /* This function executes in the old AioContext so acquire the new one in
5805 * case it runs in a different thread.
5806 */
5807 aio_context_acquire(new_context);
5808 bdrv_attach_aio_context(bs, new_context);
5809 aio_context_release(new_context);
85d126f3 5810}
d616b224 5811
33384421
HR
5812void bdrv_add_aio_context_notifier(BlockDriverState *bs,
5813 void (*attached_aio_context)(AioContext *new_context, void *opaque),
5814 void (*detach_aio_context)(void *opaque), void *opaque)
5815{
5816 BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
5817 *ban = (BdrvAioNotifier){
5818 .attached_aio_context = attached_aio_context,
5819 .detach_aio_context = detach_aio_context,
5820 .opaque = opaque
5821 };
5822
5823 QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
5824}
5825
5826void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
5827 void (*attached_aio_context)(AioContext *,
5828 void *),
5829 void (*detach_aio_context)(void *),
5830 void *opaque)
5831{
5832 BdrvAioNotifier *ban, *ban_next;
5833
5834 QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
5835 if (ban->attached_aio_context == attached_aio_context &&
5836 ban->detach_aio_context == detach_aio_context &&
5837 ban->opaque == opaque)
5838 {
5839 QLIST_REMOVE(ban, list);
5840 g_free(ban);
5841
5842 return;
5843 }
5844 }
5845
5846 abort();
5847}
5848
d616b224
SH
5849void bdrv_add_before_write_notifier(BlockDriverState *bs,
5850 NotifierWithReturn *notifier)
5851{
5852 notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
5853}
6f176b48 5854
c282e1fd 5855int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts)
6f176b48 5856{
c282e1fd 5857 if (!bs->drv->bdrv_amend_options) {
6f176b48
HR
5858 return -ENOTSUP;
5859 }
c282e1fd 5860 return bs->drv->bdrv_amend_options(bs, opts);
6f176b48 5861}
f6186f49 5862
b5042a36
BC
5863/* This function will be called by the bdrv_recurse_is_first_non_filter method
5864 * of block filter and by bdrv_is_first_non_filter.
5865 * It is used to test if the given bs is the candidate or recurse more in the
5866 * node graph.
212a5a8f 5867 */
b5042a36 5868bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
212a5a8f 5869 BlockDriverState *candidate)
f6186f49 5870{
b5042a36
BC
5871 /* return false if basic checks fails */
5872 if (!bs || !bs->drv) {
212a5a8f 5873 return false;
f6186f49
BC
5874 }
5875
b5042a36
BC
5876 /* the code reached a non block filter driver -> check if the bs is
5877 * the same as the candidate. It's the recursion termination condition.
5878 */
5879 if (!bs->drv->is_filter) {
5880 return bs == candidate;
212a5a8f 5881 }
b5042a36 5882 /* Down this path the driver is a block filter driver */
212a5a8f 5883
b5042a36
BC
5884 /* If the block filter recursion method is defined use it to recurse down
5885 * the node graph.
5886 */
5887 if (bs->drv->bdrv_recurse_is_first_non_filter) {
212a5a8f 5888 return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
f6186f49
BC
5889 }
5890
b5042a36
BC
5891 /* the driver is a block filter but don't allow to recurse -> return false
5892 */
5893 return false;
f6186f49
BC
5894}
5895
212a5a8f
BC
5896/* This function checks if the candidate is the first non filter bs down it's
5897 * bs chain. Since we don't have pointers to parents it explore all bs chains
5898 * from the top. Some filters can choose not to pass down the recursion.
5899 */
5900bool bdrv_is_first_non_filter(BlockDriverState *candidate)
f6186f49 5901{
212a5a8f
BC
5902 BlockDriverState *bs;
5903
5904 /* walk down the bs forest recursively */
5905 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5906 bool perm;
5907
b5042a36 5908 /* try to recurse in this top level bs */
e6dc8a1f 5909 perm = bdrv_recurse_is_first_non_filter(bs, candidate);
212a5a8f
BC
5910
5911 /* candidate is the first non filter */
5912 if (perm) {
5913 return true;
5914 }
5915 }
5916
5917 return false;
f6186f49 5918}
09158f00
BC
5919
5920BlockDriverState *check_to_replace_node(const char *node_name, Error **errp)
5921{
5922 BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
5923 if (!to_replace_bs) {
5924 error_setg(errp, "Node name '%s' not found", node_name);
5925 return NULL;
5926 }
5927
5928 if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
5929 return NULL;
5930 }
5931
5932 /* We don't want arbitrary node of the BDS chain to be replaced only the top
5933 * most non filter in order to prevent data corruption.
5934 * Another benefit is that this tests exclude backing files which are
5935 * blocked by the backing blockers.
5936 */
5937 if (!bdrv_is_first_non_filter(to_replace_bs)) {
5938 error_setg(errp, "Only top most non filter can be replaced");
5939 return NULL;
5940 }
5941
5942 return to_replace_bs;
5943}
448ad91d
ML
5944
5945void bdrv_io_plug(BlockDriverState *bs)
5946{
5947 BlockDriver *drv = bs->drv;
5948 if (drv && drv->bdrv_io_plug) {
5949 drv->bdrv_io_plug(bs);
5950 } else if (bs->file) {
5951 bdrv_io_plug(bs->file);
5952 }
5953}
5954
5955void bdrv_io_unplug(BlockDriverState *bs)
5956{
5957 BlockDriver *drv = bs->drv;
5958 if (drv && drv->bdrv_io_unplug) {
5959 drv->bdrv_io_unplug(bs);
5960 } else if (bs->file) {
5961 bdrv_io_unplug(bs->file);
5962 }
5963}
5964
5965void bdrv_flush_io_queue(BlockDriverState *bs)
5966{
5967 BlockDriver *drv = bs->drv;
5968 if (drv && drv->bdrv_flush_io_queue) {
5969 drv->bdrv_flush_io_queue(bs);
5970 } else if (bs->file) {
5971 bdrv_flush_io_queue(bs->file);
5972 }
5973}
91af7014
HR
5974
5975static bool append_open_options(QDict *d, BlockDriverState *bs)
5976{
5977 const QDictEntry *entry;
5978 bool found_any = false;
5979
5980 for (entry = qdict_first(bs->options); entry;
5981 entry = qdict_next(bs->options, entry))
5982 {
5983 /* Only take options for this level and exclude all non-driver-specific
5984 * options */
5985 if (!strchr(qdict_entry_key(entry), '.') &&
5986 strcmp(qdict_entry_key(entry), "node-name"))
5987 {
5988 qobject_incref(qdict_entry_value(entry));
5989 qdict_put_obj(d, qdict_entry_key(entry), qdict_entry_value(entry));
5990 found_any = true;
5991 }
5992 }
5993
5994 return found_any;
5995}
5996
5997/* Updates the following BDS fields:
5998 * - exact_filename: A filename which may be used for opening a block device
5999 * which (mostly) equals the given BDS (even without any
6000 * other options; so reading and writing must return the same
6001 * results, but caching etc. may be different)
6002 * - full_open_options: Options which, when given when opening a block device
6003 * (without a filename), result in a BDS (mostly)
6004 * equalling the given one
6005 * - filename: If exact_filename is set, it is copied here. Otherwise,
6006 * full_open_options is converted to a JSON object, prefixed with
6007 * "json:" (for use through the JSON pseudo protocol) and put here.
6008 */
6009void bdrv_refresh_filename(BlockDriverState *bs)
6010{
6011 BlockDriver *drv = bs->drv;
6012 QDict *opts;
6013
6014 if (!drv) {
6015 return;
6016 }
6017
6018 /* This BDS's file name will most probably depend on its file's name, so
6019 * refresh that first */
6020 if (bs->file) {
6021 bdrv_refresh_filename(bs->file);
6022 }
6023
6024 if (drv->bdrv_refresh_filename) {
6025 /* Obsolete information is of no use here, so drop the old file name
6026 * information before refreshing it */
6027 bs->exact_filename[0] = '\0';
6028 if (bs->full_open_options) {
6029 QDECREF(bs->full_open_options);
6030 bs->full_open_options = NULL;
6031 }
6032
6033 drv->bdrv_refresh_filename(bs);
6034 } else if (bs->file) {
6035 /* Try to reconstruct valid information from the underlying file */
6036 bool has_open_options;
6037
6038 bs->exact_filename[0] = '\0';
6039 if (bs->full_open_options) {
6040 QDECREF(bs->full_open_options);
6041 bs->full_open_options = NULL;
6042 }
6043
6044 opts = qdict_new();
6045 has_open_options = append_open_options(opts, bs);
6046
6047 /* If no specific options have been given for this BDS, the filename of
6048 * the underlying file should suffice for this one as well */
6049 if (bs->file->exact_filename[0] && !has_open_options) {
6050 strcpy(bs->exact_filename, bs->file->exact_filename);
6051 }
6052 /* Reconstructing the full options QDict is simple for most format block
6053 * drivers, as long as the full options are known for the underlying
6054 * file BDS. The full options QDict of that file BDS should somehow
6055 * contain a representation of the filename, therefore the following
6056 * suffices without querying the (exact_)filename of this BDS. */
6057 if (bs->file->full_open_options) {
6058 qdict_put_obj(opts, "driver",
6059 QOBJECT(qstring_from_str(drv->format_name)));
6060 QINCREF(bs->file->full_open_options);
6061 qdict_put_obj(opts, "file", QOBJECT(bs->file->full_open_options));
6062
6063 bs->full_open_options = opts;
6064 } else {
6065 QDECREF(opts);
6066 }
6067 } else if (!bs->full_open_options && qdict_size(bs->options)) {
6068 /* There is no underlying file BDS (at least referenced by BDS.file),
6069 * so the full options QDict should be equal to the options given
6070 * specifically for this block device when it was opened (plus the
6071 * driver specification).
6072 * Because those options don't change, there is no need to update
6073 * full_open_options when it's already set. */
6074
6075 opts = qdict_new();
6076 append_open_options(opts, bs);
6077 qdict_put_obj(opts, "driver",
6078 QOBJECT(qstring_from_str(drv->format_name)));
6079
6080 if (bs->exact_filename[0]) {
6081 /* This may not work for all block protocol drivers (some may
6082 * require this filename to be parsed), but we have to find some
6083 * default solution here, so just include it. If some block driver
6084 * does not support pure options without any filename at all or
6085 * needs some special format of the options QDict, it needs to
6086 * implement the driver-specific bdrv_refresh_filename() function.
6087 */
6088 qdict_put_obj(opts, "filename",
6089 QOBJECT(qstring_from_str(bs->exact_filename)));
6090 }
6091
6092 bs->full_open_options = opts;
6093 }
6094
6095 if (bs->exact_filename[0]) {
6096 pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
6097 } else if (bs->full_open_options) {
6098 QString *json = qobject_to_json(QOBJECT(bs->full_open_options));
6099 snprintf(bs->filename, sizeof(bs->filename), "json:%s",
6100 qstring_get_str(json));
6101 QDECREF(json);
6102 }
6103}
5366d0c8
BC
6104
6105/* This accessor function purpose is to allow the device models to access the
6106 * BlockAcctStats structure embedded inside a BlockDriverState without being
6107 * aware of the BlockDriverState structure layout.
6108 * It will go away when the BlockAcctStats structure will be moved inside
6109 * the device models.
6110 */
6111BlockAcctStats *bdrv_get_stats(BlockDriverState *bs)
6112{
6113 return &bs->stats;
6114}