]> git.proxmox.com Git - qemu.git/blame - block.c
qed: track dirty flag status
[qemu.git] / block.c
CommitLineData
fc01f7e7
FB
1/*
2 * QEMU System Emulator block driver
5fafdf24 3 *
fc01f7e7 4 * Copyright (c) 2003 Fabrice Bellard
5fafdf24 5 *
fc01f7e7
FB
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
3990d09a 24#include "config-host.h"
faf07963 25#include "qemu-common.h"
6d519a5f 26#include "trace.h"
376253ec 27#include "monitor.h"
ea2384d3 28#include "block_int.h"
5efa9d5a 29#include "module.h"
f795e743 30#include "qjson.h"
68485420 31#include "qemu-coroutine.h"
b2023818 32#include "qmp-commands.h"
0563e191 33#include "qemu-timer.h"
fc01f7e7 34
71e72a19 35#ifdef CONFIG_BSD
7674e7bf
FB
36#include <sys/types.h>
37#include <sys/stat.h>
38#include <sys/ioctl.h>
72cf2d4f 39#include <sys/queue.h>
c5e97233 40#ifndef __DragonFly__
7674e7bf
FB
41#include <sys/disk.h>
42#endif
c5e97233 43#endif
7674e7bf 44
49dc768d
AL
45#ifdef _WIN32
46#include <windows.h>
47#endif
48
1c9805a3
SH
49#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
50
470c0504
SH
51typedef enum {
52 BDRV_REQ_COPY_ON_READ = 0x1,
f08f2dda 53 BDRV_REQ_ZERO_WRITE = 0x2,
470c0504
SH
54} BdrvRequestFlags;
55
7d4b4ba5 56static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
f141eafe
AL
57static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
58 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
c87c0672 59 BlockDriverCompletionFunc *cb, void *opaque);
f141eafe
AL
60static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
61 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
ce1a14dc 62 BlockDriverCompletionFunc *cb, void *opaque);
f9f05dc5
KW
63static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
64 int64_t sector_num, int nb_sectors,
65 QEMUIOVector *iov);
66static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
67 int64_t sector_num, int nb_sectors,
68 QEMUIOVector *iov);
c5fbe571 69static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
470c0504
SH
70 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
71 BdrvRequestFlags flags);
1c9805a3 72static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
f08f2dda
SH
73 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
74 BdrvRequestFlags flags);
b2a61371
SH
75static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
76 int64_t sector_num,
77 QEMUIOVector *qiov,
78 int nb_sectors,
79 BlockDriverCompletionFunc *cb,
80 void *opaque,
8c5873d6 81 bool is_write);
b2a61371 82static void coroutine_fn bdrv_co_do_rw(void *opaque);
ec530c81 83
98f90dba
ZYW
84static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
85 bool is_write, double elapsed_time, uint64_t *wait);
86static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
87 double elapsed_time, uint64_t *wait);
88static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
89 bool is_write, int64_t *wait);
90
1b7bdbc1
SH
91static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
92 QTAILQ_HEAD_INITIALIZER(bdrv_states);
7ee930d0 93
8a22f02a
SH
94static QLIST_HEAD(, BlockDriver) bdrv_drivers =
95 QLIST_HEAD_INITIALIZER(bdrv_drivers);
ea2384d3 96
f9092b10
MA
97/* The device to use for VM snapshots */
98static BlockDriverState *bs_snapshots;
99
eb852011
MA
100/* If non-zero, use only whitelisted block drivers */
101static int use_bdrv_whitelist;
102
9e0b22f4
SH
103#ifdef _WIN32
/* Return non-zero if @filename starts with a drive letter followed by a
 * colon (e.g. "c:" or "Z:"), i.e. a Windows drive specification. */
static int is_windows_drive_prefix(const char *filename)
{
    if (filename[1] != ':') {
        return 0;
    }
    return (filename[0] >= 'a' && filename[0] <= 'z') ||
           (filename[0] >= 'A' && filename[0] <= 'Z');
}
110
111int is_windows_drive(const char *filename)
112{
113 if (is_windows_drive_prefix(filename) &&
114 filename[2] == '\0')
115 return 1;
116 if (strstart(filename, "\\\\.\\", NULL) ||
117 strstart(filename, "//./", NULL))
118 return 1;
119 return 0;
120}
121#endif
122
0563e191 123/* throttling disk I/O limits */
98f90dba
ZYW
124void bdrv_io_limits_disable(BlockDriverState *bs)
125{
126 bs->io_limits_enabled = false;
127
128 while (qemu_co_queue_next(&bs->throttled_reqs));
129
130 if (bs->block_timer) {
131 qemu_del_timer(bs->block_timer);
132 qemu_free_timer(bs->block_timer);
133 bs->block_timer = NULL;
134 }
135
136 bs->slice_start = 0;
137 bs->slice_end = 0;
138 bs->slice_time = 0;
139 memset(&bs->io_base, 0, sizeof(bs->io_base));
140}
141
0563e191
ZYW
/* Throttle timer callback: enough I/O budget has accumulated, so wake the
 * next coroutine waiting in the throttle queue. */
static void bdrv_block_timer(void *opaque)
{
    BlockDriverState *bs = opaque;

    qemu_co_queue_next(&bs->throttled_reqs);
}
148
149void bdrv_io_limits_enable(BlockDriverState *bs)
150{
151 qemu_co_queue_init(&bs->throttled_reqs);
152 bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
153 bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
154 bs->slice_start = qemu_get_clock_ns(vm_clock);
155 bs->slice_end = bs->slice_start + bs->slice_time;
156 memset(&bs->io_base, 0, sizeof(bs->io_base));
157 bs->io_limits_enabled = true;
158}
159
160bool bdrv_io_limits_enabled(BlockDriverState *bs)
161{
162 BlockIOLimit *io_limits = &bs->io_limits;
163 return io_limits->bps[BLOCK_IO_LIMIT_READ]
164 || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
165 || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
166 || io_limits->iops[BLOCK_IO_LIMIT_READ]
167 || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
168 || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
169}
170
98f90dba
ZYW
/* Block the calling coroutine until this request fits within the configured
 * I/O limits.  Requests are serviced strictly in FIFO order. */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     bool is_write, int nb_sectors)
{
    int64_t wait_time = -1;

    /* Preserve FIFO order: queue behind requests that arrived before us */
    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
        qemu_co_queue_wait(&bs->throttled_reqs);
    }

    /* In fact, we hope to keep each request's timing, in FIFO mode. The next
     * throttled requests will not be dequeued until the current request is
     * allowed to be serviced. So if the current request still exceeds the
     * limits, it will be inserted to the head. All requests followed it will
     * be still in throttled_reqs queue.
     */

    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
        /* Arm the timer so bdrv_block_timer() wakes us when budget allows */
        qemu_mod_timer(bs->block_timer,
                       wait_time + qemu_get_clock_ns(vm_clock));
        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
    }

    /* Give the next queued request a chance to run */
    qemu_co_queue_next(&bs->throttled_reqs);
}
195
9e0b22f4
SH
/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
#ifdef _WIN32
    /* Windows drive specifications look like "c:..." but are not URLs */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 0;
    }
#endif

    return strchr(path, ':') ? 1 : 0;
}
208
/* Return non-zero if @path is absolute.  A "<protocol>:" prefix is skipped
 * first, so "file:/x" counts as absolute while "file:x" does not. */
int path_is_absolute(const char *path)
{
    const char *rest;
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (*path == '/' || *path == '\\') {
        return 1;
    }
#endif
    rest = strchr(path, ':');
    rest = rest ? rest + 1 : path;
#ifdef _WIN32
    return *rest == '/' || *rest == '\\';
#else
    return *rest == '/';
#endif
}
228
83f64091
FB
/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URL are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        /* p: first character after a "<protocol>:" prefix, if any */
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        /* p1: first character after the last directory separator */
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            /* on Windows '\\' is a separator too; use whichever one
             * occurs later in the string */
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        /* keep everything up to the later of the protocol prefix and the
         * directory part of base_path */
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
272
5efa9d5a 273void bdrv_register(BlockDriver *bdrv)
ea2384d3 274{
8c5873d6
SH
275 /* Block drivers without coroutine functions need emulation */
276 if (!bdrv->bdrv_co_readv) {
f9f05dc5
KW
277 bdrv->bdrv_co_readv = bdrv_co_readv_em;
278 bdrv->bdrv_co_writev = bdrv_co_writev_em;
279
f8c35c1d
SH
280 /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
281 * the block driver lacks aio we need to emulate that too.
282 */
f9f05dc5
KW
283 if (!bdrv->bdrv_aio_readv) {
284 /* add AIO emulation layer */
285 bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
286 bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
f9f05dc5 287 }
83f64091 288 }
b2e12bc6 289
8a22f02a 290 QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
ea2384d3 291}
b338082b
FB
292
293/* create a new block device (by default it is empty) */
294BlockDriverState *bdrv_new(const char *device_name)
295{
1b7bdbc1 296 BlockDriverState *bs;
b338082b 297
7267c094 298 bs = g_malloc0(sizeof(BlockDriverState));
b338082b 299 pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
ea2384d3 300 if (device_name[0] != '\0') {
1b7bdbc1 301 QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
ea2384d3 302 }
28a7282a 303 bdrv_iostatus_disable(bs);
b338082b
FB
304 return bs;
305}
306
ea2384d3
FB
307BlockDriver *bdrv_find_format(const char *format_name)
308{
309 BlockDriver *drv1;
8a22f02a
SH
310 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
311 if (!strcmp(drv1->format_name, format_name)) {
ea2384d3 312 return drv1;
8a22f02a 313 }
ea2384d3
FB
314 }
315 return NULL;
316}
317
eb852011
MA
318static int bdrv_is_whitelisted(BlockDriver *drv)
319{
320 static const char *whitelist[] = {
321 CONFIG_BDRV_WHITELIST
322 };
323 const char **p;
324
325 if (!whitelist[0])
326 return 1; /* no whitelist, anything goes */
327
328 for (p = whitelist; *p; p++) {
329 if (!strcmp(drv->format_name, *p)) {
330 return 1;
331 }
332 }
333 return 0;
334}
335
336BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
337{
338 BlockDriver *drv = bdrv_find_format(format_name);
339 return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
340}
341
0e7e1989
KW
342int bdrv_create(BlockDriver *drv, const char* filename,
343 QEMUOptionParameter *options)
ea2384d3
FB
344{
345 if (!drv->bdrv_create)
346 return -ENOTSUP;
0e7e1989
KW
347
348 return drv->bdrv_create(filename, options);
ea2384d3
FB
349}
350
84a12e66
CH
351int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
352{
353 BlockDriver *drv;
354
b50cbabc 355 drv = bdrv_find_protocol(filename);
84a12e66 356 if (drv == NULL) {
16905d71 357 return -ENOENT;
84a12e66
CH
358 }
359
360 return bdrv_create(drv, filename, options);
361}
362
#ifdef _WIN32
/* Fill @filename with the name of a freshly created temporary file. */
void get_tmp_filename(char *filename, int size)
{
    char temp_dir[MAX_PATH];

    GetTempPath(MAX_PATH, temp_dir);
    GetTempFileName(temp_dir, "qem", 0, filename);
}
#else
/* Fill @filename with the name of a freshly created temporary file under
 * $TMPDIR (default /tmp). */
void get_tmp_filename(char *filename, int size)
{
    int fd;
    const char *tmpdir;
    /* XXX: race condition possible */
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);
    fd = mkstemp(filename);
    /* Fix: mkstemp() can fail (e.g. unwritable TMPDIR); the old code
     * unconditionally called close(fd), passing -1 on failure.  Only
     * close a successfully created file. */
    if (fd >= 0) {
        close(fd);
    }
}
#endif
fc01f7e7 385
84a12e66
CH
386/*
387 * Detect host devices. By convention, /dev/cdrom[N] is always
388 * recognized as a host CDROM.
389 */
390static BlockDriver *find_hdev_driver(const char *filename)
391{
392 int score_max = 0, score;
393 BlockDriver *drv = NULL, *d;
394
395 QLIST_FOREACH(d, &bdrv_drivers, list) {
396 if (d->bdrv_probe_device) {
397 score = d->bdrv_probe_device(filename);
398 if (score > score_max) {
399 score_max = score;
400 drv = d;
401 }
402 }
403 }
404
405 return drv;
406}
407
b50cbabc 408BlockDriver *bdrv_find_protocol(const char *filename)
83f64091
FB
409{
410 BlockDriver *drv1;
411 char protocol[128];
1cec71e3 412 int len;
83f64091 413 const char *p;
19cb3738 414
66f82cee
KW
415 /* TODO Drivers without bdrv_file_open must be specified explicitly */
416
39508e7a
CH
417 /*
418 * XXX(hch): we really should not let host device detection
419 * override an explicit protocol specification, but moving this
420 * later breaks access to device names with colons in them.
421 * Thanks to the brain-dead persistent naming schemes on udev-
422 * based Linux systems those actually are quite common.
423 */
424 drv1 = find_hdev_driver(filename);
425 if (drv1) {
426 return drv1;
427 }
428
9e0b22f4 429 if (!path_has_protocol(filename)) {
39508e7a 430 return bdrv_find_format("file");
84a12e66 431 }
9e0b22f4
SH
432 p = strchr(filename, ':');
433 assert(p != NULL);
1cec71e3
AL
434 len = p - filename;
435 if (len > sizeof(protocol) - 1)
436 len = sizeof(protocol) - 1;
437 memcpy(protocol, filename, len);
438 protocol[len] = '\0';
8a22f02a 439 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
5fafdf24 440 if (drv1->protocol_name &&
8a22f02a 441 !strcmp(drv1->protocol_name, protocol)) {
83f64091 442 return drv1;
8a22f02a 443 }
83f64091
FB
444 }
445 return NULL;
446}
447
/* Probe the image format of @filename.
 *
 * On success *pdrv is set to the best-scoring driver and a non-negative
 * value is returned; on failure *pdrv is NULL and a negative errno is
 * returned. */
static int find_image_format(const char *filename, BlockDriver **pdrv)
{
    int ret, score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    BlockDriverState *bs;

    ret = bdrv_file_open(&bs, filename, 0);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs)) {
        bdrv_delete(bs);
        drv = bdrv_find_format("raw");
        if (!drv) {
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    /* Read the image header and let every format driver score it */
    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    bdrv_delete(bs);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            /* ret holds the number of header bytes actually read */
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}
496
51762288
SH
497/**
498 * Set the current 'total_sectors' value
499 */
500static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
501{
502 BlockDriver *drv = bs->drv;
503
396759ad
NB
504 /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
505 if (bs->sg)
506 return 0;
507
51762288
SH
508 /* query actual device if possible, otherwise just trust the hint */
509 if (drv->bdrv_getlength) {
510 int64_t length = drv->bdrv_getlength(bs);
511 if (length < 0) {
512 return length;
513 }
514 hint = length >> BDRV_SECTOR_BITS;
515 }
516
517 bs->total_sectors = hint;
518 return 0;
519}
520
c3993cdc
SH
521/**
522 * Set open flags for a given cache mode
523 *
524 * Return 0 on success, -1 if the cache mode was invalid.
525 */
526int bdrv_parse_cache_flags(const char *mode, int *flags)
527{
528 *flags &= ~BDRV_O_CACHE_MASK;
529
530 if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
531 *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
92196b2f
SH
532 } else if (!strcmp(mode, "directsync")) {
533 *flags |= BDRV_O_NOCACHE;
c3993cdc
SH
534 } else if (!strcmp(mode, "writeback")) {
535 *flags |= BDRV_O_CACHE_WB;
536 } else if (!strcmp(mode, "unsafe")) {
537 *flags |= BDRV_O_CACHE_WB;
538 *flags |= BDRV_O_NO_FLUSH;
539 } else if (!strcmp(mode, "writethrough")) {
540 /* this is the default */
541 } else {
542 return -1;
543 }
544
545 return 0;
546}
547
53fec9d3
SH
/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}
557
/* Drop one copy-on-read reference; must balance a prior
 * bdrv_enable_copy_on_read() call. */
void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}
563
57915332
KW
/*
 * Common part for opening disk images and files
 *
 * Resets the per-open state of @bs, then opens @filename with @drv either
 * directly (protocol drivers) or through a protocol-level bs->file (format
 * drivers).  Returns 0 on success or a negative errno; on failure bs->drv
 * and bs->opaque are cleared again.
 */
static int bdrv_open_common(BlockDriverState *bs, const char *filename,
    int flags, BlockDriver *drv)
{
    int ret, open_flags;

    assert(drv != NULL);

    trace_bdrv_open_common(bs, filename, flags, drv->format_name);

    /* Reset state possibly left behind by a previous open */
    bs->file = NULL;
    bs->total_sectors = 0;
    bs->encrypted = 0;
    bs->valid_key = 0;
    bs->sg = 0;
    bs->open_flags = flags;
    bs->growable = 0;
    bs->buffer_alignment = 512;

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
        bdrv_enable_copy_on_read(bs);
    }

    pstrcpy(bs->filename, sizeof(bs->filename), filename);
    bs->backing_file[0] = '\0';

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
        return -ENOTSUP;
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        ret = drv->bdrv_file_open(bs, filename, open_flags);
    } else {
        /* format driver: open the underlying protocol file first */
        ret = bdrv_file_open(&bs->file, filename, open_flags);
        if (ret >= 0) {
            ret = drv->bdrv_open(bs, open_flags);
        }
    }

    if (ret < 0) {
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        goto free_and_fail;
    }

#ifndef _WIN32
    /* Unlink the temporary image now so it vanishes automatically on
     * close (or crash); the open file descriptor keeps it alive. */
    if (bs->is_temporary) {
        unlink(filename);
    }
#endif
    return 0;

free_and_fail:
    if (bs->file) {
        bdrv_delete(bs->file);
        bs->file = NULL;
    }
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}
653
b6ce07aa
KW
654/*
655 * Opens a file using a protocol (file, host_device, nbd, ...)
656 */
83f64091 657int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
ea2384d3 658{
83f64091 659 BlockDriverState *bs;
6db95603 660 BlockDriver *drv;
83f64091
FB
661 int ret;
662
b50cbabc 663 drv = bdrv_find_protocol(filename);
6db95603
CH
664 if (!drv) {
665 return -ENOENT;
666 }
667
83f64091 668 bs = bdrv_new("");
b6ce07aa 669 ret = bdrv_open_common(bs, filename, flags, drv);
83f64091
FB
670 if (ret < 0) {
671 bdrv_delete(bs);
672 return ret;
3b0d4f61 673 }
71d0770c 674 bs->growable = 1;
83f64091
FB
675 *pbs = bs;
676 return 0;
677}
678
b6ce07aa
KW
/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * With BDRV_O_SNAPSHOT a temporary qcow2 image backed by @filename is
 * created and opened instead, so guest writes never reach the original.
 * If @drv is NULL the format is probed.  On success the backing file (if
 * any) is opened read-only as well.  Returns 0 or a negative errno.
 */
int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
              BlockDriver *drv)
{
    int ret;
    char tmp_filename[PATH_MAX];

    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        int is_protocol = 0;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *options;
        char backing_filename[PATH_MAX];

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* open the original image once just to learn its size and
         * whether it is a protocol */
        bs1 = bdrv_new("");
        ret = bdrv_open(bs1, filename, 0, drv);
        if (ret < 0) {
            bdrv_delete(bs1);
            return ret;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        if (bs1->drv && bs1->drv->protocol_name)
            is_protocol = 1;

        bdrv_delete(bs1);

        get_tmp_filename(tmp_filename, sizeof(tmp_filename));

        /* Real path is meaningless for protocols */
        if (is_protocol)
            snprintf(backing_filename, sizeof(backing_filename),
                     "%s", filename);
        else if (!realpath(filename, backing_filename))
            return -errno;

        /* create the temporary qcow2 image backed by the original file */
        bdrv_qcow2 = bdrv_find_format("qcow2");
        options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);

        set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
        set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
        if (drv) {
            set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
                drv->format_name);
        }

        ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
        free_option_parameters(options);
        if (ret < 0) {
            return ret;
        }

        /* from here on we open the temporary image instead */
        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Find the right image format driver */
    if (!drv) {
        ret = find_image_format(filename, &drv);
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
        char backing_filename[PATH_MAX];
        int back_flags;
        BlockDriver *back_drv = NULL;

        bs->backing_hd = bdrv_new("");

        if (path_has_protocol(bs->backing_file)) {
            pstrcpy(backing_filename, sizeof(backing_filename),
                    bs->backing_file);
        } else {
            /* relative backing file names resolve against the image path */
            path_combine(backing_filename, sizeof(backing_filename),
                         filename, bs->backing_file);
        }

        if (bs->backing_format[0] != '\0') {
            back_drv = bdrv_find_format(bs->backing_format);
        }

        /* backing files always opened read-only */
        back_flags =
            flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

        ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
        if (ret < 0) {
            bdrv_close(bs);
            return ret;
        }
        if (bs->is_temporary) {
            bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
        } else {
            /* base image inherits from "parent" */
            bs->backing_hd->keep_read_only = bs->keep_read_only;
        }
    }

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_enable(bs);
    }

    return 0;

unlink_and_fail:
    if (bs->is_temporary) {
        unlink(filename);
    }
    return ret;
}
812
fc01f7e7
FB
/* Close an open image: cancel its block job, drop the backing file, call
 * the driver's close handler and release driver state.  The BDS itself
 * stays allocated and can be reopened. */
void bdrv_close(BlockDriverState *bs)
{
    if (bs->drv) {
        /* a running block job holds the device busy; stop it first */
        if (bs->job) {
            block_job_cancel_sync(bs->job);
        }
        /* forget the cached VM-snapshot device if it is this one */
        if (bs == bs_snapshots) {
            bs_snapshots = NULL;
        }
        if (bs->backing_hd) {
            bdrv_delete(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        /* on POSIX the temporary file was already unlinked at open time */
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;

        /* close the underlying protocol file, if any */
        if (bs->file != NULL) {
            bdrv_close(bs->file);
        }

        bdrv_dev_change_media_cb(bs, false);
    }

    /*throttling disk I/O limits*/
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}
849
2bc93fed
MK
/* Close every block device on the global bdrv_states list. */
void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_close(bs);
    }
}
858
922453bc
SH
/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 */
void bdrv_drain_all(void)
{
    BlockDriverState *bs;

    qemu_aio_flush();

    /* If requests are still pending there is a bug somewhere */
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        assert(QLIST_EMPTY(&bs->tracked_requests));
        assert(qemu_co_queue_empty(&bs->throttled_reqs));
    }
}
877
d22b2f41
RH
/* make a BlockDriverState anonymous by removing from bdrv_state list.
   Also, NULL terminate the device_name to prevent double remove */
void bdrv_make_anon(BlockDriverState *bs)
{
    /* only named devices are on the global list */
    if (bs->device_name[0] != '\0') {
        QTAILQ_REMOVE(&bdrv_states, bs, list);
    }
    bs->device_name[0] = '\0';
}
887
8802d1fd
JC
/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    BlockDriverState tmp;

    /* the new bs must not be in bdrv_states */
    bdrv_make_anon(bs_new);

    tmp = *bs_new;

    /* there are some fields that need to stay on the top layer: */

    /* dev info */
    tmp.dev_ops = bs_top->dev_ops;
    tmp.dev_opaque = bs_top->dev_opaque;
    tmp.dev = bs_top->dev;
    tmp.buffer_alignment = bs_top->buffer_alignment;
    tmp.copy_on_read = bs_top->copy_on_read;

    /* i/o timing parameters */
    tmp.slice_time = bs_top->slice_time;
    tmp.slice_start = bs_top->slice_start;
    tmp.slice_end = bs_top->slice_end;
    tmp.io_limits = bs_top->io_limits;
    tmp.io_base = bs_top->io_base;
    tmp.throttled_reqs = bs_top->throttled_reqs;
    tmp.block_timer = bs_top->block_timer;
    tmp.io_limits_enabled = bs_top->io_limits_enabled;

    /* geometry */
    tmp.cyls = bs_top->cyls;
    tmp.heads = bs_top->heads;
    tmp.secs = bs_top->secs;
    tmp.translation = bs_top->translation;

    /* r/w error */
    tmp.on_read_error = bs_top->on_read_error;
    tmp.on_write_error = bs_top->on_write_error;

    /* i/o status */
    tmp.iostatus_enabled = bs_top->iostatus_enabled;
    tmp.iostatus = bs_top->iostatus;

    /* keep the same entry in bdrv_states */
    pstrcpy(tmp.device_name, sizeof(tmp.device_name), bs_top->device_name);
    tmp.list = bs_top->list;

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    tmp.backing_hd = bs_new;
    pstrcpy(tmp.backing_file, sizeof(tmp.backing_file), bs_top->filename);

    /* swap contents of the fixed new bs and the current top */
    *bs_new = *bs_top;
    *bs_top = tmp;

    /* clear the copied fields in the new backing file */
    bdrv_detach_dev(bs_new, bs_new->dev);

    qemu_co_queue_init(&bs_new->throttled_reqs);
    memset(&bs_new->io_base, 0, sizeof(bs_new->io_base));
    memset(&bs_new->io_limits, 0, sizeof(bs_new->io_limits));
    bdrv_iostatus_disable(bs_new);

    /* we don't use bdrv_io_limits_disable() for this, because we don't want
     * to affect or delete the block_timer, as it has been moved to bs_top */
    bs_new->io_limits_enabled = false;
    bs_new->block_timer = NULL;
    bs_new->slice_time = 0;
    bs_new->slice_start = 0;
    bs_new->slice_end = 0;
}
968
b338082b
FB
/* Free a BlockDriverState.  The caller must have detached any device and
 * stopped any block job first. */
void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->dev);
    assert(!bs->job);
    assert(!bs->in_use);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    bdrv_close(bs);
    /* also free the underlying protocol-level BDS, if any */
    if (bs->file != NULL) {
        bdrv_delete(bs->file);
    }

    assert(bs != bs_snapshots);
    g_free(bs);
}
986
fa879d62
MA
987int bdrv_attach_dev(BlockDriverState *bs, void *dev)
988/* TODO change to DeviceState *dev when all users are qdevified */
18846dee 989{
fa879d62 990 if (bs->dev) {
18846dee
MA
991 return -EBUSY;
992 }
fa879d62 993 bs->dev = dev;
28a7282a 994 bdrv_iostatus_reset(bs);
18846dee
MA
995 return 0;
996}
997
fa879d62
MA
998/* TODO qdevified devices don't use this, remove when devices are qdevified */
999void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
18846dee 1000{
fa879d62
MA
1001 if (bdrv_attach_dev(bs, dev) < 0) {
1002 abort();
1003 }
1004}
1005
/* Detach the device model from @bs and drop everything it registered. */
void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(bs->dev == dev);
    bs->dev = NULL;
    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;
    bs->buffer_alignment = 512;  /* restore the default alignment */
}
1015
fa879d62
MA
/* TODO change to return DeviceState * when all users are qdevified */
/* Return the attached device model, or NULL if none is attached. */
void *bdrv_get_attached_dev(BlockDriverState *bs)
{
    return bs->dev;
}
1021
0e49de52
MA
/* Register the device model's callback table for @bs. */
void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_ops = ops;
    bs->dev_opaque = opaque;
    /* NOTE(review): apparently devices with removable media are not kept
     * as the cached VM-snapshot device — confirm intent */
    if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
        bs_snapshots = NULL;
    }
}
1031
329c0a48
LC
/* Emit a QMP BLOCK_IO_ERROR event for @bdrv describing the @action taken
 * after a failed read or write. */
void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
                               BlockQMPEventAction action, int is_read)
{
    QObject *data;
    const char *action_str;

    /* map the action enum onto the wire string used by QMP */
    switch (action) {
    case BDRV_ACTION_REPORT:
        action_str = "report";
        break;
    case BDRV_ACTION_IGNORE:
        action_str = "ignore";
        break;
    case BDRV_ACTION_STOP:
        action_str = "stop";
        break;
    default:
        abort();
    }

    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
                              bdrv->device_name,
                              action_str,
                              is_read ? "read" : "write");
    monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);

    /* release our reference to the event payload */
    qobject_decref(data);
}
1060
6f382ed2
LC
/* Emit a QMP DEVICE_TRAY_MOVED event reporting the new tray state. */
static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
{
    QObject *data;

    data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
                              bdrv_get_device_name(bs), ejected);
    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);

    /* release our reference to the event payload */
    qobject_decref(data);
}
1071
7d4b4ba5 1072static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
0e49de52 1073{
145feb17 1074 if (bs->dev_ops && bs->dev_ops->change_media_cb) {
6f382ed2 1075 bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
7d4b4ba5 1076 bs->dev_ops->change_media_cb(bs->dev_opaque, load);
6f382ed2
LC
1077 if (tray_was_closed) {
1078 /* tray open */
1079 bdrv_emit_qmp_eject_event(bs, true);
1080 }
1081 if (load) {
1082 /* tray close */
1083 bdrv_emit_qmp_eject_event(bs, false);
1084 }
145feb17
MA
1085 }
1086}
1087
2c6942fa
MA
1088bool bdrv_dev_has_removable_media(BlockDriverState *bs)
1089{
1090 return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
1091}
1092
025ccaa7
PB
1093void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
1094{
1095 if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
1096 bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
1097 }
1098}
1099
e4def80b
MA
1100bool bdrv_dev_is_tray_open(BlockDriverState *bs)
1101{
1102 if (bs->dev_ops && bs->dev_ops->is_tray_open) {
1103 return bs->dev_ops->is_tray_open(bs->dev_opaque);
1104 }
1105 return false;
1106}
1107
145feb17
MA
1108static void bdrv_dev_resize_cb(BlockDriverState *bs)
1109{
1110 if (bs->dev_ops && bs->dev_ops->resize_cb) {
1111 bs->dev_ops->resize_cb(bs->dev_opaque);
0e49de52
MA
1112 }
1113}
1114
f107639a
MA
1115bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
1116{
1117 if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
1118 return bs->dev_ops->is_medium_locked(bs->dev_opaque);
1119 }
1120 return false;
1121}
1122
e97fc193
AL
1123/*
1124 * Run consistency checks on an image
1125 *
e076f338 1126 * Returns 0 if the check could be completed (it doesn't mean that the image is
a1c7273b 1127 * free of errors) or -errno when an internal error occurred. The results of the
e076f338 1128 * check are stored in res.
e97fc193 1129 */
e076f338 1130int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res)
e97fc193
AL
1131{
1132 if (bs->drv->bdrv_check == NULL) {
1133 return -ENOTSUP;
1134 }
1135
e076f338 1136 memset(res, 0, sizeof(*res));
9ac228e0 1137 return bs->drv->bdrv_check(bs, res);
e97fc193
AL
1138}
1139
8a426614
KW
1140#define COMMIT_BUF_SECTORS 2048
1141
33e3963e
FB
1142/* commit COW file into the raw image */
1143int bdrv_commit(BlockDriverState *bs)
1144{
19cb3738 1145 BlockDriver *drv = bs->drv;
ee181196 1146 BlockDriver *backing_drv;
8a426614
KW
1147 int64_t sector, total_sectors;
1148 int n, ro, open_flags;
4dca4b63 1149 int ret = 0, rw_ret = 0;
8a426614 1150 uint8_t *buf;
4dca4b63
NS
1151 char filename[1024];
1152 BlockDriverState *bs_rw, *bs_ro;
33e3963e 1153
19cb3738
FB
1154 if (!drv)
1155 return -ENOMEDIUM;
4dca4b63
NS
1156
1157 if (!bs->backing_hd) {
1158 return -ENOTSUP;
33e3963e
FB
1159 }
1160
4dca4b63
NS
1161 if (bs->backing_hd->keep_read_only) {
1162 return -EACCES;
1163 }
ee181196 1164
2d3735d3
SH
1165 if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
1166 return -EBUSY;
1167 }
1168
ee181196 1169 backing_drv = bs->backing_hd->drv;
4dca4b63
NS
1170 ro = bs->backing_hd->read_only;
1171 strncpy(filename, bs->backing_hd->filename, sizeof(filename));
1172 open_flags = bs->backing_hd->open_flags;
1173
1174 if (ro) {
1175 /* re-open as RW */
1176 bdrv_delete(bs->backing_hd);
1177 bs->backing_hd = NULL;
1178 bs_rw = bdrv_new("");
ee181196
KW
1179 rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
1180 backing_drv);
4dca4b63
NS
1181 if (rw_ret < 0) {
1182 bdrv_delete(bs_rw);
1183 /* try to re-open read-only */
1184 bs_ro = bdrv_new("");
ee181196
KW
1185 ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1186 backing_drv);
4dca4b63
NS
1187 if (ret < 0) {
1188 bdrv_delete(bs_ro);
1189 /* drive not functional anymore */
1190 bs->drv = NULL;
1191 return ret;
1192 }
1193 bs->backing_hd = bs_ro;
1194 return rw_ret;
1195 }
1196 bs->backing_hd = bs_rw;
ea2384d3 1197 }
33e3963e 1198
6ea44308 1199 total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
7267c094 1200 buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
8a426614
KW
1201
1202 for (sector = 0; sector < total_sectors; sector += n) {
05c4af54 1203 if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
8a426614
KW
1204
1205 if (bdrv_read(bs, sector, buf, n) != 0) {
1206 ret = -EIO;
1207 goto ro_cleanup;
1208 }
1209
1210 if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
1211 ret = -EIO;
1212 goto ro_cleanup;
1213 }
ea2384d3 1214 }
33e3963e 1215 }
95389c86 1216
1d44952f
CH
1217 if (drv->bdrv_make_empty) {
1218 ret = drv->bdrv_make_empty(bs);
1219 bdrv_flush(bs);
1220 }
95389c86 1221
3f5075ae
CH
1222 /*
1223 * Make sure all data we wrote to the backing device is actually
1224 * stable on disk.
1225 */
1226 if (bs->backing_hd)
1227 bdrv_flush(bs->backing_hd);
4dca4b63
NS
1228
1229ro_cleanup:
7267c094 1230 g_free(buf);
4dca4b63
NS
1231
1232 if (ro) {
1233 /* re-open as RO */
1234 bdrv_delete(bs->backing_hd);
1235 bs->backing_hd = NULL;
1236 bs_ro = bdrv_new("");
ee181196
KW
1237 ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1238 backing_drv);
4dca4b63
NS
1239 if (ret < 0) {
1240 bdrv_delete(bs_ro);
1241 /* drive not functional anymore */
1242 bs->drv = NULL;
1243 return ret;
1244 }
1245 bs->backing_hd = bs_ro;
1246 bs->backing_hd->keep_read_only = 0;
1247 }
1248
1d44952f 1249 return ret;
33e3963e
FB
1250}
1251
e8877497 1252int bdrv_commit_all(void)
6ab4b5ab
MA
1253{
1254 BlockDriverState *bs;
1255
1256 QTAILQ_FOREACH(bs, &bdrv_states, list) {
e8877497
SH
1257 int ret = bdrv_commit(bs);
1258 if (ret < 0) {
1259 return ret;
1260 }
6ab4b5ab 1261 }
e8877497 1262 return 0;
6ab4b5ab
MA
1263}
1264
dbffbdcf
SH
1265struct BdrvTrackedRequest {
1266 BlockDriverState *bs;
1267 int64_t sector_num;
1268 int nb_sectors;
1269 bool is_write;
1270 QLIST_ENTRY(BdrvTrackedRequest) list;
5f8b6491 1271 Coroutine *co; /* owner, used for deadlock detection */
f4658285 1272 CoQueue wait_queue; /* coroutines blocked on this request */
dbffbdcf
SH
1273};
1274
1275/**
1276 * Remove an active request from the tracked requests list
1277 *
1278 * This function should be called when a tracked request is completing.
1279 */
1280static void tracked_request_end(BdrvTrackedRequest *req)
1281{
1282 QLIST_REMOVE(req, list);
f4658285 1283 qemu_co_queue_restart_all(&req->wait_queue);
dbffbdcf
SH
1284}
1285
1286/**
1287 * Add an active request to the tracked requests list
1288 */
1289static void tracked_request_begin(BdrvTrackedRequest *req,
1290 BlockDriverState *bs,
1291 int64_t sector_num,
1292 int nb_sectors, bool is_write)
1293{
1294 *req = (BdrvTrackedRequest){
1295 .bs = bs,
1296 .sector_num = sector_num,
1297 .nb_sectors = nb_sectors,
1298 .is_write = is_write,
5f8b6491 1299 .co = qemu_coroutine_self(),
dbffbdcf
SH
1300 };
1301
f4658285
SH
1302 qemu_co_queue_init(&req->wait_queue);
1303
dbffbdcf
SH
1304 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
1305}
1306
d83947ac
SH
1307/**
1308 * Round a region to cluster boundaries
1309 */
1310static void round_to_clusters(BlockDriverState *bs,
1311 int64_t sector_num, int nb_sectors,
1312 int64_t *cluster_sector_num,
1313 int *cluster_nb_sectors)
1314{
1315 BlockDriverInfo bdi;
1316
1317 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
1318 *cluster_sector_num = sector_num;
1319 *cluster_nb_sectors = nb_sectors;
1320 } else {
1321 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
1322 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
1323 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
1324 nb_sectors, c);
1325 }
1326}
1327
f4658285
SH
1328static bool tracked_request_overlaps(BdrvTrackedRequest *req,
1329 int64_t sector_num, int nb_sectors) {
d83947ac
SH
1330 /* aaaa bbbb */
1331 if (sector_num >= req->sector_num + req->nb_sectors) {
1332 return false;
1333 }
1334 /* bbbb aaaa */
1335 if (req->sector_num >= sector_num + nb_sectors) {
1336 return false;
1337 }
1338 return true;
f4658285
SH
1339}
1340
1341static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
1342 int64_t sector_num, int nb_sectors)
1343{
1344 BdrvTrackedRequest *req;
d83947ac
SH
1345 int64_t cluster_sector_num;
1346 int cluster_nb_sectors;
f4658285
SH
1347 bool retry;
1348
d83947ac
SH
1349 /* If we touch the same cluster it counts as an overlap. This guarantees
1350 * that allocating writes will be serialized and not race with each other
1351 * for the same cluster. For example, in copy-on-read it ensures that the
1352 * CoR read and write operations are atomic and guest writes cannot
1353 * interleave between them.
1354 */
1355 round_to_clusters(bs, sector_num, nb_sectors,
1356 &cluster_sector_num, &cluster_nb_sectors);
1357
f4658285
SH
1358 do {
1359 retry = false;
1360 QLIST_FOREACH(req, &bs->tracked_requests, list) {
d83947ac
SH
1361 if (tracked_request_overlaps(req, cluster_sector_num,
1362 cluster_nb_sectors)) {
5f8b6491
SH
1363 /* Hitting this means there was a reentrant request, for
1364 * example, a block driver issuing nested requests. This must
1365 * never happen since it means deadlock.
1366 */
1367 assert(qemu_coroutine_self() != req->co);
1368
f4658285
SH
1369 qemu_co_queue_wait(&req->wait_queue);
1370 retry = true;
1371 break;
1372 }
1373 }
1374 } while (retry);
1375}
1376
756e6736
KW
1377/*
1378 * Return values:
1379 * 0 - success
1380 * -EINVAL - backing format specified, but no file
1381 * -ENOSPC - can't update the backing file because no space is left in the
1382 * image file header
1383 * -ENOTSUP - format driver doesn't support changing the backing file
1384 */
1385int bdrv_change_backing_file(BlockDriverState *bs,
1386 const char *backing_file, const char *backing_fmt)
1387{
1388 BlockDriver *drv = bs->drv;
1389
1390 if (drv->bdrv_change_backing_file != NULL) {
1391 return drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
1392 } else {
1393 return -ENOTSUP;
1394 }
1395}
1396
71d0770c
AL
1397static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
1398 size_t size)
1399{
1400 int64_t len;
1401
1402 if (!bdrv_is_inserted(bs))
1403 return -ENOMEDIUM;
1404
1405 if (bs->growable)
1406 return 0;
1407
1408 len = bdrv_getlength(bs);
1409
fbb7b4e0
KW
1410 if (offset < 0)
1411 return -EIO;
1412
1413 if ((offset > len) || (len - offset < size))
71d0770c
AL
1414 return -EIO;
1415
1416 return 0;
1417}
1418
1419static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
1420 int nb_sectors)
1421{
eb5a3165
JS
1422 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
1423 nb_sectors * BDRV_SECTOR_SIZE);
71d0770c
AL
1424}
1425
1c9805a3
SH
1426typedef struct RwCo {
1427 BlockDriverState *bs;
1428 int64_t sector_num;
1429 int nb_sectors;
1430 QEMUIOVector *qiov;
1431 bool is_write;
1432 int ret;
1433} RwCo;
1434
1435static void coroutine_fn bdrv_rw_co_entry(void *opaque)
fc01f7e7 1436{
1c9805a3 1437 RwCo *rwco = opaque;
ea2384d3 1438
1c9805a3
SH
1439 if (!rwco->is_write) {
1440 rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
470c0504 1441 rwco->nb_sectors, rwco->qiov, 0);
1c9805a3
SH
1442 } else {
1443 rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
f08f2dda 1444 rwco->nb_sectors, rwco->qiov, 0);
1c9805a3
SH
1445 }
1446}
e7a8a783 1447
1c9805a3
SH
1448/*
1449 * Process a synchronous request using coroutines
1450 */
1451static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
1452 int nb_sectors, bool is_write)
1453{
1454 QEMUIOVector qiov;
1455 struct iovec iov = {
1456 .iov_base = (void *)buf,
1457 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
1458 };
1459 Coroutine *co;
1460 RwCo rwco = {
1461 .bs = bs,
1462 .sector_num = sector_num,
1463 .nb_sectors = nb_sectors,
1464 .qiov = &qiov,
1465 .is_write = is_write,
1466 .ret = NOT_DONE,
1467 };
e7a8a783 1468
1c9805a3 1469 qemu_iovec_init_external(&qiov, &iov, 1);
e7a8a783 1470
498e386c
ZYW
1471 /**
1472 * In sync call context, when the vcpu is blocked, this throttling timer
1473 * will not fire; so the I/O throttling function has to be disabled here
1474 * if it has been enabled.
1475 */
1476 if (bs->io_limits_enabled) {
1477 fprintf(stderr, "Disabling I/O throttling on '%s' due "
1478 "to synchronous I/O.\n", bdrv_get_device_name(bs));
1479 bdrv_io_limits_disable(bs);
1480 }
1481
1c9805a3
SH
1482 if (qemu_in_coroutine()) {
1483 /* Fast-path if already in coroutine context */
1484 bdrv_rw_co_entry(&rwco);
1485 } else {
1486 co = qemu_coroutine_create(bdrv_rw_co_entry);
1487 qemu_coroutine_enter(co, &rwco);
1488 while (rwco.ret == NOT_DONE) {
1489 qemu_aio_wait();
1490 }
1491 }
1492 return rwco.ret;
1493}
b338082b 1494
1c9805a3
SH
1495/* return < 0 if error. See bdrv_write() for the return codes */
1496int bdrv_read(BlockDriverState *bs, int64_t sector_num,
1497 uint8_t *buf, int nb_sectors)
1498{
1499 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
fc01f7e7
FB
1500}
1501
7cd1e32a 1502static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
a55eb92c 1503 int nb_sectors, int dirty)
7cd1e32a 1504{
1505 int64_t start, end;
c6d22830 1506 unsigned long val, idx, bit;
a55eb92c 1507
6ea44308 1508 start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
c6d22830 1509 end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
a55eb92c
JK
1510
1511 for (; start <= end; start++) {
c6d22830
JK
1512 idx = start / (sizeof(unsigned long) * 8);
1513 bit = start % (sizeof(unsigned long) * 8);
1514 val = bs->dirty_bitmap[idx];
1515 if (dirty) {
6d59fec1 1516 if (!(val & (1UL << bit))) {
aaa0eb75 1517 bs->dirty_count++;
6d59fec1 1518 val |= 1UL << bit;
aaa0eb75 1519 }
c6d22830 1520 } else {
6d59fec1 1521 if (val & (1UL << bit)) {
aaa0eb75 1522 bs->dirty_count--;
6d59fec1 1523 val &= ~(1UL << bit);
aaa0eb75 1524 }
c6d22830
JK
1525 }
1526 bs->dirty_bitmap[idx] = val;
7cd1e32a 1527 }
1528}
1529
5fafdf24 1530/* Return < 0 if error. Important errors are:
19cb3738
FB
1531 -EIO generic I/O error (may happen for all errors)
1532 -ENOMEDIUM No media inserted.
1533 -EINVAL Invalid sector number or nb_sectors
1534 -EACCES Trying to write a read-only device
1535*/
5fafdf24 1536int bdrv_write(BlockDriverState *bs, int64_t sector_num,
fc01f7e7
FB
1537 const uint8_t *buf, int nb_sectors)
1538{
1c9805a3 1539 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
83f64091
FB
1540}
1541
eda578e5
AL
1542int bdrv_pread(BlockDriverState *bs, int64_t offset,
1543 void *buf, int count1)
83f64091 1544{
6ea44308 1545 uint8_t tmp_buf[BDRV_SECTOR_SIZE];
83f64091
FB
1546 int len, nb_sectors, count;
1547 int64_t sector_num;
9a8c4cce 1548 int ret;
83f64091
FB
1549
1550 count = count1;
1551 /* first read to align to sector start */
6ea44308 1552 len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
83f64091
FB
1553 if (len > count)
1554 len = count;
6ea44308 1555 sector_num = offset >> BDRV_SECTOR_BITS;
83f64091 1556 if (len > 0) {
9a8c4cce
KW
1557 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1558 return ret;
6ea44308 1559 memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
83f64091
FB
1560 count -= len;
1561 if (count == 0)
1562 return count1;
1563 sector_num++;
1564 buf += len;
1565 }
1566
1567 /* read the sectors "in place" */
6ea44308 1568 nb_sectors = count >> BDRV_SECTOR_BITS;
83f64091 1569 if (nb_sectors > 0) {
9a8c4cce
KW
1570 if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
1571 return ret;
83f64091 1572 sector_num += nb_sectors;
6ea44308 1573 len = nb_sectors << BDRV_SECTOR_BITS;
83f64091
FB
1574 buf += len;
1575 count -= len;
1576 }
1577
1578 /* add data from the last sector */
1579 if (count > 0) {
9a8c4cce
KW
1580 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1581 return ret;
83f64091
FB
1582 memcpy(buf, tmp_buf, count);
1583 }
1584 return count1;
1585}
1586
eda578e5
AL
1587int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
1588 const void *buf, int count1)
83f64091 1589{
6ea44308 1590 uint8_t tmp_buf[BDRV_SECTOR_SIZE];
83f64091
FB
1591 int len, nb_sectors, count;
1592 int64_t sector_num;
9a8c4cce 1593 int ret;
83f64091
FB
1594
1595 count = count1;
1596 /* first write to align to sector start */
6ea44308 1597 len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
83f64091
FB
1598 if (len > count)
1599 len = count;
6ea44308 1600 sector_num = offset >> BDRV_SECTOR_BITS;
83f64091 1601 if (len > 0) {
9a8c4cce
KW
1602 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1603 return ret;
6ea44308 1604 memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
9a8c4cce
KW
1605 if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1606 return ret;
83f64091
FB
1607 count -= len;
1608 if (count == 0)
1609 return count1;
1610 sector_num++;
1611 buf += len;
1612 }
1613
1614 /* write the sectors "in place" */
6ea44308 1615 nb_sectors = count >> BDRV_SECTOR_BITS;
83f64091 1616 if (nb_sectors > 0) {
9a8c4cce
KW
1617 if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
1618 return ret;
83f64091 1619 sector_num += nb_sectors;
6ea44308 1620 len = nb_sectors << BDRV_SECTOR_BITS;
83f64091
FB
1621 buf += len;
1622 count -= len;
1623 }
1624
1625 /* add data from the last sector */
1626 if (count > 0) {
9a8c4cce
KW
1627 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1628 return ret;
83f64091 1629 memcpy(tmp_buf, buf, count);
9a8c4cce
KW
1630 if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1631 return ret;
83f64091
FB
1632 }
1633 return count1;
1634}
83f64091 1635
f08145fe
KW
1636/*
1637 * Writes to the file and ensures that no writes are reordered across this
1638 * request (acts as a barrier)
1639 *
1640 * Returns 0 on success, -errno in error cases.
1641 */
1642int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
1643 const void *buf, int count)
1644{
1645 int ret;
1646
1647 ret = bdrv_pwrite(bs, offset, buf, count);
1648 if (ret < 0) {
1649 return ret;
1650 }
1651
92196b2f
SH
1652 /* No flush needed for cache modes that use O_DSYNC */
1653 if ((bs->open_flags & BDRV_O_CACHE_WB) != 0) {
f08145fe
KW
1654 bdrv_flush(bs);
1655 }
1656
1657 return 0;
1658}
1659
470c0504 1660static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
ab185921
SH
1661 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1662{
1663 /* Perform I/O through a temporary buffer so that users who scribble over
1664 * their read buffer while the operation is in progress do not end up
1665 * modifying the image file. This is critical for zero-copy guest I/O
1666 * where anything might happen inside guest memory.
1667 */
1668 void *bounce_buffer;
1669
79c053bd 1670 BlockDriver *drv = bs->drv;
ab185921
SH
1671 struct iovec iov;
1672 QEMUIOVector bounce_qiov;
1673 int64_t cluster_sector_num;
1674 int cluster_nb_sectors;
1675 size_t skip_bytes;
1676 int ret;
1677
1678 /* Cover entire cluster so no additional backing file I/O is required when
1679 * allocating cluster in the image file.
1680 */
1681 round_to_clusters(bs, sector_num, nb_sectors,
1682 &cluster_sector_num, &cluster_nb_sectors);
1683
470c0504
SH
1684 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
1685 cluster_sector_num, cluster_nb_sectors);
ab185921
SH
1686
1687 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
1688 iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
1689 qemu_iovec_init_external(&bounce_qiov, &iov, 1);
1690
79c053bd
SH
1691 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
1692 &bounce_qiov);
ab185921
SH
1693 if (ret < 0) {
1694 goto err;
1695 }
1696
79c053bd
SH
1697 if (drv->bdrv_co_write_zeroes &&
1698 buffer_is_zero(bounce_buffer, iov.iov_len)) {
1699 ret = drv->bdrv_co_write_zeroes(bs, cluster_sector_num,
1700 cluster_nb_sectors);
1701 } else {
1702 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
ab185921 1703 &bounce_qiov);
79c053bd
SH
1704 }
1705
ab185921
SH
1706 if (ret < 0) {
1707 /* It might be okay to ignore write errors for guest requests. If this
1708 * is a deliberate copy-on-read then we don't want to ignore the error.
1709 * Simply report it in all cases.
1710 */
1711 goto err;
1712 }
1713
1714 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
1715 qemu_iovec_from_buffer(qiov, bounce_buffer + skip_bytes,
1716 nb_sectors * BDRV_SECTOR_SIZE);
1717
1718err:
1719 qemu_vfree(bounce_buffer);
1720 return ret;
1721}
1722
c5fbe571
SH
1723/*
1724 * Handle a read request in coroutine context
1725 */
1726static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
470c0504
SH
1727 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1728 BdrvRequestFlags flags)
da1fa91d
KW
1729{
1730 BlockDriver *drv = bs->drv;
dbffbdcf
SH
1731 BdrvTrackedRequest req;
1732 int ret;
da1fa91d 1733
da1fa91d
KW
1734 if (!drv) {
1735 return -ENOMEDIUM;
1736 }
1737 if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1738 return -EIO;
1739 }
1740
98f90dba
ZYW
1741 /* throttling disk read I/O */
1742 if (bs->io_limits_enabled) {
1743 bdrv_io_limits_intercept(bs, false, nb_sectors);
1744 }
1745
f4658285 1746 if (bs->copy_on_read) {
470c0504
SH
1747 flags |= BDRV_REQ_COPY_ON_READ;
1748 }
1749 if (flags & BDRV_REQ_COPY_ON_READ) {
1750 bs->copy_on_read_in_flight++;
1751 }
1752
1753 if (bs->copy_on_read_in_flight) {
f4658285
SH
1754 wait_for_overlapping_requests(bs, sector_num, nb_sectors);
1755 }
1756
dbffbdcf 1757 tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
ab185921 1758
470c0504 1759 if (flags & BDRV_REQ_COPY_ON_READ) {
ab185921
SH
1760 int pnum;
1761
1762 ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
1763 if (ret < 0) {
1764 goto out;
1765 }
1766
1767 if (!ret || pnum != nb_sectors) {
470c0504 1768 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
ab185921
SH
1769 goto out;
1770 }
1771 }
1772
dbffbdcf 1773 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
ab185921
SH
1774
1775out:
dbffbdcf 1776 tracked_request_end(&req);
470c0504
SH
1777
1778 if (flags & BDRV_REQ_COPY_ON_READ) {
1779 bs->copy_on_read_in_flight--;
1780 }
1781
dbffbdcf 1782 return ret;
da1fa91d
KW
1783}
1784
c5fbe571 1785int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
da1fa91d
KW
1786 int nb_sectors, QEMUIOVector *qiov)
1787{
c5fbe571 1788 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
da1fa91d 1789
470c0504
SH
1790 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
1791}
1792
1793int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
1794 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1795{
1796 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
1797
1798 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
1799 BDRV_REQ_COPY_ON_READ);
c5fbe571
SH
1800}
1801
f08f2dda
SH
1802static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
1803 int64_t sector_num, int nb_sectors)
1804{
1805 BlockDriver *drv = bs->drv;
1806 QEMUIOVector qiov;
1807 struct iovec iov;
1808 int ret;
1809
1810 /* First try the efficient write zeroes operation */
1811 if (drv->bdrv_co_write_zeroes) {
1812 return drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
1813 }
1814
1815 /* Fall back to bounce buffer if write zeroes is unsupported */
1816 iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
1817 iov.iov_base = qemu_blockalign(bs, iov.iov_len);
1818 memset(iov.iov_base, 0, iov.iov_len);
1819 qemu_iovec_init_external(&qiov, &iov, 1);
1820
1821 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);
1822
1823 qemu_vfree(iov.iov_base);
1824 return ret;
1825}
1826
c5fbe571
SH
1827/*
1828 * Handle a write request in coroutine context
1829 */
1830static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
f08f2dda
SH
1831 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1832 BdrvRequestFlags flags)
c5fbe571
SH
1833{
1834 BlockDriver *drv = bs->drv;
dbffbdcf 1835 BdrvTrackedRequest req;
6b7cb247 1836 int ret;
da1fa91d
KW
1837
1838 if (!bs->drv) {
1839 return -ENOMEDIUM;
1840 }
1841 if (bs->read_only) {
1842 return -EACCES;
1843 }
1844 if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1845 return -EIO;
1846 }
1847
98f90dba
ZYW
1848 /* throttling disk write I/O */
1849 if (bs->io_limits_enabled) {
1850 bdrv_io_limits_intercept(bs, true, nb_sectors);
1851 }
1852
470c0504 1853 if (bs->copy_on_read_in_flight) {
f4658285
SH
1854 wait_for_overlapping_requests(bs, sector_num, nb_sectors);
1855 }
1856
dbffbdcf
SH
1857 tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
1858
f08f2dda
SH
1859 if (flags & BDRV_REQ_ZERO_WRITE) {
1860 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
1861 } else {
1862 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
1863 }
6b7cb247 1864
da1fa91d
KW
1865 if (bs->dirty_bitmap) {
1866 set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
1867 }
1868
1869 if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
1870 bs->wr_highest_sector = sector_num + nb_sectors - 1;
1871 }
1872
dbffbdcf
SH
1873 tracked_request_end(&req);
1874
6b7cb247 1875 return ret;
da1fa91d
KW
1876}
1877
c5fbe571
SH
1878int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
1879 int nb_sectors, QEMUIOVector *qiov)
1880{
1881 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
1882
f08f2dda
SH
1883 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
1884}
1885
1886int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
1887 int64_t sector_num, int nb_sectors)
1888{
1889 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
1890
1891 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
1892 BDRV_REQ_ZERO_WRITE);
c5fbe571
SH
1893}
1894
83f64091
FB
1895/**
1896 * Truncate file to 'offset' bytes (needed only for file protocols)
1897 */
1898int bdrv_truncate(BlockDriverState *bs, int64_t offset)
1899{
1900 BlockDriver *drv = bs->drv;
51762288 1901 int ret;
83f64091 1902 if (!drv)
19cb3738 1903 return -ENOMEDIUM;
83f64091
FB
1904 if (!drv->bdrv_truncate)
1905 return -ENOTSUP;
59f2689d
NS
1906 if (bs->read_only)
1907 return -EACCES;
8591675f
MT
1908 if (bdrv_in_use(bs))
1909 return -EBUSY;
51762288
SH
1910 ret = drv->bdrv_truncate(bs, offset);
1911 if (ret == 0) {
1912 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
145feb17 1913 bdrv_dev_resize_cb(bs);
51762288
SH
1914 }
1915 return ret;
83f64091
FB
1916}
1917
4a1d5e1f
FZ
1918/**
1919 * Length of a allocated file in bytes. Sparse files are counted by actual
1920 * allocated space. Return < 0 if error or unknown.
1921 */
1922int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
1923{
1924 BlockDriver *drv = bs->drv;
1925 if (!drv) {
1926 return -ENOMEDIUM;
1927 }
1928 if (drv->bdrv_get_allocated_file_size) {
1929 return drv->bdrv_get_allocated_file_size(bs);
1930 }
1931 if (bs->file) {
1932 return bdrv_get_allocated_file_size(bs->file);
1933 }
1934 return -ENOTSUP;
1935}
1936
83f64091
FB
1937/**
1938 * Length of a file in bytes. Return < 0 if error or unknown.
1939 */
1940int64_t bdrv_getlength(BlockDriverState *bs)
1941{
1942 BlockDriver *drv = bs->drv;
1943 if (!drv)
19cb3738 1944 return -ENOMEDIUM;
51762288 1945
2c6942fa 1946 if (bs->growable || bdrv_dev_has_removable_media(bs)) {
46a4e4e6
SH
1947 if (drv->bdrv_getlength) {
1948 return drv->bdrv_getlength(bs);
1949 }
83f64091 1950 }
46a4e4e6 1951 return bs->total_sectors * BDRV_SECTOR_SIZE;
fc01f7e7
FB
1952}
1953
19cb3738 1954/* return 0 as number of sectors if no device present or error */
96b8f136 1955void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
fc01f7e7 1956{
19cb3738
FB
1957 int64_t length;
1958 length = bdrv_getlength(bs);
1959 if (length < 0)
1960 length = 0;
1961 else
6ea44308 1962 length = length >> BDRV_SECTOR_BITS;
19cb3738 1963 *nb_sectors_ptr = length;
fc01f7e7 1964}
cf98951b 1965
f3d54fc4
AL
1966struct partition {
1967 uint8_t boot_ind; /* 0x80 - active */
1968 uint8_t head; /* starting head */
1969 uint8_t sector; /* starting sector */
1970 uint8_t cyl; /* starting cylinder */
1971 uint8_t sys_ind; /* What partition type */
1972 uint8_t end_head; /* end head */
1973 uint8_t end_sector; /* end sector */
1974 uint8_t end_cyl; /* end cylinder */
1975 uint32_t start_sect; /* starting sector counting from 0 */
1976 uint32_t nr_sects; /* nr of sectors in partition */
541dc0d4 1977} QEMU_PACKED;
f3d54fc4
AL
1978
1979/* try to guess the disk logical geometry from the MSDOS partition table. Return 0 if OK, -1 if could not guess */
1980static int guess_disk_lchs(BlockDriverState *bs,
1981 int *pcylinders, int *pheads, int *psectors)
1982{
eb5a3165 1983 uint8_t buf[BDRV_SECTOR_SIZE];
f3d54fc4
AL
1984 int ret, i, heads, sectors, cylinders;
1985 struct partition *p;
1986 uint32_t nr_sects;
a38131b6 1987 uint64_t nb_sectors;
498e386c 1988 bool enabled;
f3d54fc4
AL
1989
1990 bdrv_get_geometry(bs, &nb_sectors);
1991
498e386c
ZYW
1992 /**
1993 * The function will be invoked during startup not only in sync I/O mode,
1994 * but also in async I/O mode. So the I/O throttling function has to
1995 * be disabled temporarily here, not permanently.
1996 */
1997 enabled = bs->io_limits_enabled;
1998 bs->io_limits_enabled = false;
f3d54fc4 1999 ret = bdrv_read(bs, 0, buf, 1);
498e386c 2000 bs->io_limits_enabled = enabled;
f3d54fc4
AL
2001 if (ret < 0)
2002 return -1;
2003 /* test msdos magic */
2004 if (buf[510] != 0x55 || buf[511] != 0xaa)
2005 return -1;
2006 for(i = 0; i < 4; i++) {
2007 p = ((struct partition *)(buf + 0x1be)) + i;
2008 nr_sects = le32_to_cpu(p->nr_sects);
2009 if (nr_sects && p->end_head) {
2010 /* We make the assumption that the partition terminates on
2011 a cylinder boundary */
2012 heads = p->end_head + 1;
2013 sectors = p->end_sector & 63;
2014 if (sectors == 0)
2015 continue;
2016 cylinders = nb_sectors / (heads * sectors);
2017 if (cylinders < 1 || cylinders > 16383)
2018 continue;
2019 *pheads = heads;
2020 *psectors = sectors;
2021 *pcylinders = cylinders;
2022#if 0
2023 printf("guessed geometry: LCHS=%d %d %d\n",
2024 cylinders, heads, sectors);
2025#endif
2026 return 0;
2027 }
2028 }
2029 return -1;
2030}
2031
2032void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
2033{
2034 int translation, lba_detected = 0;
2035 int cylinders, heads, secs;
a38131b6 2036 uint64_t nb_sectors;
f3d54fc4
AL
2037
2038 /* if a geometry hint is available, use it */
2039 bdrv_get_geometry(bs, &nb_sectors);
2040 bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
2041 translation = bdrv_get_translation_hint(bs);
2042 if (cylinders != 0) {
2043 *pcyls = cylinders;
2044 *pheads = heads;
2045 *psecs = secs;
2046 } else {
2047 if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
2048 if (heads > 16) {
2049 /* if heads > 16, it means that a BIOS LBA
2050 translation was active, so the default
2051 hardware geometry is OK */
2052 lba_detected = 1;
2053 goto default_geometry;
2054 } else {
2055 *pcyls = cylinders;
2056 *pheads = heads;
2057 *psecs = secs;
2058 /* disable any translation to be in sync with
2059 the logical geometry */
2060 if (translation == BIOS_ATA_TRANSLATION_AUTO) {
2061 bdrv_set_translation_hint(bs,
2062 BIOS_ATA_TRANSLATION_NONE);
2063 }
2064 }
2065 } else {
2066 default_geometry:
2067 /* if no geometry, use a standard physical disk geometry */
2068 cylinders = nb_sectors / (16 * 63);
2069
2070 if (cylinders > 16383)
2071 cylinders = 16383;
2072 else if (cylinders < 2)
2073 cylinders = 2;
2074 *pcyls = cylinders;
2075 *pheads = 16;
2076 *psecs = 63;
2077 if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
2078 if ((*pcyls * *pheads) <= 131072) {
2079 bdrv_set_translation_hint(bs,
2080 BIOS_ATA_TRANSLATION_LARGE);
2081 } else {
2082 bdrv_set_translation_hint(bs,
2083 BIOS_ATA_TRANSLATION_LBA);
2084 }
2085 }
2086 }
2087 bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
2088 }
2089}
2090
5fafdf24 2091void bdrv_set_geometry_hint(BlockDriverState *bs,
b338082b
FB
2092 int cyls, int heads, int secs)
2093{
2094 bs->cyls = cyls;
2095 bs->heads = heads;
2096 bs->secs = secs;
2097}
2098
46d4767d
FB
2099void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
2100{
2101 bs->translation = translation;
2102}
2103
5fafdf24 2104void bdrv_get_geometry_hint(BlockDriverState *bs,
b338082b
FB
2105 int *pcyls, int *pheads, int *psecs)
2106{
2107 *pcyls = bs->cyls;
2108 *pheads = bs->heads;
2109 *psecs = bs->secs;
2110}
2111
0563e191
ZYW
2112/* throttling disk io limits */
2113void bdrv_set_io_limits(BlockDriverState *bs,
2114 BlockIOLimit *io_limits)
2115{
2116 bs->io_limits = *io_limits;
2117 bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
2118}
2119
5bbdbb46
BS
2120/* Recognize floppy formats */
2121typedef struct FDFormat {
2122 FDriveType drive;
2123 uint8_t last_sect;
2124 uint8_t max_track;
2125 uint8_t max_head;
f8d3d128 2126 FDriveRate rate;
5bbdbb46
BS
2127} FDFormat;
2128
2129static const FDFormat fd_formats[] = {
2130 /* First entry is default format */
2131 /* 1.44 MB 3"1/2 floppy disks */
f8d3d128
HP
2132 { FDRIVE_DRV_144, 18, 80, 1, FDRIVE_RATE_500K, },
2133 { FDRIVE_DRV_144, 20, 80, 1, FDRIVE_RATE_500K, },
2134 { FDRIVE_DRV_144, 21, 80, 1, FDRIVE_RATE_500K, },
2135 { FDRIVE_DRV_144, 21, 82, 1, FDRIVE_RATE_500K, },
2136 { FDRIVE_DRV_144, 21, 83, 1, FDRIVE_RATE_500K, },
2137 { FDRIVE_DRV_144, 22, 80, 1, FDRIVE_RATE_500K, },
2138 { FDRIVE_DRV_144, 23, 80, 1, FDRIVE_RATE_500K, },
2139 { FDRIVE_DRV_144, 24, 80, 1, FDRIVE_RATE_500K, },
5bbdbb46 2140 /* 2.88 MB 3"1/2 floppy disks */
f8d3d128
HP
2141 { FDRIVE_DRV_288, 36, 80, 1, FDRIVE_RATE_1M, },
2142 { FDRIVE_DRV_288, 39, 80, 1, FDRIVE_RATE_1M, },
2143 { FDRIVE_DRV_288, 40, 80, 1, FDRIVE_RATE_1M, },
2144 { FDRIVE_DRV_288, 44, 80, 1, FDRIVE_RATE_1M, },
2145 { FDRIVE_DRV_288, 48, 80, 1, FDRIVE_RATE_1M, },
5bbdbb46 2146 /* 720 kB 3"1/2 floppy disks */
f8d3d128
HP
2147 { FDRIVE_DRV_144, 9, 80, 1, FDRIVE_RATE_250K, },
2148 { FDRIVE_DRV_144, 10, 80, 1, FDRIVE_RATE_250K, },
2149 { FDRIVE_DRV_144, 10, 82, 1, FDRIVE_RATE_250K, },
2150 { FDRIVE_DRV_144, 10, 83, 1, FDRIVE_RATE_250K, },
2151 { FDRIVE_DRV_144, 13, 80, 1, FDRIVE_RATE_250K, },
2152 { FDRIVE_DRV_144, 14, 80, 1, FDRIVE_RATE_250K, },
5bbdbb46 2153 /* 1.2 MB 5"1/4 floppy disks */
f8d3d128
HP
2154 { FDRIVE_DRV_120, 15, 80, 1, FDRIVE_RATE_500K, },
2155 { FDRIVE_DRV_120, 18, 80, 1, FDRIVE_RATE_500K, },
2156 { FDRIVE_DRV_120, 18, 82, 1, FDRIVE_RATE_500K, },
2157 { FDRIVE_DRV_120, 18, 83, 1, FDRIVE_RATE_500K, },
2158 { FDRIVE_DRV_120, 20, 80, 1, FDRIVE_RATE_500K, },
5bbdbb46 2159 /* 720 kB 5"1/4 floppy disks */
f8d3d128
HP
2160 { FDRIVE_DRV_120, 9, 80, 1, FDRIVE_RATE_250K, },
2161 { FDRIVE_DRV_120, 11, 80, 1, FDRIVE_RATE_250K, },
5bbdbb46 2162 /* 360 kB 5"1/4 floppy disks */
f8d3d128
HP
2163 { FDRIVE_DRV_120, 9, 40, 1, FDRIVE_RATE_300K, },
2164 { FDRIVE_DRV_120, 9, 40, 0, FDRIVE_RATE_300K, },
2165 { FDRIVE_DRV_120, 10, 41, 1, FDRIVE_RATE_300K, },
2166 { FDRIVE_DRV_120, 10, 42, 1, FDRIVE_RATE_300K, },
5bbdbb46 2167 /* 320 kB 5"1/4 floppy disks */
f8d3d128
HP
2168 { FDRIVE_DRV_120, 8, 40, 1, FDRIVE_RATE_250K, },
2169 { FDRIVE_DRV_120, 8, 40, 0, FDRIVE_RATE_250K, },
5bbdbb46 2170 /* 360 kB must match 5"1/4 better than 3"1/2... */
f8d3d128 2171 { FDRIVE_DRV_144, 9, 80, 0, FDRIVE_RATE_250K, },
5bbdbb46 2172 /* end */
f8d3d128 2173 { FDRIVE_DRV_NONE, -1, -1, 0, 0, },
5bbdbb46
BS
2174};
2175
2176void bdrv_get_floppy_geometry_hint(BlockDriverState *bs, int *nb_heads,
2177 int *max_track, int *last_sect,
f8d3d128
HP
2178 FDriveType drive_in, FDriveType *drive,
2179 FDriveRate *rate)
5bbdbb46
BS
2180{
2181 const FDFormat *parse;
2182 uint64_t nb_sectors, size;
2183 int i, first_match, match;
2184
2185 bdrv_get_geometry_hint(bs, nb_heads, max_track, last_sect);
2186 if (*nb_heads != 0 && *max_track != 0 && *last_sect != 0) {
2187 /* User defined disk */
f8d3d128 2188 *rate = FDRIVE_RATE_500K;
5bbdbb46
BS
2189 } else {
2190 bdrv_get_geometry(bs, &nb_sectors);
2191 match = -1;
2192 first_match = -1;
2193 for (i = 0; ; i++) {
2194 parse = &fd_formats[i];
2195 if (parse->drive == FDRIVE_DRV_NONE) {
2196 break;
2197 }
2198 if (drive_in == parse->drive ||
2199 drive_in == FDRIVE_DRV_NONE) {
2200 size = (parse->max_head + 1) * parse->max_track *
2201 parse->last_sect;
2202 if (nb_sectors == size) {
2203 match = i;
2204 break;
2205 }
2206 if (first_match == -1) {
2207 first_match = i;
2208 }
2209 }
2210 }
2211 if (match == -1) {
2212 if (first_match == -1) {
2213 match = 1;
2214 } else {
2215 match = first_match;
2216 }
2217 parse = &fd_formats[match];
2218 }
2219 *nb_heads = parse->max_head + 1;
2220 *max_track = parse->max_track;
2221 *last_sect = parse->last_sect;
2222 *drive = parse->drive;
f8d3d128 2223 *rate = parse->rate;
5bbdbb46
BS
2224 }
2225}
2226
46d4767d
FB
2227int bdrv_get_translation_hint(BlockDriverState *bs)
2228{
2229 return bs->translation;
2230}
2231
abd7f68d
MA
2232void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
2233 BlockErrorAction on_write_error)
2234{
2235 bs->on_read_error = on_read_error;
2236 bs->on_write_error = on_write_error;
2237}
2238
2239BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
2240{
2241 return is_read ? bs->on_read_error : bs->on_write_error;
2242}
2243
b338082b
FB
2244int bdrv_is_read_only(BlockDriverState *bs)
2245{
2246 return bs->read_only;
2247}
2248
985a03b0
TS
2249int bdrv_is_sg(BlockDriverState *bs)
2250{
2251 return bs->sg;
2252}
2253
e900a7b7
CH
2254int bdrv_enable_write_cache(BlockDriverState *bs)
2255{
2256 return bs->enable_write_cache;
2257}
2258
ea2384d3
FB
2259int bdrv_is_encrypted(BlockDriverState *bs)
2260{
2261 if (bs->backing_hd && bs->backing_hd->encrypted)
2262 return 1;
2263 return bs->encrypted;
2264}
2265
c0f4ce77
AL
2266int bdrv_key_required(BlockDriverState *bs)
2267{
2268 BlockDriverState *backing_hd = bs->backing_hd;
2269
2270 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
2271 return 1;
2272 return (bs->encrypted && !bs->valid_key);
2273}
2274
ea2384d3
FB
2275int bdrv_set_key(BlockDriverState *bs, const char *key)
2276{
2277 int ret;
2278 if (bs->backing_hd && bs->backing_hd->encrypted) {
2279 ret = bdrv_set_key(bs->backing_hd, key);
2280 if (ret < 0)
2281 return ret;
2282 if (!bs->encrypted)
2283 return 0;
2284 }
fd04a2ae
SH
2285 if (!bs->encrypted) {
2286 return -EINVAL;
2287 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
2288 return -ENOMEDIUM;
2289 }
c0f4ce77 2290 ret = bs->drv->bdrv_set_key(bs, key);
bb5fc20f
AL
2291 if (ret < 0) {
2292 bs->valid_key = 0;
2293 } else if (!bs->valid_key) {
2294 bs->valid_key = 1;
2295 /* call the change callback now, we skipped it on open */
7d4b4ba5 2296 bdrv_dev_change_media_cb(bs, true);
bb5fc20f 2297 }
c0f4ce77 2298 return ret;
ea2384d3
FB
2299}
2300
2301void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
2302{
19cb3738 2303 if (!bs->drv) {
ea2384d3
FB
2304 buf[0] = '\0';
2305 } else {
2306 pstrcpy(buf, buf_size, bs->drv->format_name);
2307 }
2308}
2309
5fafdf24 2310void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
ea2384d3
FB
2311 void *opaque)
2312{
2313 BlockDriver *drv;
2314
8a22f02a 2315 QLIST_FOREACH(drv, &bdrv_drivers, list) {
ea2384d3
FB
2316 it(opaque, drv->format_name);
2317 }
2318}
2319
b338082b
FB
2320BlockDriverState *bdrv_find(const char *name)
2321{
2322 BlockDriverState *bs;
2323
1b7bdbc1
SH
2324 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2325 if (!strcmp(name, bs->device_name)) {
b338082b 2326 return bs;
1b7bdbc1 2327 }
b338082b
FB
2328 }
2329 return NULL;
2330}
2331
2f399b0a
MA
2332BlockDriverState *bdrv_next(BlockDriverState *bs)
2333{
2334 if (!bs) {
2335 return QTAILQ_FIRST(&bdrv_states);
2336 }
2337 return QTAILQ_NEXT(bs, list);
2338}
2339
51de9760 2340void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
81d0912d
FB
2341{
2342 BlockDriverState *bs;
2343
1b7bdbc1 2344 QTAILQ_FOREACH(bs, &bdrv_states, list) {
51de9760 2345 it(opaque, bs);
81d0912d
FB
2346 }
2347}
2348
ea2384d3
FB
2349const char *bdrv_get_device_name(BlockDriverState *bs)
2350{
2351 return bs->device_name;
2352}
2353
c6ca28d6
AL
2354void bdrv_flush_all(void)
2355{
2356 BlockDriverState *bs;
2357
1b7bdbc1 2358 QTAILQ_FOREACH(bs, &bdrv_states, list) {
29cdb251 2359 bdrv_flush(bs);
1b7bdbc1 2360 }
c6ca28d6
AL
2361}
2362
f2feebbd
KW
2363int bdrv_has_zero_init(BlockDriverState *bs)
2364{
2365 assert(bs->drv);
2366
336c1c12
KW
2367 if (bs->drv->bdrv_has_zero_init) {
2368 return bs->drv->bdrv_has_zero_init(bs);
f2feebbd
KW
2369 }
2370
2371 return 1;
2372}
2373
376ae3f1
SH
2374typedef struct BdrvCoIsAllocatedData {
2375 BlockDriverState *bs;
2376 int64_t sector_num;
2377 int nb_sectors;
2378 int *pnum;
2379 int ret;
2380 bool done;
2381} BdrvCoIsAllocatedData;
2382
f58c7b35
TS
2383/*
2384 * Returns true iff the specified sector is present in the disk image. Drivers
2385 * not implementing the functionality are assumed to not support backing files,
2386 * hence all their sectors are reported as allocated.
2387 *
bd9533e3
SH
2388 * If 'sector_num' is beyond the end of the disk image the return value is 0
2389 * and 'pnum' is set to 0.
2390 *
f58c7b35
TS
2391 * 'pnum' is set to the number of sectors (including and immediately following
2392 * the specified sector) that are known to be in the same
2393 * allocated/unallocated state.
2394 *
bd9533e3
SH
2395 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
2396 * beyond the end of the disk image it will be clamped.
f58c7b35 2397 */
060f51c9
SH
2398int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
2399 int nb_sectors, int *pnum)
f58c7b35 2400{
bd9533e3
SH
2401 int64_t n;
2402
2403 if (sector_num >= bs->total_sectors) {
2404 *pnum = 0;
2405 return 0;
2406 }
2407
2408 n = bs->total_sectors - sector_num;
2409 if (n < nb_sectors) {
2410 nb_sectors = n;
2411 }
2412
6aebab14 2413 if (!bs->drv->bdrv_co_is_allocated) {
bd9533e3 2414 *pnum = nb_sectors;
f58c7b35
TS
2415 return 1;
2416 }
6aebab14 2417
060f51c9
SH
2418 return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
2419}
2420
2421/* Coroutine wrapper for bdrv_is_allocated() */
2422static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
2423{
2424 BdrvCoIsAllocatedData *data = opaque;
2425 BlockDriverState *bs = data->bs;
2426
2427 data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
2428 data->pnum);
2429 data->done = true;
2430}
2431
2432/*
2433 * Synchronous wrapper around bdrv_co_is_allocated().
2434 *
2435 * See bdrv_co_is_allocated() for details.
2436 */
2437int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
2438 int *pnum)
2439{
6aebab14
SH
2440 Coroutine *co;
2441 BdrvCoIsAllocatedData data = {
2442 .bs = bs,
2443 .sector_num = sector_num,
2444 .nb_sectors = nb_sectors,
2445 .pnum = pnum,
2446 .done = false,
2447 };
2448
2449 co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
2450 qemu_coroutine_enter(co, &data);
2451 while (!data.done) {
2452 qemu_aio_wait();
2453 }
2454 return data.ret;
f58c7b35
TS
2455}
2456
b2023818 2457BlockInfoList *qmp_query_block(Error **errp)
b338082b 2458{
b2023818 2459 BlockInfoList *head = NULL, *cur_item = NULL;
b338082b
FB
2460 BlockDriverState *bs;
2461
1b7bdbc1 2462 QTAILQ_FOREACH(bs, &bdrv_states, list) {
b2023818 2463 BlockInfoList *info = g_malloc0(sizeof(*info));
d15e5465 2464
b2023818
LC
2465 info->value = g_malloc0(sizeof(*info->value));
2466 info->value->device = g_strdup(bs->device_name);
2467 info->value->type = g_strdup("unknown");
2468 info->value->locked = bdrv_dev_is_medium_locked(bs);
2469 info->value->removable = bdrv_dev_has_removable_media(bs);
d15e5465 2470
e4def80b 2471 if (bdrv_dev_has_removable_media(bs)) {
b2023818
LC
2472 info->value->has_tray_open = true;
2473 info->value->tray_open = bdrv_dev_is_tray_open(bs);
e4def80b 2474 }
f04ef601
LC
2475
2476 if (bdrv_iostatus_is_enabled(bs)) {
b2023818
LC
2477 info->value->has_io_status = true;
2478 info->value->io_status = bs->iostatus;
f04ef601
LC
2479 }
2480
19cb3738 2481 if (bs->drv) {
b2023818
LC
2482 info->value->has_inserted = true;
2483 info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
2484 info->value->inserted->file = g_strdup(bs->filename);
2485 info->value->inserted->ro = bs->read_only;
2486 info->value->inserted->drv = g_strdup(bs->drv->format_name);
2487 info->value->inserted->encrypted = bs->encrypted;
2488 if (bs->backing_file[0]) {
2489 info->value->inserted->has_backing_file = true;
2490 info->value->inserted->backing_file = g_strdup(bs->backing_file);
376253ec 2491 }
727f005e
ZYW
2492
2493 if (bs->io_limits_enabled) {
2494 info->value->inserted->bps =
2495 bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
2496 info->value->inserted->bps_rd =
2497 bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
2498 info->value->inserted->bps_wr =
2499 bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
2500 info->value->inserted->iops =
2501 bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
2502 info->value->inserted->iops_rd =
2503 bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
2504 info->value->inserted->iops_wr =
2505 bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
2506 }
b2023818 2507 }
d15e5465 2508
b2023818
LC
2509 /* XXX: waiting for the qapi to support GSList */
2510 if (!cur_item) {
2511 head = cur_item = info;
2512 } else {
2513 cur_item->next = info;
2514 cur_item = info;
b338082b 2515 }
b338082b 2516 }
d15e5465 2517
b2023818 2518 return head;
b338082b 2519}
a36e69dd 2520
f11f57e4
LC
2521/* Consider exposing this as a full fledged QMP command */
2522static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
2523{
2524 BlockStats *s;
2525
2526 s = g_malloc0(sizeof(*s));
2527
2528 if (bs->device_name[0]) {
2529 s->has_device = true;
2530 s->device = g_strdup(bs->device_name);
294cc35f
KW
2531 }
2532
f11f57e4
LC
2533 s->stats = g_malloc0(sizeof(*s->stats));
2534 s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
2535 s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
2536 s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
2537 s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
2538 s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
2539 s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
2540 s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
2541 s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
2542 s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];
2543
294cc35f 2544 if (bs->file) {
f11f57e4
LC
2545 s->has_parent = true;
2546 s->parent = qmp_query_blockstat(bs->file, NULL);
294cc35f
KW
2547 }
2548
f11f57e4 2549 return s;
294cc35f
KW
2550}
2551
f11f57e4 2552BlockStatsList *qmp_query_blockstats(Error **errp)
218a536a 2553{
f11f57e4 2554 BlockStatsList *head = NULL, *cur_item = NULL;
a36e69dd
TS
2555 BlockDriverState *bs;
2556
1b7bdbc1 2557 QTAILQ_FOREACH(bs, &bdrv_states, list) {
f11f57e4
LC
2558 BlockStatsList *info = g_malloc0(sizeof(*info));
2559 info->value = qmp_query_blockstat(bs, NULL);
2560
2561 /* XXX: waiting for the qapi to support GSList */
2562 if (!cur_item) {
2563 head = cur_item = info;
2564 } else {
2565 cur_item->next = info;
2566 cur_item = info;
2567 }
a36e69dd 2568 }
218a536a 2569
f11f57e4 2570 return head;
a36e69dd 2571}
ea2384d3 2572
045df330
AL
2573const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
2574{
2575 if (bs->backing_hd && bs->backing_hd->encrypted)
2576 return bs->backing_file;
2577 else if (bs->encrypted)
2578 return bs->filename;
2579 else
2580 return NULL;
2581}
2582
5fafdf24 2583void bdrv_get_backing_filename(BlockDriverState *bs,
83f64091
FB
2584 char *filename, int filename_size)
2585{
3574c608 2586 pstrcpy(filename, filename_size, bs->backing_file);
83f64091
FB
2587}
2588
5fafdf24 2589int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
faea38e7
FB
2590 const uint8_t *buf, int nb_sectors)
2591{
2592 BlockDriver *drv = bs->drv;
2593 if (!drv)
19cb3738 2594 return -ENOMEDIUM;
faea38e7
FB
2595 if (!drv->bdrv_write_compressed)
2596 return -ENOTSUP;
fbb7b4e0
KW
2597 if (bdrv_check_request(bs, sector_num, nb_sectors))
2598 return -EIO;
a55eb92c 2599
c6d22830 2600 if (bs->dirty_bitmap) {
7cd1e32a 2601 set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2602 }
a55eb92c 2603
faea38e7
FB
2604 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
2605}
3b46e624 2606
faea38e7
FB
2607int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2608{
2609 BlockDriver *drv = bs->drv;
2610 if (!drv)
19cb3738 2611 return -ENOMEDIUM;
faea38e7
FB
2612 if (!drv->bdrv_get_info)
2613 return -ENOTSUP;
2614 memset(bdi, 0, sizeof(*bdi));
2615 return drv->bdrv_get_info(bs, bdi);
2616}
2617
45566e9c
CH
2618int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2619 int64_t pos, int size)
178e08a5
AL
2620{
2621 BlockDriver *drv = bs->drv;
2622 if (!drv)
2623 return -ENOMEDIUM;
7cdb1f6d
MK
2624 if (drv->bdrv_save_vmstate)
2625 return drv->bdrv_save_vmstate(bs, buf, pos, size);
2626 if (bs->file)
2627 return bdrv_save_vmstate(bs->file, buf, pos, size);
2628 return -ENOTSUP;
178e08a5
AL
2629}
2630
45566e9c
CH
2631int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2632 int64_t pos, int size)
178e08a5
AL
2633{
2634 BlockDriver *drv = bs->drv;
2635 if (!drv)
2636 return -ENOMEDIUM;
7cdb1f6d
MK
2637 if (drv->bdrv_load_vmstate)
2638 return drv->bdrv_load_vmstate(bs, buf, pos, size);
2639 if (bs->file)
2640 return bdrv_load_vmstate(bs->file, buf, pos, size);
2641 return -ENOTSUP;
178e08a5
AL
2642}
2643
8b9b0cc2
KW
2644void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
2645{
2646 BlockDriver *drv = bs->drv;
2647
2648 if (!drv || !drv->bdrv_debug_event) {
2649 return;
2650 }
2651
2652 return drv->bdrv_debug_event(bs, event);
2653
2654}
2655
faea38e7
FB
2656/**************************************************************/
2657/* handling of snapshots */
2658
feeee5ac
MDCF
2659int bdrv_can_snapshot(BlockDriverState *bs)
2660{
2661 BlockDriver *drv = bs->drv;
07b70bfb 2662 if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
feeee5ac
MDCF
2663 return 0;
2664 }
2665
2666 if (!drv->bdrv_snapshot_create) {
2667 if (bs->file != NULL) {
2668 return bdrv_can_snapshot(bs->file);
2669 }
2670 return 0;
2671 }
2672
2673 return 1;
2674}
2675
199630b6
BS
2676int bdrv_is_snapshot(BlockDriverState *bs)
2677{
2678 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
2679}
2680
f9092b10
MA
2681BlockDriverState *bdrv_snapshots(void)
2682{
2683 BlockDriverState *bs;
2684
3ac906f7 2685 if (bs_snapshots) {
f9092b10 2686 return bs_snapshots;
3ac906f7 2687 }
f9092b10
MA
2688
2689 bs = NULL;
2690 while ((bs = bdrv_next(bs))) {
2691 if (bdrv_can_snapshot(bs)) {
3ac906f7
MA
2692 bs_snapshots = bs;
2693 return bs;
f9092b10
MA
2694 }
2695 }
2696 return NULL;
f9092b10
MA
2697}
2698
5fafdf24 2699int bdrv_snapshot_create(BlockDriverState *bs,
faea38e7
FB
2700 QEMUSnapshotInfo *sn_info)
2701{
2702 BlockDriver *drv = bs->drv;
2703 if (!drv)
19cb3738 2704 return -ENOMEDIUM;
7cdb1f6d
MK
2705 if (drv->bdrv_snapshot_create)
2706 return drv->bdrv_snapshot_create(bs, sn_info);
2707 if (bs->file)
2708 return bdrv_snapshot_create(bs->file, sn_info);
2709 return -ENOTSUP;
faea38e7
FB
2710}
2711
5fafdf24 2712int bdrv_snapshot_goto(BlockDriverState *bs,
faea38e7
FB
2713 const char *snapshot_id)
2714{
2715 BlockDriver *drv = bs->drv;
7cdb1f6d
MK
2716 int ret, open_ret;
2717
faea38e7 2718 if (!drv)
19cb3738 2719 return -ENOMEDIUM;
7cdb1f6d
MK
2720 if (drv->bdrv_snapshot_goto)
2721 return drv->bdrv_snapshot_goto(bs, snapshot_id);
2722
2723 if (bs->file) {
2724 drv->bdrv_close(bs);
2725 ret = bdrv_snapshot_goto(bs->file, snapshot_id);
2726 open_ret = drv->bdrv_open(bs, bs->open_flags);
2727 if (open_ret < 0) {
2728 bdrv_delete(bs->file);
2729 bs->drv = NULL;
2730 return open_ret;
2731 }
2732 return ret;
2733 }
2734
2735 return -ENOTSUP;
faea38e7
FB
2736}
2737
2738int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
2739{
2740 BlockDriver *drv = bs->drv;
2741 if (!drv)
19cb3738 2742 return -ENOMEDIUM;
7cdb1f6d
MK
2743 if (drv->bdrv_snapshot_delete)
2744 return drv->bdrv_snapshot_delete(bs, snapshot_id);
2745 if (bs->file)
2746 return bdrv_snapshot_delete(bs->file, snapshot_id);
2747 return -ENOTSUP;
faea38e7
FB
2748}
2749
5fafdf24 2750int bdrv_snapshot_list(BlockDriverState *bs,
faea38e7
FB
2751 QEMUSnapshotInfo **psn_info)
2752{
2753 BlockDriver *drv = bs->drv;
2754 if (!drv)
19cb3738 2755 return -ENOMEDIUM;
7cdb1f6d
MK
2756 if (drv->bdrv_snapshot_list)
2757 return drv->bdrv_snapshot_list(bs, psn_info);
2758 if (bs->file)
2759 return bdrv_snapshot_list(bs->file, psn_info);
2760 return -ENOTSUP;
faea38e7
FB
2761}
2762
51ef6727 2763int bdrv_snapshot_load_tmp(BlockDriverState *bs,
2764 const char *snapshot_name)
2765{
2766 BlockDriver *drv = bs->drv;
2767 if (!drv) {
2768 return -ENOMEDIUM;
2769 }
2770 if (!bs->read_only) {
2771 return -EINVAL;
2772 }
2773 if (drv->bdrv_snapshot_load_tmp) {
2774 return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
2775 }
2776 return -ENOTSUP;
2777}
2778
e8a6bb9c
MT
2779BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
2780 const char *backing_file)
2781{
2782 if (!bs->drv) {
2783 return NULL;
2784 }
2785
2786 if (bs->backing_hd) {
2787 if (strcmp(bs->backing_file, backing_file) == 0) {
2788 return bs->backing_hd;
2789 } else {
2790 return bdrv_find_backing_image(bs->backing_hd, backing_file);
2791 }
2792 }
2793
2794 return NULL;
2795}
2796
#define NB_SUFFIXES 4

/* Format 'size' into buf with a K/M/G/T suffix: plain decimal up to
 * 999, one decimal place below 10 units, otherwise a rounded integer. */
char *get_human_readable_size(char *buf, int buf_size, int64_t size)
{
    static const char suffixes[NB_SUFFIXES] = "KMGT";
    int64_t unit = 1024;
    int i;

    if (size <= 999) {
        snprintf(buf, buf_size, "%" PRId64, size);
        return buf;
    }

    for (i = 0; i < NB_SUFFIXES; i++) {
        if (size < (10 * unit)) {
            snprintf(buf, buf_size, "%0.1f%c",
                     (double)size / unit, suffixes[i]);
            break;
        } else if (size < (1000 * unit) || i == (NB_SUFFIXES - 1)) {
            /* Round to nearest by adding half a unit before dividing. */
            snprintf(buf, buf_size, "%" PRId64 "%c",
                     ((size + (unit >> 1)) / unit), suffixes[i]);
            break;
        }
        unit = unit * 1024;
    }
    return buf;
}
2826
2827char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
2828{
2829 char buf1[128], date_buf[128], clock_buf[128];
3b9f94e1
FB
2830#ifdef _WIN32
2831 struct tm *ptm;
2832#else
faea38e7 2833 struct tm tm;
3b9f94e1 2834#endif
faea38e7
FB
2835 time_t ti;
2836 int64_t secs;
2837
2838 if (!sn) {
5fafdf24
TS
2839 snprintf(buf, buf_size,
2840 "%-10s%-20s%7s%20s%15s",
faea38e7
FB
2841 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
2842 } else {
2843 ti = sn->date_sec;
3b9f94e1
FB
2844#ifdef _WIN32
2845 ptm = localtime(&ti);
2846 strftime(date_buf, sizeof(date_buf),
2847 "%Y-%m-%d %H:%M:%S", ptm);
2848#else
faea38e7
FB
2849 localtime_r(&ti, &tm);
2850 strftime(date_buf, sizeof(date_buf),
2851 "%Y-%m-%d %H:%M:%S", &tm);
3b9f94e1 2852#endif
faea38e7
FB
2853 secs = sn->vm_clock_nsec / 1000000000;
2854 snprintf(clock_buf, sizeof(clock_buf),
2855 "%02d:%02d:%02d.%03d",
2856 (int)(secs / 3600),
2857 (int)((secs / 60) % 60),
5fafdf24 2858 (int)(secs % 60),
faea38e7
FB
2859 (int)((sn->vm_clock_nsec / 1000000) % 1000));
2860 snprintf(buf, buf_size,
5fafdf24 2861 "%-10s%-20s%7s%20s%15s",
faea38e7
FB
2862 sn->id_str, sn->name,
2863 get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
2864 date_buf,
2865 clock_buf);
2866 }
2867 return buf;
2868}
2869
ea2384d3 2870/**************************************************************/
83f64091 2871/* async I/Os */
ea2384d3 2872
3b69e4b9 2873BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
f141eafe 2874 QEMUIOVector *qiov, int nb_sectors,
3b69e4b9 2875 BlockDriverCompletionFunc *cb, void *opaque)
83f64091 2876{
bbf0a440
SH
2877 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
2878
b2a61371 2879 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
8c5873d6 2880 cb, opaque, false);
ea2384d3
FB
2881}
2882
f141eafe
AL
2883BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
2884 QEMUIOVector *qiov, int nb_sectors,
2885 BlockDriverCompletionFunc *cb, void *opaque)
ea2384d3 2886{
bbf0a440
SH
2887 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
2888
1a6e115b 2889 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
8c5873d6 2890 cb, opaque, true);
83f64091
FB
2891}
2892
40b4f539
KW
2893
2894typedef struct MultiwriteCB {
2895 int error;
2896 int num_requests;
2897 int num_callbacks;
2898 struct {
2899 BlockDriverCompletionFunc *cb;
2900 void *opaque;
2901 QEMUIOVector *free_qiov;
40b4f539
KW
2902 } callbacks[];
2903} MultiwriteCB;
2904
2905static void multiwrite_user_cb(MultiwriteCB *mcb)
2906{
2907 int i;
2908
2909 for (i = 0; i < mcb->num_callbacks; i++) {
2910 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
1e1ea48d
SH
2911 if (mcb->callbacks[i].free_qiov) {
2912 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
2913 }
7267c094 2914 g_free(mcb->callbacks[i].free_qiov);
40b4f539
KW
2915 }
2916}
2917
2918static void multiwrite_cb(void *opaque, int ret)
2919{
2920 MultiwriteCB *mcb = opaque;
2921
6d519a5f
SH
2922 trace_multiwrite_cb(mcb, ret);
2923
cb6d3ca0 2924 if (ret < 0 && !mcb->error) {
40b4f539 2925 mcb->error = ret;
40b4f539
KW
2926 }
2927
2928 mcb->num_requests--;
2929 if (mcb->num_requests == 0) {
de189a1b 2930 multiwrite_user_cb(mcb);
7267c094 2931 g_free(mcb);
40b4f539
KW
2932 }
2933}
2934
2935static int multiwrite_req_compare(const void *a, const void *b)
2936{
77be4366
CH
2937 const BlockRequest *req1 = a, *req2 = b;
2938
2939 /*
2940 * Note that we can't simply subtract req2->sector from req1->sector
2941 * here as that could overflow the return value.
2942 */
2943 if (req1->sector > req2->sector) {
2944 return 1;
2945 } else if (req1->sector < req2->sector) {
2946 return -1;
2947 } else {
2948 return 0;
2949 }
40b4f539
KW
2950}
2951
2952/*
2953 * Takes a bunch of requests and tries to merge them. Returns the number of
2954 * requests that remain after merging.
2955 */
2956static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
2957 int num_reqs, MultiwriteCB *mcb)
2958{
2959 int i, outidx;
2960
2961 // Sort requests by start sector
2962 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
2963
2964 // Check if adjacent requests touch the same clusters. If so, combine them,
2965 // filling up gaps with zero sectors.
2966 outidx = 0;
2967 for (i = 1; i < num_reqs; i++) {
2968 int merge = 0;
2969 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
2970
b6a127a1 2971 // Handle exactly sequential writes and overlapping writes.
40b4f539
KW
2972 if (reqs[i].sector <= oldreq_last) {
2973 merge = 1;
2974 }
2975
e2a305fb
CH
2976 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
2977 merge = 0;
2978 }
2979
40b4f539
KW
2980 if (merge) {
2981 size_t size;
7267c094 2982 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
40b4f539
KW
2983 qemu_iovec_init(qiov,
2984 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
2985
2986 // Add the first request to the merged one. If the requests are
2987 // overlapping, drop the last sectors of the first request.
2988 size = (reqs[i].sector - reqs[outidx].sector) << 9;
2989 qemu_iovec_concat(qiov, reqs[outidx].qiov, size);
2990
b6a127a1
PB
2991 // We should need to add any zeros between the two requests
2992 assert (reqs[i].sector <= oldreq_last);
40b4f539
KW
2993
2994 // Add the second request
2995 qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);
2996
cbf1dff2 2997 reqs[outidx].nb_sectors = qiov->size >> 9;
40b4f539
KW
2998 reqs[outidx].qiov = qiov;
2999
3000 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
3001 } else {
3002 outidx++;
3003 reqs[outidx].sector = reqs[i].sector;
3004 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
3005 reqs[outidx].qiov = reqs[i].qiov;
3006 }
3007 }
3008
3009 return outidx + 1;
3010}
3011
3012/*
3013 * Submit multiple AIO write requests at once.
3014 *
3015 * On success, the function returns 0 and all requests in the reqs array have
3016 * been submitted. In error case this function returns -1, and any of the
3017 * requests may or may not be submitted yet. In particular, this means that the
3018 * callback will be called for some of the requests, for others it won't. The
3019 * caller must check the error field of the BlockRequest to wait for the right
3020 * callbacks (if error != 0, no callback will be called).
3021 *
3022 * The implementation may modify the contents of the reqs array, e.g. to merge
3023 * requests. However, the fields opaque and error are left unmodified as they
3024 * are used to signal failure for a single request to the caller.
3025 */
3026int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
3027{
40b4f539
KW
3028 MultiwriteCB *mcb;
3029 int i;
3030
301db7c2
RH
3031 /* don't submit writes if we don't have a medium */
3032 if (bs->drv == NULL) {
3033 for (i = 0; i < num_reqs; i++) {
3034 reqs[i].error = -ENOMEDIUM;
3035 }
3036 return -1;
3037 }
3038
40b4f539
KW
3039 if (num_reqs == 0) {
3040 return 0;
3041 }
3042
3043 // Create MultiwriteCB structure
7267c094 3044 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
40b4f539
KW
3045 mcb->num_requests = 0;
3046 mcb->num_callbacks = num_reqs;
3047
3048 for (i = 0; i < num_reqs; i++) {
3049 mcb->callbacks[i].cb = reqs[i].cb;
3050 mcb->callbacks[i].opaque = reqs[i].opaque;
3051 }
3052
3053 // Check for mergable requests
3054 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
3055
6d519a5f
SH
3056 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
3057
df9309fb
PB
3058 /* Run the aio requests. */
3059 mcb->num_requests = num_reqs;
40b4f539 3060 for (i = 0; i < num_reqs; i++) {
ad54ae80 3061 bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
40b4f539 3062 reqs[i].nb_sectors, multiwrite_cb, mcb);
40b4f539
KW
3063 }
3064
3065 return 0;
40b4f539
KW
3066}
3067
83f64091 3068void bdrv_aio_cancel(BlockDriverAIOCB *acb)
83f64091 3069{
6bbff9a0 3070 acb->pool->cancel(acb);
83f64091
FB
3071}
3072
98f90dba
ZYW
3073/* block I/O throttling */
3074static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
3075 bool is_write, double elapsed_time, uint64_t *wait)
3076{
3077 uint64_t bps_limit = 0;
3078 double bytes_limit, bytes_base, bytes_res;
3079 double slice_time, wait_time;
3080
3081 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3082 bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
3083 } else if (bs->io_limits.bps[is_write]) {
3084 bps_limit = bs->io_limits.bps[is_write];
3085 } else {
3086 if (wait) {
3087 *wait = 0;
3088 }
3089
3090 return false;
3091 }
3092
3093 slice_time = bs->slice_end - bs->slice_start;
3094 slice_time /= (NANOSECONDS_PER_SECOND);
3095 bytes_limit = bps_limit * slice_time;
3096 bytes_base = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
3097 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3098 bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
3099 }
3100
3101 /* bytes_base: the bytes of data which have been read/written; and
3102 * it is obtained from the history statistic info.
3103 * bytes_res: the remaining bytes of data which need to be read/written.
3104 * (bytes_base + bytes_res) / bps_limit: used to calcuate
3105 * the total time for completing reading/writting all data.
3106 */
3107 bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
3108
3109 if (bytes_base + bytes_res <= bytes_limit) {
3110 if (wait) {
3111 *wait = 0;
3112 }
3113
3114 return false;
3115 }
3116
3117 /* Calc approx time to dispatch */
3118 wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
3119
3120 /* When the I/O rate at runtime exceeds the limits,
3121 * bs->slice_end need to be extended in order that the current statistic
3122 * info can be kept until the timer fire, so it is increased and tuned
3123 * based on the result of experiment.
3124 */
3125 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3126 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3127 if (wait) {
3128 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3129 }
3130
3131 return true;
3132}
3133
3134static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
3135 double elapsed_time, uint64_t *wait)
3136{
3137 uint64_t iops_limit = 0;
3138 double ios_limit, ios_base;
3139 double slice_time, wait_time;
3140
3141 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3142 iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
3143 } else if (bs->io_limits.iops[is_write]) {
3144 iops_limit = bs->io_limits.iops[is_write];
3145 } else {
3146 if (wait) {
3147 *wait = 0;
3148 }
3149
3150 return false;
3151 }
3152
3153 slice_time = bs->slice_end - bs->slice_start;
3154 slice_time /= (NANOSECONDS_PER_SECOND);
3155 ios_limit = iops_limit * slice_time;
3156 ios_base = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
3157 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3158 ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
3159 }
3160
3161 if (ios_base + 1 <= ios_limit) {
3162 if (wait) {
3163 *wait = 0;
3164 }
3165
3166 return false;
3167 }
3168
3169 /* Calc approx time to dispatch */
3170 wait_time = (ios_base + 1) / iops_limit;
3171 if (wait_time > elapsed_time) {
3172 wait_time = wait_time - elapsed_time;
3173 } else {
3174 wait_time = 0;
3175 }
3176
3177 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3178 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3179 if (wait) {
3180 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3181 }
3182
3183 return true;
3184}
3185
3186static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
3187 bool is_write, int64_t *wait)
3188{
3189 int64_t now, max_wait;
3190 uint64_t bps_wait = 0, iops_wait = 0;
3191 double elapsed_time;
3192 int bps_ret, iops_ret;
3193
3194 now = qemu_get_clock_ns(vm_clock);
3195 if ((bs->slice_start < now)
3196 && (bs->slice_end > now)) {
3197 bs->slice_end = now + bs->slice_time;
3198 } else {
3199 bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
3200 bs->slice_start = now;
3201 bs->slice_end = now + bs->slice_time;
3202
3203 bs->io_base.bytes[is_write] = bs->nr_bytes[is_write];
3204 bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];
3205
3206 bs->io_base.ios[is_write] = bs->nr_ops[is_write];
3207 bs->io_base.ios[!is_write] = bs->nr_ops[!is_write];
3208 }
3209
3210 elapsed_time = now - bs->slice_start;
3211 elapsed_time /= (NANOSECONDS_PER_SECOND);
3212
3213 bps_ret = bdrv_exceed_bps_limits(bs, nb_sectors,
3214 is_write, elapsed_time, &bps_wait);
3215 iops_ret = bdrv_exceed_iops_limits(bs, is_write,
3216 elapsed_time, &iops_wait);
3217 if (bps_ret || iops_ret) {
3218 max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
3219 if (wait) {
3220 *wait = max_wait;
3221 }
3222
3223 now = qemu_get_clock_ns(vm_clock);
3224 if (bs->slice_end < now + max_wait) {
3225 bs->slice_end = now + max_wait;
3226 }
3227
3228 return true;
3229 }
3230
3231 if (wait) {
3232 *wait = 0;
3233 }
3234
3235 return false;
3236}
ce1a14dc 3237
83f64091
FB
3238/**************************************************************/
3239/* async block device emulation */
3240
c16b5a2c
CH
3241typedef struct BlockDriverAIOCBSync {
3242 BlockDriverAIOCB common;
3243 QEMUBH *bh;
3244 int ret;
3245 /* vector translation state */
3246 QEMUIOVector *qiov;
3247 uint8_t *bounce;
3248 int is_write;
3249} BlockDriverAIOCBSync;
3250
3251static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
3252{
b666d239
KW
3253 BlockDriverAIOCBSync *acb =
3254 container_of(blockacb, BlockDriverAIOCBSync, common);
6a7ad299 3255 qemu_bh_delete(acb->bh);
36afc451 3256 acb->bh = NULL;
c16b5a2c
CH
3257 qemu_aio_release(acb);
3258}
3259
3260static AIOPool bdrv_em_aio_pool = {
3261 .aiocb_size = sizeof(BlockDriverAIOCBSync),
3262 .cancel = bdrv_aio_cancel_em,
3263};
3264
ce1a14dc 3265static void bdrv_aio_bh_cb(void *opaque)
83f64091 3266{
ce1a14dc 3267 BlockDriverAIOCBSync *acb = opaque;
f141eafe 3268
f141eafe
AL
3269 if (!acb->is_write)
3270 qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
ceb42de8 3271 qemu_vfree(acb->bounce);
ce1a14dc 3272 acb->common.cb(acb->common.opaque, acb->ret);
6a7ad299 3273 qemu_bh_delete(acb->bh);
36afc451 3274 acb->bh = NULL;
ce1a14dc 3275 qemu_aio_release(acb);
83f64091 3276}
beac80cd 3277
f141eafe
AL
3278static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
3279 int64_t sector_num,
3280 QEMUIOVector *qiov,
3281 int nb_sectors,
3282 BlockDriverCompletionFunc *cb,
3283 void *opaque,
3284 int is_write)
3285
83f64091 3286{
ce1a14dc 3287 BlockDriverAIOCBSync *acb;
ce1a14dc 3288
c16b5a2c 3289 acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
f141eafe
AL
3290 acb->is_write = is_write;
3291 acb->qiov = qiov;
e268ca52 3292 acb->bounce = qemu_blockalign(bs, qiov->size);
3f3aace8 3293 acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
f141eafe
AL
3294
3295 if (is_write) {
3296 qemu_iovec_to_buffer(acb->qiov, acb->bounce);
1ed20acf 3297 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
f141eafe 3298 } else {
1ed20acf 3299 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
f141eafe
AL
3300 }
3301
ce1a14dc 3302 qemu_bh_schedule(acb->bh);
f141eafe 3303
ce1a14dc 3304 return &acb->common;
beac80cd
FB
3305}
3306
f141eafe
AL
3307static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
3308 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
ce1a14dc 3309 BlockDriverCompletionFunc *cb, void *opaque)
beac80cd 3310{
f141eafe
AL
3311 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
3312}
83f64091 3313
f141eafe
AL
3314static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
3315 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3316 BlockDriverCompletionFunc *cb, void *opaque)
3317{
3318 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
beac80cd 3319}
beac80cd 3320
68485420
KW
3321
3322typedef struct BlockDriverAIOCBCoroutine {
3323 BlockDriverAIOCB common;
3324 BlockRequest req;
3325 bool is_write;
3326 QEMUBH* bh;
3327} BlockDriverAIOCBCoroutine;
3328
3329static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
3330{
3331 qemu_aio_flush();
3332}
3333
3334static AIOPool bdrv_em_co_aio_pool = {
3335 .aiocb_size = sizeof(BlockDriverAIOCBCoroutine),
3336 .cancel = bdrv_aio_co_cancel_em,
3337};
3338
35246a68 3339static void bdrv_co_em_bh(void *opaque)
68485420
KW
3340{
3341 BlockDriverAIOCBCoroutine *acb = opaque;
3342
3343 acb->common.cb(acb->common.opaque, acb->req.error);
3344 qemu_bh_delete(acb->bh);
3345 qemu_aio_release(acb);
3346}
3347
b2a61371
SH
3348/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
3349static void coroutine_fn bdrv_co_do_rw(void *opaque)
3350{
3351 BlockDriverAIOCBCoroutine *acb = opaque;
3352 BlockDriverState *bs = acb->common.bs;
3353
3354 if (!acb->is_write) {
3355 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
470c0504 3356 acb->req.nb_sectors, acb->req.qiov, 0);
b2a61371
SH
3357 } else {
3358 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
f08f2dda 3359 acb->req.nb_sectors, acb->req.qiov, 0);
b2a61371
SH
3360 }
3361
35246a68 3362 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
b2a61371
SH
3363 qemu_bh_schedule(acb->bh);
3364}
3365
68485420
KW
3366static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
3367 int64_t sector_num,
3368 QEMUIOVector *qiov,
3369 int nb_sectors,
3370 BlockDriverCompletionFunc *cb,
3371 void *opaque,
8c5873d6 3372 bool is_write)
68485420
KW
3373{
3374 Coroutine *co;
3375 BlockDriverAIOCBCoroutine *acb;
3376
3377 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3378 acb->req.sector = sector_num;
3379 acb->req.nb_sectors = nb_sectors;
3380 acb->req.qiov = qiov;
3381 acb->is_write = is_write;
3382
8c5873d6 3383 co = qemu_coroutine_create(bdrv_co_do_rw);
68485420
KW
3384 qemu_coroutine_enter(co, acb);
3385
3386 return &acb->common;
3387}
3388
07f07615 3389static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
b2e12bc6 3390{
07f07615
PB
3391 BlockDriverAIOCBCoroutine *acb = opaque;
3392 BlockDriverState *bs = acb->common.bs;
b2e12bc6 3393
07f07615
PB
3394 acb->req.error = bdrv_co_flush(bs);
3395 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
b2e12bc6 3396 qemu_bh_schedule(acb->bh);
b2e12bc6
CH
3397}
3398
07f07615 3399BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
016f5cf6
AG
3400 BlockDriverCompletionFunc *cb, void *opaque)
3401{
07f07615 3402 trace_bdrv_aio_flush(bs, opaque);
016f5cf6 3403
07f07615
PB
3404 Coroutine *co;
3405 BlockDriverAIOCBCoroutine *acb;
016f5cf6 3406
07f07615
PB
3407 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3408 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
3409 qemu_coroutine_enter(co, acb);
016f5cf6 3410
016f5cf6
AG
3411 return &acb->common;
3412}
3413
4265d620
PB
3414static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
3415{
3416 BlockDriverAIOCBCoroutine *acb = opaque;
3417 BlockDriverState *bs = acb->common.bs;
3418
3419 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
3420 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3421 qemu_bh_schedule(acb->bh);
3422}
3423
3424BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
3425 int64_t sector_num, int nb_sectors,
3426 BlockDriverCompletionFunc *cb, void *opaque)
3427{
3428 Coroutine *co;
3429 BlockDriverAIOCBCoroutine *acb;
3430
3431 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
3432
3433 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3434 acb->req.sector = sector_num;
3435 acb->req.nb_sectors = nb_sectors;
3436 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
3437 qemu_coroutine_enter(co, acb);
3438
3439 return &acb->common;
3440}
3441
ea2384d3
FB
3442void bdrv_init(void)
3443{
5efa9d5a 3444 module_call_init(MODULE_INIT_BLOCK);
ea2384d3 3445}
ce1a14dc 3446
eb852011
MA
3447void bdrv_init_with_whitelist(void)
3448{
3449 use_bdrv_whitelist = 1;
3450 bdrv_init();
3451}
3452
c16b5a2c
CH
3453void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
3454 BlockDriverCompletionFunc *cb, void *opaque)
ce1a14dc 3455{
ce1a14dc
PB
3456 BlockDriverAIOCB *acb;
3457
6bbff9a0
AL
3458 if (pool->free_aiocb) {
3459 acb = pool->free_aiocb;
3460 pool->free_aiocb = acb->next;
ce1a14dc 3461 } else {
7267c094 3462 acb = g_malloc0(pool->aiocb_size);
6bbff9a0 3463 acb->pool = pool;
ce1a14dc
PB
3464 }
3465 acb->bs = bs;
3466 acb->cb = cb;
3467 acb->opaque = opaque;
3468 return acb;
3469}
3470
3471void qemu_aio_release(void *p)
3472{
6bbff9a0
AL
3473 BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
3474 AIOPool *pool = acb->pool;
3475 acb->next = pool->free_aiocb;
3476 pool->free_aiocb = acb;
ce1a14dc 3477}
19cb3738 3478
f9f05dc5
KW
3479/**************************************************************/
3480/* Coroutine block device emulation */
3481
3482typedef struct CoroutineIOCompletion {
3483 Coroutine *coroutine;
3484 int ret;
3485} CoroutineIOCompletion;
3486
3487static void bdrv_co_io_em_complete(void *opaque, int ret)
3488{
3489 CoroutineIOCompletion *co = opaque;
3490
3491 co->ret = ret;
3492 qemu_coroutine_enter(co->coroutine, NULL);
3493}
3494
3495static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
3496 int nb_sectors, QEMUIOVector *iov,
3497 bool is_write)
3498{
3499 CoroutineIOCompletion co = {
3500 .coroutine = qemu_coroutine_self(),
3501 };
3502 BlockDriverAIOCB *acb;
3503
3504 if (is_write) {
a652d160
SH
3505 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
3506 bdrv_co_io_em_complete, &co);
f9f05dc5 3507 } else {
a652d160
SH
3508 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
3509 bdrv_co_io_em_complete, &co);
f9f05dc5
KW
3510 }
3511
59370aaa 3512 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
f9f05dc5
KW
3513 if (!acb) {
3514 return -EIO;
3515 }
3516 qemu_coroutine_yield();
3517
3518 return co.ret;
3519}
3520
3521static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
3522 int64_t sector_num, int nb_sectors,
3523 QEMUIOVector *iov)
3524{
3525 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
3526}
3527
3528static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
3529 int64_t sector_num, int nb_sectors,
3530 QEMUIOVector *iov)
3531{
3532 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
3533}
3534
07f07615 3535static void coroutine_fn bdrv_flush_co_entry(void *opaque)
e7a8a783 3536{
07f07615
PB
3537 RwCo *rwco = opaque;
3538
3539 rwco->ret = bdrv_co_flush(rwco->bs);
3540}
3541
3542int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
3543{
eb489bb1
KW
3544 int ret;
3545
29cdb251 3546 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
07f07615 3547 return 0;
eb489bb1
KW
3548 }
3549
ca716364 3550 /* Write back cached data to the OS even with cache=unsafe */
eb489bb1
KW
3551 if (bs->drv->bdrv_co_flush_to_os) {
3552 ret = bs->drv->bdrv_co_flush_to_os(bs);
3553 if (ret < 0) {
3554 return ret;
3555 }
3556 }
3557
ca716364
KW
3558 /* But don't actually force it to the disk with cache=unsafe */
3559 if (bs->open_flags & BDRV_O_NO_FLUSH) {
3560 return 0;
3561 }
3562
eb489bb1 3563 if (bs->drv->bdrv_co_flush_to_disk) {
29cdb251 3564 ret = bs->drv->bdrv_co_flush_to_disk(bs);
07f07615
PB
3565 } else if (bs->drv->bdrv_aio_flush) {
3566 BlockDriverAIOCB *acb;
3567 CoroutineIOCompletion co = {
3568 .coroutine = qemu_coroutine_self(),
3569 };
3570
3571 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
3572 if (acb == NULL) {
29cdb251 3573 ret = -EIO;
07f07615
PB
3574 } else {
3575 qemu_coroutine_yield();
29cdb251 3576 ret = co.ret;
07f07615 3577 }
07f07615
PB
3578 } else {
3579 /*
3580 * Some block drivers always operate in either writethrough or unsafe
3581 * mode and don't support bdrv_flush therefore. Usually qemu doesn't
3582 * know how the server works (because the behaviour is hardcoded or
3583 * depends on server-side configuration), so we can't ensure that
3584 * everything is safe on disk. Returning an error doesn't work because
3585 * that would break guests even if the server operates in writethrough
3586 * mode.
3587 *
3588 * Let's hope the user knows what he's doing.
3589 */
29cdb251 3590 ret = 0;
07f07615 3591 }
29cdb251
PB
3592 if (ret < 0) {
3593 return ret;
3594 }
3595
3596 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
3597 * in the case of cache=unsafe, so there are no useless flushes.
3598 */
3599 return bdrv_co_flush(bs->file);
07f07615
PB
3600}
3601
0f15423c
AL
3602void bdrv_invalidate_cache(BlockDriverState *bs)
3603{
3604 if (bs->drv && bs->drv->bdrv_invalidate_cache) {
3605 bs->drv->bdrv_invalidate_cache(bs);
3606 }
3607}
3608
3609void bdrv_invalidate_cache_all(void)
3610{
3611 BlockDriverState *bs;
3612
3613 QTAILQ_FOREACH(bs, &bdrv_states, list) {
3614 bdrv_invalidate_cache(bs);
3615 }
3616}
3617
07f07615
PB
3618int bdrv_flush(BlockDriverState *bs)
3619{
3620 Coroutine *co;
3621 RwCo rwco = {
3622 .bs = bs,
3623 .ret = NOT_DONE,
e7a8a783 3624 };
e7a8a783 3625
07f07615
PB
3626 if (qemu_in_coroutine()) {
3627 /* Fast-path if already in coroutine context */
3628 bdrv_flush_co_entry(&rwco);
3629 } else {
3630 co = qemu_coroutine_create(bdrv_flush_co_entry);
3631 qemu_coroutine_enter(co, &rwco);
3632 while (rwco.ret == NOT_DONE) {
3633 qemu_aio_wait();
3634 }
e7a8a783 3635 }
07f07615
PB
3636
3637 return rwco.ret;
e7a8a783
KW
3638}
3639
4265d620
PB
3640static void coroutine_fn bdrv_discard_co_entry(void *opaque)
3641{
3642 RwCo *rwco = opaque;
3643
3644 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
3645}
3646
3647int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
3648 int nb_sectors)
3649{
3650 if (!bs->drv) {
3651 return -ENOMEDIUM;
3652 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
3653 return -EIO;
3654 } else if (bs->read_only) {
3655 return -EROFS;
3656 } else if (bs->drv->bdrv_co_discard) {
3657 return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
3658 } else if (bs->drv->bdrv_aio_discard) {
3659 BlockDriverAIOCB *acb;
3660 CoroutineIOCompletion co = {
3661 .coroutine = qemu_coroutine_self(),
3662 };
3663
3664 acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
3665 bdrv_co_io_em_complete, &co);
3666 if (acb == NULL) {
3667 return -EIO;
3668 } else {
3669 qemu_coroutine_yield();
3670 return co.ret;
3671 }
4265d620
PB
3672 } else {
3673 return 0;
3674 }
3675}
3676
3677int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
3678{
3679 Coroutine *co;
3680 RwCo rwco = {
3681 .bs = bs,
3682 .sector_num = sector_num,
3683 .nb_sectors = nb_sectors,
3684 .ret = NOT_DONE,
3685 };
3686
3687 if (qemu_in_coroutine()) {
3688 /* Fast-path if already in coroutine context */
3689 bdrv_discard_co_entry(&rwco);
3690 } else {
3691 co = qemu_coroutine_create(bdrv_discard_co_entry);
3692 qemu_coroutine_enter(co, &rwco);
3693 while (rwco.ret == NOT_DONE) {
3694 qemu_aio_wait();
3695 }
3696 }
3697
3698 return rwco.ret;
3699}
3700
19cb3738
FB
3701/**************************************************************/
3702/* removable device support */
3703
3704/**
3705 * Return TRUE if the media is present
3706 */
3707int bdrv_is_inserted(BlockDriverState *bs)
3708{
3709 BlockDriver *drv = bs->drv;
a1aff5bf 3710
19cb3738
FB
3711 if (!drv)
3712 return 0;
3713 if (!drv->bdrv_is_inserted)
a1aff5bf
MA
3714 return 1;
3715 return drv->bdrv_is_inserted(bs);
19cb3738
FB
3716}
3717
3718/**
8e49ca46
MA
3719 * Return whether the media changed since the last call to this
3720 * function, or -ENOTSUP if we don't know. Most drivers don't know.
19cb3738
FB
3721 */
3722int bdrv_media_changed(BlockDriverState *bs)
3723{
3724 BlockDriver *drv = bs->drv;
19cb3738 3725
8e49ca46
MA
3726 if (drv && drv->bdrv_media_changed) {
3727 return drv->bdrv_media_changed(bs);
3728 }
3729 return -ENOTSUP;
19cb3738
FB
3730}
3731
3732/**
3733 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
3734 */
f36f3949 3735void bdrv_eject(BlockDriverState *bs, bool eject_flag)
19cb3738
FB
3736{
3737 BlockDriver *drv = bs->drv;
19cb3738 3738
822e1cd1
MA
3739 if (drv && drv->bdrv_eject) {
3740 drv->bdrv_eject(bs, eject_flag);
19cb3738 3741 }
6f382ed2
LC
3742
3743 if (bs->device_name[0] != '\0') {
3744 bdrv_emit_qmp_eject_event(bs, eject_flag);
3745 }
19cb3738
FB
3746}
3747
19cb3738
FB
3748/**
3749 * Lock or unlock the media (if it is locked, the user won't be able
3750 * to eject it manually).
3751 */
025e849a 3752void bdrv_lock_medium(BlockDriverState *bs, bool locked)
19cb3738
FB
3753{
3754 BlockDriver *drv = bs->drv;
3755
025e849a 3756 trace_bdrv_lock_medium(bs, locked);
b8c6d095 3757
025e849a
MA
3758 if (drv && drv->bdrv_lock_medium) {
3759 drv->bdrv_lock_medium(bs, locked);
19cb3738
FB
3760 }
3761}
985a03b0
TS
3762
3763/* needed for generic scsi interface */
3764
3765int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
3766{
3767 BlockDriver *drv = bs->drv;
3768
3769 if (drv && drv->bdrv_ioctl)
3770 return drv->bdrv_ioctl(bs, req, buf);
3771 return -ENOTSUP;
3772}
7d780669 3773
221f715d
AL
3774BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
3775 unsigned long int req, void *buf,
3776 BlockDriverCompletionFunc *cb, void *opaque)
7d780669 3777{
221f715d 3778 BlockDriver *drv = bs->drv;
7d780669 3779
221f715d
AL
3780 if (drv && drv->bdrv_aio_ioctl)
3781 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
3782 return NULL;
7d780669 3783}
e268ca52 3784
7b6f9300
MA
3785void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
3786{
3787 bs->buffer_alignment = align;
3788}
7cd1e32a 3789
e268ca52
AL
3790void *qemu_blockalign(BlockDriverState *bs, size_t size)
3791{
3792 return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
3793}
7cd1e32a 3794
3795void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
3796{
3797 int64_t bitmap_size;
a55eb92c 3798
aaa0eb75 3799 bs->dirty_count = 0;
a55eb92c 3800 if (enable) {
c6d22830
JK
3801 if (!bs->dirty_bitmap) {
3802 bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
3803 BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
3804 bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
a55eb92c 3805
7267c094 3806 bs->dirty_bitmap = g_malloc0(bitmap_size);
a55eb92c 3807 }
7cd1e32a 3808 } else {
c6d22830 3809 if (bs->dirty_bitmap) {
7267c094 3810 g_free(bs->dirty_bitmap);
c6d22830 3811 bs->dirty_bitmap = NULL;
a55eb92c 3812 }
7cd1e32a 3813 }
3814}
3815
3816int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
3817{
6ea44308 3818 int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
a55eb92c 3819
c6d22830
JK
3820 if (bs->dirty_bitmap &&
3821 (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
6d59fec1
MT
3822 return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
3823 (1UL << (chunk % (sizeof(unsigned long) * 8))));
7cd1e32a 3824 } else {
3825 return 0;
3826 }
3827}
3828
a55eb92c
JK
3829void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
3830 int nr_sectors)
7cd1e32a 3831{
3832 set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
3833}
aaa0eb75
LS
3834
3835int64_t bdrv_get_dirty_count(BlockDriverState *bs)
3836{
3837 return bs->dirty_count;
3838}
f88e1a42 3839
db593f25
MT
3840void bdrv_set_in_use(BlockDriverState *bs, int in_use)
3841{
3842 assert(bs->in_use != in_use);
3843 bs->in_use = in_use;
3844}
3845
3846int bdrv_in_use(BlockDriverState *bs)
3847{
3848 return bs->in_use;
3849}
3850
28a7282a
LC
3851void bdrv_iostatus_enable(BlockDriverState *bs)
3852{
d6bf279e 3853 bs->iostatus_enabled = true;
58e21ef5 3854 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
28a7282a
LC
3855}
3856
3857/* The I/O status is only enabled if the drive explicitly
3858 * enables it _and_ the VM is configured to stop on errors */
3859bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
3860{
d6bf279e 3861 return (bs->iostatus_enabled &&
28a7282a
LC
3862 (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
3863 bs->on_write_error == BLOCK_ERR_STOP_ANY ||
3864 bs->on_read_error == BLOCK_ERR_STOP_ANY));
3865}
3866
3867void bdrv_iostatus_disable(BlockDriverState *bs)
3868{
d6bf279e 3869 bs->iostatus_enabled = false;
28a7282a
LC
3870}
3871
3872void bdrv_iostatus_reset(BlockDriverState *bs)
3873{
3874 if (bdrv_iostatus_is_enabled(bs)) {
58e21ef5 3875 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
28a7282a
LC
3876 }
3877}
3878
3879/* XXX: Today this is set by device models because it makes the implementation
3880 quite simple. However, the block layer knows about the error, so it's
3881 possible to implement this without device models being involved */
3882void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
3883{
58e21ef5
LC
3884 if (bdrv_iostatus_is_enabled(bs) &&
3885 bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
28a7282a 3886 assert(error >= 0);
58e21ef5
LC
3887 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
3888 BLOCK_DEVICE_IO_STATUS_FAILED;
28a7282a
LC
3889 }
3890}
3891
a597e79c
CH
3892void
3893bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
3894 enum BlockAcctType type)
3895{
3896 assert(type < BDRV_MAX_IOTYPE);
3897
3898 cookie->bytes = bytes;
c488c7f6 3899 cookie->start_time_ns = get_clock();
a597e79c
CH
3900 cookie->type = type;
3901}
3902
3903void
3904bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
3905{
3906 assert(cookie->type < BDRV_MAX_IOTYPE);
3907
3908 bs->nr_bytes[cookie->type] += cookie->bytes;
3909 bs->nr_ops[cookie->type]++;
c488c7f6 3910 bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
a597e79c
CH
3911}
3912
f88e1a42
JS
3913int bdrv_img_create(const char *filename, const char *fmt,
3914 const char *base_filename, const char *base_fmt,
3915 char *options, uint64_t img_size, int flags)
3916{
3917 QEMUOptionParameter *param = NULL, *create_options = NULL;
d220894e 3918 QEMUOptionParameter *backing_fmt, *backing_file, *size;
f88e1a42
JS
3919 BlockDriverState *bs = NULL;
3920 BlockDriver *drv, *proto_drv;
96df67d1 3921 BlockDriver *backing_drv = NULL;
f88e1a42
JS
3922 int ret = 0;
3923
3924 /* Find driver and parse its options */
3925 drv = bdrv_find_format(fmt);
3926 if (!drv) {
3927 error_report("Unknown file format '%s'", fmt);
4f70f249 3928 ret = -EINVAL;
f88e1a42
JS
3929 goto out;
3930 }
3931
3932 proto_drv = bdrv_find_protocol(filename);
3933 if (!proto_drv) {
3934 error_report("Unknown protocol '%s'", filename);
4f70f249 3935 ret = -EINVAL;
f88e1a42
JS
3936 goto out;
3937 }
3938
3939 create_options = append_option_parameters(create_options,
3940 drv->create_options);
3941 create_options = append_option_parameters(create_options,
3942 proto_drv->create_options);
3943
3944 /* Create parameter list with default values */
3945 param = parse_option_parameters("", create_options, param);
3946
3947 set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
3948
3949 /* Parse -o options */
3950 if (options) {
3951 param = parse_option_parameters(options, create_options, param);
3952 if (param == NULL) {
3953 error_report("Invalid options for file format '%s'.", fmt);
4f70f249 3954 ret = -EINVAL;
f88e1a42
JS
3955 goto out;
3956 }
3957 }
3958
3959 if (base_filename) {
3960 if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
3961 base_filename)) {
3962 error_report("Backing file not supported for file format '%s'",
3963 fmt);
4f70f249 3964 ret = -EINVAL;
f88e1a42
JS
3965 goto out;
3966 }
3967 }
3968
3969 if (base_fmt) {
3970 if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
3971 error_report("Backing file format not supported for file "
3972 "format '%s'", fmt);
4f70f249 3973 ret = -EINVAL;
f88e1a42
JS
3974 goto out;
3975 }
3976 }
3977
792da93a
JS
3978 backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
3979 if (backing_file && backing_file->value.s) {
3980 if (!strcmp(filename, backing_file->value.s)) {
3981 error_report("Error: Trying to create an image with the "
3982 "same filename as the backing file");
4f70f249 3983 ret = -EINVAL;
792da93a
JS
3984 goto out;
3985 }
3986 }
3987
f88e1a42
JS
3988 backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
3989 if (backing_fmt && backing_fmt->value.s) {
96df67d1
SH
3990 backing_drv = bdrv_find_format(backing_fmt->value.s);
3991 if (!backing_drv) {
f88e1a42
JS
3992 error_report("Unknown backing file format '%s'",
3993 backing_fmt->value.s);
4f70f249 3994 ret = -EINVAL;
f88e1a42
JS
3995 goto out;
3996 }
3997 }
3998
3999 // The size for the image must always be specified, with one exception:
4000 // If we are using a backing file, we can obtain the size from there
d220894e
KW
4001 size = get_option_parameter(param, BLOCK_OPT_SIZE);
4002 if (size && size->value.n == -1) {
f88e1a42
JS
4003 if (backing_file && backing_file->value.s) {
4004 uint64_t size;
f88e1a42
JS
4005 char buf[32];
4006
f88e1a42
JS
4007 bs = bdrv_new("");
4008
96df67d1 4009 ret = bdrv_open(bs, backing_file->value.s, flags, backing_drv);
f88e1a42 4010 if (ret < 0) {
96df67d1 4011 error_report("Could not open '%s'", backing_file->value.s);
f88e1a42
JS
4012 goto out;
4013 }
4014 bdrv_get_geometry(bs, &size);
4015 size *= 512;
4016
4017 snprintf(buf, sizeof(buf), "%" PRId64, size);
4018 set_option_parameter(param, BLOCK_OPT_SIZE, buf);
4019 } else {
4020 error_report("Image creation needs a size parameter");
4f70f249 4021 ret = -EINVAL;
f88e1a42
JS
4022 goto out;
4023 }
4024 }
4025
4026 printf("Formatting '%s', fmt=%s ", filename, fmt);
4027 print_option_parameters(param);
4028 puts("");
4029
4030 ret = bdrv_create(drv, filename, param);
4031
4032 if (ret < 0) {
4033 if (ret == -ENOTSUP) {
4034 error_report("Formatting or formatting option not supported for "
4035 "file format '%s'", fmt);
4036 } else if (ret == -EFBIG) {
4037 error_report("The image size is too large for file format '%s'",
4038 fmt);
4039 } else {
4040 error_report("%s: error while creating %s: %s", filename, fmt,
4041 strerror(-ret));
4042 }
4043 }
4044
4045out:
4046 free_option_parameters(create_options);
4047 free_option_parameters(param);
4048
4049 if (bs) {
4050 bdrv_delete(bs);
4051 }
4f70f249
JS
4052
4053 return ret;
f88e1a42 4054}
eeec61f2
SH
4055
4056void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs,
4057 BlockDriverCompletionFunc *cb, void *opaque)
4058{
4059 BlockJob *job;
4060
4061 if (bs->job || bdrv_in_use(bs)) {
4062 return NULL;
4063 }
4064 bdrv_set_in_use(bs, 1);
4065
4066 job = g_malloc0(job_type->instance_size);
4067 job->job_type = job_type;
4068 job->bs = bs;
4069 job->cb = cb;
4070 job->opaque = opaque;
4071 bs->job = job;
4072 return job;
4073}
4074
4075void block_job_complete(BlockJob *job, int ret)
4076{
4077 BlockDriverState *bs = job->bs;
4078
4079 assert(bs->job == job);
4080 job->cb(job->opaque, ret);
4081 bs->job = NULL;
4082 g_free(job);
4083 bdrv_set_in_use(bs, 0);
4084}
4085
4086int block_job_set_speed(BlockJob *job, int64_t value)
4087{
9f25eccc
PB
4088 int rc;
4089
eeec61f2
SH
4090 if (!job->job_type->set_speed) {
4091 return -ENOTSUP;
4092 }
9f25eccc
PB
4093 rc = job->job_type->set_speed(job, value);
4094 if (rc == 0) {
4095 job->speed = value;
4096 }
4097 return rc;
eeec61f2
SH
4098}
4099
4100void block_job_cancel(BlockJob *job)
4101{
4102 job->cancelled = true;
4103}
4104
4105bool block_job_is_cancelled(BlockJob *job)
4106{
4107 return job->cancelled;
4108}
3e914655
PB
4109
4110void block_job_cancel_sync(BlockJob *job)
4111{
4112 BlockDriverState *bs = job->bs;
4113
4114 assert(bs->job == job);
4115 block_job_cancel(job);
4116 while (bs->job != NULL && bs->job->busy) {
4117 qemu_aio_wait();
4118 }
4119}