]> git.proxmox.com Git - qemu.git/blame - block.c
qcow2: Simplify count_cow_clusters
[qemu.git] / block.c
CommitLineData
fc01f7e7
FB
1/*
2 * QEMU System Emulator block driver
5fafdf24 3 *
fc01f7e7 4 * Copyright (c) 2003 Fabrice Bellard
5fafdf24 5 *
fc01f7e7
FB
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
3990d09a 24#include "config-host.h"
faf07963 25#include "qemu-common.h"
6d519a5f 26#include "trace.h"
376253ec 27#include "monitor.h"
ea2384d3 28#include "block_int.h"
5efa9d5a 29#include "module.h"
f795e743 30#include "qjson.h"
68485420 31#include "qemu-coroutine.h"
b2023818 32#include "qmp-commands.h"
0563e191 33#include "qemu-timer.h"
fc01f7e7 34
71e72a19 35#ifdef CONFIG_BSD
7674e7bf
FB
36#include <sys/types.h>
37#include <sys/stat.h>
38#include <sys/ioctl.h>
72cf2d4f 39#include <sys/queue.h>
c5e97233 40#ifndef __DragonFly__
7674e7bf
FB
41#include <sys/disk.h>
42#endif
c5e97233 43#endif
7674e7bf 44
49dc768d
AL
45#ifdef _WIN32
46#include <windows.h>
47#endif
48
1c9805a3
SH
49#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
50
470c0504
SH
/* Per-request flag bits passed to bdrv_co_do_readv()/bdrv_co_do_writev() */
typedef enum {
    BDRV_REQ_COPY_ON_READ = 1 << 0,
    BDRV_REQ_ZERO_WRITE   = 1 << 1,
} BdrvRequestFlags;
55
7d4b4ba5 56static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
f141eafe
AL
57static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
58 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
c87c0672 59 BlockDriverCompletionFunc *cb, void *opaque);
f141eafe
AL
60static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
61 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
ce1a14dc 62 BlockDriverCompletionFunc *cb, void *opaque);
f9f05dc5
KW
63static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
64 int64_t sector_num, int nb_sectors,
65 QEMUIOVector *iov);
66static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
67 int64_t sector_num, int nb_sectors,
68 QEMUIOVector *iov);
c5fbe571 69static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
470c0504
SH
70 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
71 BdrvRequestFlags flags);
1c9805a3 72static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
f08f2dda
SH
73 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
74 BdrvRequestFlags flags);
b2a61371
SH
75static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
76 int64_t sector_num,
77 QEMUIOVector *qiov,
78 int nb_sectors,
79 BlockDriverCompletionFunc *cb,
80 void *opaque,
8c5873d6 81 bool is_write);
b2a61371 82static void coroutine_fn bdrv_co_do_rw(void *opaque);
ec530c81 83
98f90dba
ZYW
84static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
85 bool is_write, double elapsed_time, uint64_t *wait);
86static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
87 double elapsed_time, uint64_t *wait);
88static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
89 bool is_write, int64_t *wait);
90
1b7bdbc1
SH
91static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
92 QTAILQ_HEAD_INITIALIZER(bdrv_states);
7ee930d0 93
8a22f02a
SH
94static QLIST_HEAD(, BlockDriver) bdrv_drivers =
95 QLIST_HEAD_INITIALIZER(bdrv_drivers);
ea2384d3 96
f9092b10
MA
97/* The device to use for VM snapshots */
98static BlockDriverState *bs_snapshots;
99
eb852011
MA
100/* If non-zero, use only whitelisted block drivers */
101static int use_bdrv_whitelist;
102
9e0b22f4
SH
103#ifdef _WIN32
/*
 * Return non-zero if @filename begins with an ASCII letter immediately
 * followed by ':' (for example "c:" or "D:\dir").
 */
static int is_windows_drive_prefix(const char *filename)
{
    const char letter = filename[0];
    const int is_lower = (letter >= 'a' && letter <= 'z');
    const int is_upper = (letter >= 'A' && letter <= 'Z');

    /* filename[1] is only inspected once the first character is a letter */
    return (is_lower || is_upper) && filename[1] == ':';
}
110
/*
 * Return 1 if @filename is exactly a drive specification ("x:") or starts
 * with one of the prefixes "\\.\" or "//./", otherwise 0.
 */
int is_windows_drive(const char *filename)
{
    /* exactly "<letter>:" and nothing more */
    if (is_windows_drive_prefix(filename) && filename[2] == '\0') {
        return 1;
    }

    if (strstart(filename, "\\\\.\\", NULL)) {
        return 1;
    }
    if (strstart(filename, "//./", NULL)) {
        return 1;
    }
    return 0;
}
121#endif
122
0563e191 123/* throttling disk I/O limits */
98f90dba
ZYW
124void bdrv_io_limits_disable(BlockDriverState *bs)
125{
126 bs->io_limits_enabled = false;
127
128 while (qemu_co_queue_next(&bs->throttled_reqs));
129
130 if (bs->block_timer) {
131 qemu_del_timer(bs->block_timer);
132 qemu_free_timer(bs->block_timer);
133 bs->block_timer = NULL;
134 }
135
136 bs->slice_start = 0;
137 bs->slice_end = 0;
138 bs->slice_time = 0;
139 memset(&bs->io_base, 0, sizeof(bs->io_base));
140}
141
0563e191
ZYW
142static void bdrv_block_timer(void *opaque)
143{
144 BlockDriverState *bs = opaque;
145
146 qemu_co_queue_next(&bs->throttled_reqs);
147}
148
149void bdrv_io_limits_enable(BlockDriverState *bs)
150{
151 qemu_co_queue_init(&bs->throttled_reqs);
152 bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
153 bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
154 bs->slice_start = qemu_get_clock_ns(vm_clock);
155 bs->slice_end = bs->slice_start + bs->slice_time;
156 memset(&bs->io_base, 0, sizeof(bs->io_base));
157 bs->io_limits_enabled = true;
158}
159
160bool bdrv_io_limits_enabled(BlockDriverState *bs)
161{
162 BlockIOLimit *io_limits = &bs->io_limits;
163 return io_limits->bps[BLOCK_IO_LIMIT_READ]
164 || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
165 || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
166 || io_limits->iops[BLOCK_IO_LIMIT_READ]
167 || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
168 || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
169}
170
98f90dba
ZYW
171static void bdrv_io_limits_intercept(BlockDriverState *bs,
172 bool is_write, int nb_sectors)
173{
174 int64_t wait_time = -1;
175
176 if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
177 qemu_co_queue_wait(&bs->throttled_reqs);
178 }
179
180 /* In fact, we hope to keep each request's timing, in FIFO mode. The next
181 * throttled requests will not be dequeued until the current request is
182 * allowed to be serviced. So if the current request still exceeds the
183 * limits, it will be inserted to the head. All requests followed it will
184 * be still in throttled_reqs queue.
185 */
186
187 while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
188 qemu_mod_timer(bs->block_timer,
189 wait_time + qemu_get_clock_ns(vm_clock));
190 qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
191 }
192
193 qemu_co_queue_next(&bs->throttled_reqs);
194}
195
9e0b22f4
SH
/*
 * Check whether @path looks like "<protocol>:...".
 *
 * Any ':' in the string counts.  On Windows, drive specifications and
 * device paths recognised by is_windows_drive()/is_windows_drive_prefix()
 * are excluded first.  Returns 1 if a protocol is present, 0 otherwise.
 */
static int path_has_protocol(const char *path)
{
#ifdef _WIN32
    if (is_windows_drive(path)) {
        return 0;
    }
    if (is_windows_drive_prefix(path)) {
        return 0;
    }
#endif

    if (strchr(path, ':') == NULL) {
        return 0;
    }
    return 1;
}
208
/*
 * Return non-zero if @path is absolute.
 *
 * Everything up to and including the first ':' (if any) is skipped, and the
 * character that follows must be a directory separator.  On Windows both
 * '/' and '\' are separators, and a path that starts with a separator is
 * absolute right away.
 */
int path_is_absolute(const char *path)
{
    const char *colon;
    const char *start;

#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (path[0] == '/' || path[0] == '\\') {
        return 1;
    }
#endif

    colon = strchr(path, ':');
    start = colon ? colon + 1 : path;

#ifdef _WIN32
    return (start[0] == '/' || start[0] == '\\');
#else
    return (start[0] == '/');
#endif
}
228
83f64091
FB
/*
 * Build in @dest (capacity @dest_size bytes) the path of @filename.
 *
 * If @filename is absolute it is copied unchanged.  Otherwise it is treated
 * as relative to @base_path: the part of @base_path up to and including its
 * last directory separator (or its first ':' when that comes later) is
 * copied, and @filename is appended.  URLs are supported.  Nothing is
 * written when @dest_size is not positive.
 */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *after_proto;
    const char *after_sep;
    int prefix_len;

    if (dest_size <= 0) {
        return;
    }

    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
        return;
    }

    /* position just past the first ':' of the base, if there is one */
    after_proto = strchr(base_path, ':');
    after_proto = after_proto ? after_proto + 1 : base_path;

    /* position just past the last directory separator of the base */
    after_sep = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *bslash = strrchr(base_path, '\\');

        if (!after_sep || bslash > after_sep) {
            after_sep = bslash;
        }
    }
#endif
    after_sep = after_sep ? after_sep + 1 : base_path;

    /* keep whichever of the two positions is further into the string */
    if (after_sep > after_proto) {
        after_proto = after_sep;
    }

    prefix_len = after_proto - base_path;
    if (prefix_len > dest_size - 1) {
        prefix_len = dest_size - 1;
    }
    memcpy(dest, base_path, prefix_len);
    dest[prefix_len] = '\0';
    pstrcat(dest, dest_size, filename);
}
272
5efa9d5a 273void bdrv_register(BlockDriver *bdrv)
ea2384d3 274{
8c5873d6
SH
275 /* Block drivers without coroutine functions need emulation */
276 if (!bdrv->bdrv_co_readv) {
f9f05dc5
KW
277 bdrv->bdrv_co_readv = bdrv_co_readv_em;
278 bdrv->bdrv_co_writev = bdrv_co_writev_em;
279
f8c35c1d
SH
280 /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
281 * the block driver lacks aio we need to emulate that too.
282 */
f9f05dc5
KW
283 if (!bdrv->bdrv_aio_readv) {
284 /* add AIO emulation layer */
285 bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
286 bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
f9f05dc5 287 }
83f64091 288 }
b2e12bc6 289
8a22f02a 290 QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
ea2384d3 291}
b338082b
FB
292
293/* create a new block device (by default it is empty) */
294BlockDriverState *bdrv_new(const char *device_name)
295{
1b7bdbc1 296 BlockDriverState *bs;
b338082b 297
7267c094 298 bs = g_malloc0(sizeof(BlockDriverState));
b338082b 299 pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
ea2384d3 300 if (device_name[0] != '\0') {
1b7bdbc1 301 QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
ea2384d3 302 }
28a7282a 303 bdrv_iostatus_disable(bs);
b338082b
FB
304 return bs;
305}
306
ea2384d3
FB
307BlockDriver *bdrv_find_format(const char *format_name)
308{
309 BlockDriver *drv1;
8a22f02a
SH
310 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
311 if (!strcmp(drv1->format_name, format_name)) {
ea2384d3 312 return drv1;
8a22f02a 313 }
ea2384d3
FB
314 }
315 return NULL;
316}
317
eb852011
MA
318static int bdrv_is_whitelisted(BlockDriver *drv)
319{
320 static const char *whitelist[] = {
321 CONFIG_BDRV_WHITELIST
322 };
323 const char **p;
324
325 if (!whitelist[0])
326 return 1; /* no whitelist, anything goes */
327
328 for (p = whitelist; *p; p++) {
329 if (!strcmp(drv->format_name, *p)) {
330 return 1;
331 }
332 }
333 return 0;
334}
335
336BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
337{
338 BlockDriver *drv = bdrv_find_format(format_name);
339 return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
340}
341
0e7e1989
KW
342int bdrv_create(BlockDriver *drv, const char* filename,
343 QEMUOptionParameter *options)
ea2384d3
FB
344{
345 if (!drv->bdrv_create)
346 return -ENOTSUP;
0e7e1989
KW
347
348 return drv->bdrv_create(filename, options);
ea2384d3
FB
349}
350
84a12e66
CH
351int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
352{
353 BlockDriver *drv;
354
b50cbabc 355 drv = bdrv_find_protocol(filename);
84a12e66 356 if (drv == NULL) {
16905d71 357 return -ENOENT;
84a12e66
CH
358 }
359
360 return bdrv_create(drv, filename, options);
361}
362
d5249393 363#ifdef _WIN32
95389c86 364void get_tmp_filename(char *filename, int size)
d5249393 365{
3b9f94e1 366 char temp_dir[MAX_PATH];
3b46e624 367
3b9f94e1
FB
368 GetTempPath(MAX_PATH, temp_dir);
369 GetTempFileName(temp_dir, "qem", 0, filename);
d5249393
FB
370}
371#else
/*
 * Create a temporary file and store its name in @filename (POSIX
 * implementation).
 *
 * @filename: output buffer
 * @size:     capacity of @filename in bytes
 *
 * The file is created with mkstemp() in $TMPDIR, or in /tmp when TMPDIR is
 * unset, and its descriptor is closed again straight away.  If the file
 * cannot be created (including when the template did not fit into @size
 * bytes), @filename is set to the empty string so that callers fail on it
 * instead of using a name that was never created.  Nothing is done when
 * @size is not positive.
 */
void get_tmp_filename(char *filename, int size)
{
    int fd;
    const char *tmpdir;

    if (size <= 0) {
        return;
    }

    /* XXX: race condition possible */
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/tmp";
    }
    snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);

    fd = mkstemp(filename);
    if (fd < 0) {
        /* do not hand back an un-created name, and do not close(-1) */
        filename[0] = '\0';
        return;
    }
    close(fd);
}
d5249393 384#endif
fc01f7e7 385
84a12e66
CH
386/*
387 * Detect host devices. By convention, /dev/cdrom[N] is always
388 * recognized as a host CDROM.
389 */
390static BlockDriver *find_hdev_driver(const char *filename)
391{
392 int score_max = 0, score;
393 BlockDriver *drv = NULL, *d;
394
395 QLIST_FOREACH(d, &bdrv_drivers, list) {
396 if (d->bdrv_probe_device) {
397 score = d->bdrv_probe_device(filename);
398 if (score > score_max) {
399 score_max = score;
400 drv = d;
401 }
402 }
403 }
404
405 return drv;
406}
407
b50cbabc 408BlockDriver *bdrv_find_protocol(const char *filename)
83f64091
FB
409{
410 BlockDriver *drv1;
411 char protocol[128];
1cec71e3 412 int len;
83f64091 413 const char *p;
19cb3738 414
66f82cee
KW
415 /* TODO Drivers without bdrv_file_open must be specified explicitly */
416
39508e7a
CH
417 /*
418 * XXX(hch): we really should not let host device detection
419 * override an explicit protocol specification, but moving this
420 * later breaks access to device names with colons in them.
421 * Thanks to the brain-dead persistent naming schemes on udev-
422 * based Linux systems those actually are quite common.
423 */
424 drv1 = find_hdev_driver(filename);
425 if (drv1) {
426 return drv1;
427 }
428
9e0b22f4 429 if (!path_has_protocol(filename)) {
39508e7a 430 return bdrv_find_format("file");
84a12e66 431 }
9e0b22f4
SH
432 p = strchr(filename, ':');
433 assert(p != NULL);
1cec71e3
AL
434 len = p - filename;
435 if (len > sizeof(protocol) - 1)
436 len = sizeof(protocol) - 1;
437 memcpy(protocol, filename, len);
438 protocol[len] = '\0';
8a22f02a 439 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
5fafdf24 440 if (drv1->protocol_name &&
8a22f02a 441 !strcmp(drv1->protocol_name, protocol)) {
83f64091 442 return drv1;
8a22f02a 443 }
83f64091
FB
444 }
445 return NULL;
446}
447
c98ac35d 448static int find_image_format(const char *filename, BlockDriver **pdrv)
f3a5d3f8
CH
449{
450 int ret, score, score_max;
451 BlockDriver *drv1, *drv;
452 uint8_t buf[2048];
453 BlockDriverState *bs;
454
f5edb014 455 ret = bdrv_file_open(&bs, filename, 0);
c98ac35d
SW
456 if (ret < 0) {
457 *pdrv = NULL;
458 return ret;
459 }
f8ea0b00 460
08a00559
KW
461 /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
462 if (bs->sg || !bdrv_is_inserted(bs)) {
1a396859 463 bdrv_delete(bs);
c98ac35d
SW
464 drv = bdrv_find_format("raw");
465 if (!drv) {
466 ret = -ENOENT;
467 }
468 *pdrv = drv;
469 return ret;
1a396859 470 }
f8ea0b00 471
83f64091
FB
472 ret = bdrv_pread(bs, 0, buf, sizeof(buf));
473 bdrv_delete(bs);
474 if (ret < 0) {
c98ac35d
SW
475 *pdrv = NULL;
476 return ret;
83f64091
FB
477 }
478
ea2384d3 479 score_max = 0;
84a12e66 480 drv = NULL;
8a22f02a 481 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
83f64091
FB
482 if (drv1->bdrv_probe) {
483 score = drv1->bdrv_probe(buf, ret, filename);
484 if (score > score_max) {
485 score_max = score;
486 drv = drv1;
487 }
0849bf08 488 }
fc01f7e7 489 }
c98ac35d
SW
490 if (!drv) {
491 ret = -ENOENT;
492 }
493 *pdrv = drv;
494 return ret;
ea2384d3
FB
495}
496
51762288
SH
497/**
498 * Set the current 'total_sectors' value
499 */
500static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
501{
502 BlockDriver *drv = bs->drv;
503
396759ad
NB
504 /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
505 if (bs->sg)
506 return 0;
507
51762288
SH
508 /* query actual device if possible, otherwise just trust the hint */
509 if (drv->bdrv_getlength) {
510 int64_t length = drv->bdrv_getlength(bs);
511 if (length < 0) {
512 return length;
513 }
514 hint = length >> BDRV_SECTOR_BITS;
515 }
516
517 bs->total_sectors = hint;
518 return 0;
519}
520
c3993cdc
SH
521/**
522 * Set open flags for a given cache mode
523 *
524 * Return 0 on success, -1 if the cache mode was invalid.
525 */
526int bdrv_parse_cache_flags(const char *mode, int *flags)
527{
528 *flags &= ~BDRV_O_CACHE_MASK;
529
530 if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
531 *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
92196b2f
SH
532 } else if (!strcmp(mode, "directsync")) {
533 *flags |= BDRV_O_NOCACHE;
c3993cdc
SH
534 } else if (!strcmp(mode, "writeback")) {
535 *flags |= BDRV_O_CACHE_WB;
536 } else if (!strcmp(mode, "unsafe")) {
537 *flags |= BDRV_O_CACHE_WB;
538 *flags |= BDRV_O_NO_FLUSH;
539 } else if (!strcmp(mode, "writethrough")) {
540 /* this is the default */
541 } else {
542 return -1;
543 }
544
545 return 0;
546}
547
53fec9d3
SH
548/**
549 * The copy-on-read flag is actually a reference count so multiple users may
550 * use the feature without worrying about clobbering its previous state.
551 * Copy-on-read stays enabled until all users have called to disable it.
552 */
553void bdrv_enable_copy_on_read(BlockDriverState *bs)
554{
555 bs->copy_on_read++;
556}
557
558void bdrv_disable_copy_on_read(BlockDriverState *bs)
559{
560 assert(bs->copy_on_read > 0);
561 bs->copy_on_read--;
562}
563
57915332
KW
564/*
565 * Common part for opening disk images and files
566 */
567static int bdrv_open_common(BlockDriverState *bs, const char *filename,
568 int flags, BlockDriver *drv)
569{
570 int ret, open_flags;
571
572 assert(drv != NULL);
573
28dcee10
SH
574 trace_bdrv_open_common(bs, filename, flags, drv->format_name);
575
66f82cee 576 bs->file = NULL;
51762288 577 bs->total_sectors = 0;
57915332
KW
578 bs->encrypted = 0;
579 bs->valid_key = 0;
03f541bd 580 bs->sg = 0;
57915332 581 bs->open_flags = flags;
03f541bd 582 bs->growable = 0;
57915332
KW
583 bs->buffer_alignment = 512;
584
53fec9d3
SH
585 assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
586 if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
587 bdrv_enable_copy_on_read(bs);
588 }
589
57915332 590 pstrcpy(bs->filename, sizeof(bs->filename), filename);
03f541bd 591 bs->backing_file[0] = '\0';
57915332
KW
592
593 if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
594 return -ENOTSUP;
595 }
596
597 bs->drv = drv;
7267c094 598 bs->opaque = g_malloc0(drv->instance_size);
57915332 599
03f541bd 600 bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
57915332
KW
601
602 /*
603 * Clear flags that are internal to the block layer before opening the
604 * image.
605 */
606 open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
607
608 /*
ebabb67a 609 * Snapshots should be writable.
57915332
KW
610 */
611 if (bs->is_temporary) {
612 open_flags |= BDRV_O_RDWR;
613 }
614
e7c63796
SH
615 bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);
616
66f82cee
KW
617 /* Open the image, either directly or using a protocol */
618 if (drv->bdrv_file_open) {
619 ret = drv->bdrv_file_open(bs, filename, open_flags);
620 } else {
621 ret = bdrv_file_open(&bs->file, filename, open_flags);
622 if (ret >= 0) {
623 ret = drv->bdrv_open(bs, open_flags);
624 }
625 }
626
57915332
KW
627 if (ret < 0) {
628 goto free_and_fail;
629 }
630
51762288
SH
631 ret = refresh_total_sectors(bs, bs->total_sectors);
632 if (ret < 0) {
633 goto free_and_fail;
57915332 634 }
51762288 635
57915332
KW
636#ifndef _WIN32
637 if (bs->is_temporary) {
638 unlink(filename);
639 }
640#endif
641 return 0;
642
643free_and_fail:
66f82cee
KW
644 if (bs->file) {
645 bdrv_delete(bs->file);
646 bs->file = NULL;
647 }
7267c094 648 g_free(bs->opaque);
57915332
KW
649 bs->opaque = NULL;
650 bs->drv = NULL;
651 return ret;
652}
653
b6ce07aa
KW
654/*
655 * Opens a file using a protocol (file, host_device, nbd, ...)
656 */
83f64091 657int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
ea2384d3 658{
83f64091 659 BlockDriverState *bs;
6db95603 660 BlockDriver *drv;
83f64091
FB
661 int ret;
662
b50cbabc 663 drv = bdrv_find_protocol(filename);
6db95603
CH
664 if (!drv) {
665 return -ENOENT;
666 }
667
83f64091 668 bs = bdrv_new("");
b6ce07aa 669 ret = bdrv_open_common(bs, filename, flags, drv);
83f64091
FB
670 if (ret < 0) {
671 bdrv_delete(bs);
672 return ret;
3b0d4f61 673 }
71d0770c 674 bs->growable = 1;
83f64091
FB
675 *pbs = bs;
676 return 0;
677}
678
b6ce07aa
KW
679/*
680 * Opens a disk image (raw, qcow2, vmdk, ...)
681 */
d6e9098e
KW
682int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
683 BlockDriver *drv)
ea2384d3 684{
b6ce07aa 685 int ret;
2b572816 686 char tmp_filename[PATH_MAX];
712e7874 687
83f64091 688 if (flags & BDRV_O_SNAPSHOT) {
ea2384d3
FB
689 BlockDriverState *bs1;
690 int64_t total_size;
7c96d46e 691 int is_protocol = 0;
91a073a9
KW
692 BlockDriver *bdrv_qcow2;
693 QEMUOptionParameter *options;
b6ce07aa 694 char backing_filename[PATH_MAX];
3b46e624 695
ea2384d3
FB
696 /* if snapshot, we create a temporary backing file and open it
697 instead of opening 'filename' directly */
33e3963e 698
ea2384d3
FB
699 /* if there is a backing file, use it */
700 bs1 = bdrv_new("");
d6e9098e 701 ret = bdrv_open(bs1, filename, 0, drv);
51d7c00c 702 if (ret < 0) {
ea2384d3 703 bdrv_delete(bs1);
51d7c00c 704 return ret;
ea2384d3 705 }
3e82990b 706 total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
7c96d46e
AL
707
708 if (bs1->drv && bs1->drv->protocol_name)
709 is_protocol = 1;
710
ea2384d3 711 bdrv_delete(bs1);
3b46e624 712
ea2384d3 713 get_tmp_filename(tmp_filename, sizeof(tmp_filename));
7c96d46e
AL
714
715 /* Real path is meaningless for protocols */
716 if (is_protocol)
717 snprintf(backing_filename, sizeof(backing_filename),
718 "%s", filename);
114cdfa9
KS
719 else if (!realpath(filename, backing_filename))
720 return -errno;
7c96d46e 721
91a073a9
KW
722 bdrv_qcow2 = bdrv_find_format("qcow2");
723 options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);
724
3e82990b 725 set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
91a073a9
KW
726 set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
727 if (drv) {
728 set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
729 drv->format_name);
730 }
731
732 ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
d748768c 733 free_option_parameters(options);
51d7c00c
AL
734 if (ret < 0) {
735 return ret;
ea2384d3 736 }
91a073a9 737
ea2384d3 738 filename = tmp_filename;
91a073a9 739 drv = bdrv_qcow2;
ea2384d3
FB
740 bs->is_temporary = 1;
741 }
712e7874 742
b6ce07aa 743 /* Find the right image format driver */
6db95603 744 if (!drv) {
c98ac35d 745 ret = find_image_format(filename, &drv);
51d7c00c 746 }
6987307c 747
51d7c00c 748 if (!drv) {
51d7c00c 749 goto unlink_and_fail;
ea2384d3 750 }
b6ce07aa
KW
751
752 /* Open the image */
753 ret = bdrv_open_common(bs, filename, flags, drv);
754 if (ret < 0) {
6987307c
CH
755 goto unlink_and_fail;
756 }
757
b6ce07aa
KW
758 /* If there is a backing file, use it */
759 if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
760 char backing_filename[PATH_MAX];
761 int back_flags;
762 BlockDriver *back_drv = NULL;
763
764 bs->backing_hd = bdrv_new("");
df2dbb4a
SH
765
766 if (path_has_protocol(bs->backing_file)) {
767 pstrcpy(backing_filename, sizeof(backing_filename),
768 bs->backing_file);
769 } else {
770 path_combine(backing_filename, sizeof(backing_filename),
771 filename, bs->backing_file);
772 }
773
774 if (bs->backing_format[0] != '\0') {
b6ce07aa 775 back_drv = bdrv_find_format(bs->backing_format);
df2dbb4a 776 }
b6ce07aa
KW
777
778 /* backing files always opened read-only */
779 back_flags =
780 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
781
782 ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
783 if (ret < 0) {
784 bdrv_close(bs);
785 return ret;
786 }
787 if (bs->is_temporary) {
788 bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
789 } else {
790 /* base image inherits from "parent" */
791 bs->backing_hd->keep_read_only = bs->keep_read_only;
792 }
793 }
794
795 if (!bdrv_key_required(bs)) {
7d4b4ba5 796 bdrv_dev_change_media_cb(bs, true);
b6ce07aa
KW
797 }
798
98f90dba
ZYW
799 /* throttling disk I/O limits */
800 if (bs->io_limits_enabled) {
801 bdrv_io_limits_enable(bs);
802 }
803
b6ce07aa
KW
804 return 0;
805
806unlink_and_fail:
807 if (bs->is_temporary) {
808 unlink(filename);
809 }
810 return ret;
811}
812
fc01f7e7
FB
813void bdrv_close(BlockDriverState *bs)
814{
80ccf93b 815 bdrv_flush(bs);
19cb3738 816 if (bs->drv) {
3e914655
PB
817 if (bs->job) {
818 block_job_cancel_sync(bs->job);
819 }
7094f12f
KW
820 bdrv_drain_all();
821
f9092b10
MA
822 if (bs == bs_snapshots) {
823 bs_snapshots = NULL;
824 }
557df6ac 825 if (bs->backing_hd) {
ea2384d3 826 bdrv_delete(bs->backing_hd);
557df6ac
SH
827 bs->backing_hd = NULL;
828 }
ea2384d3 829 bs->drv->bdrv_close(bs);
7267c094 830 g_free(bs->opaque);
ea2384d3
FB
831#ifdef _WIN32
832 if (bs->is_temporary) {
833 unlink(bs->filename);
834 }
67b915a5 835#endif
ea2384d3
FB
836 bs->opaque = NULL;
837 bs->drv = NULL;
53fec9d3 838 bs->copy_on_read = 0;
b338082b 839
66f82cee
KW
840 if (bs->file != NULL) {
841 bdrv_close(bs->file);
842 }
843
7d4b4ba5 844 bdrv_dev_change_media_cb(bs, false);
b338082b 845 }
98f90dba
ZYW
846
847 /*throttling disk I/O limits*/
848 if (bs->io_limits_enabled) {
849 bdrv_io_limits_disable(bs);
850 }
b338082b
FB
851}
852
2bc93fed
MK
853void bdrv_close_all(void)
854{
855 BlockDriverState *bs;
856
857 QTAILQ_FOREACH(bs, &bdrv_states, list) {
858 bdrv_close(bs);
859 }
860}
861
922453bc
SH
862/*
863 * Wait for pending requests to complete across all BlockDriverStates
864 *
865 * This function does not flush data to disk, use bdrv_flush_all() for that
866 * after calling this function.
867 */
868void bdrv_drain_all(void)
869{
870 BlockDriverState *bs;
871
872 qemu_aio_flush();
873
874 /* If requests are still pending there is a bug somewhere */
875 QTAILQ_FOREACH(bs, &bdrv_states, list) {
876 assert(QLIST_EMPTY(&bs->tracked_requests));
877 assert(qemu_co_queue_empty(&bs->throttled_reqs));
878 }
879}
880
d22b2f41
RH
881/* make a BlockDriverState anonymous by removing from bdrv_state list.
882 Also, NULL terminate the device_name to prevent double remove */
883void bdrv_make_anon(BlockDriverState *bs)
884{
885 if (bs->device_name[0] != '\0') {
886 QTAILQ_REMOVE(&bdrv_states, bs, list);
887 }
888 bs->device_name[0] = '\0';
889}
890
8802d1fd
JC
891/*
892 * Add new bs contents at the top of an image chain while the chain is
893 * live, while keeping required fields on the top layer.
894 *
895 * This will modify the BlockDriverState fields, and swap contents
896 * between bs_new and bs_top. Both bs_new and bs_top are modified.
897 *
f6801b83
JC
898 * bs_new is required to be anonymous.
899 *
8802d1fd
JC
900 * This function does not create any image files.
901 */
902void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
903{
904 BlockDriverState tmp;
905
f6801b83
JC
906 /* bs_new must be anonymous */
907 assert(bs_new->device_name[0] == '\0');
8802d1fd
JC
908
909 tmp = *bs_new;
910
911 /* there are some fields that need to stay on the top layer: */
912
913 /* dev info */
914 tmp.dev_ops = bs_top->dev_ops;
915 tmp.dev_opaque = bs_top->dev_opaque;
916 tmp.dev = bs_top->dev;
917 tmp.buffer_alignment = bs_top->buffer_alignment;
918 tmp.copy_on_read = bs_top->copy_on_read;
919
920 /* i/o timing parameters */
921 tmp.slice_time = bs_top->slice_time;
922 tmp.slice_start = bs_top->slice_start;
923 tmp.slice_end = bs_top->slice_end;
924 tmp.io_limits = bs_top->io_limits;
925 tmp.io_base = bs_top->io_base;
926 tmp.throttled_reqs = bs_top->throttled_reqs;
927 tmp.block_timer = bs_top->block_timer;
928 tmp.io_limits_enabled = bs_top->io_limits_enabled;
929
930 /* geometry */
931 tmp.cyls = bs_top->cyls;
932 tmp.heads = bs_top->heads;
933 tmp.secs = bs_top->secs;
934 tmp.translation = bs_top->translation;
935
936 /* r/w error */
937 tmp.on_read_error = bs_top->on_read_error;
938 tmp.on_write_error = bs_top->on_write_error;
939
940 /* i/o status */
941 tmp.iostatus_enabled = bs_top->iostatus_enabled;
942 tmp.iostatus = bs_top->iostatus;
943
944 /* keep the same entry in bdrv_states */
945 pstrcpy(tmp.device_name, sizeof(tmp.device_name), bs_top->device_name);
946 tmp.list = bs_top->list;
947
948 /* The contents of 'tmp' will become bs_top, as we are
949 * swapping bs_new and bs_top contents. */
950 tmp.backing_hd = bs_new;
951 pstrcpy(tmp.backing_file, sizeof(tmp.backing_file), bs_top->filename);
f6801b83 952 bdrv_get_format(bs_top, tmp.backing_format, sizeof(tmp.backing_format));
8802d1fd
JC
953
954 /* swap contents of the fixed new bs and the current top */
955 *bs_new = *bs_top;
956 *bs_top = tmp;
957
f6801b83
JC
958 /* device_name[] was carried over from the old bs_top. bs_new
959 * shouldn't be in bdrv_states, so we need to make device_name[]
960 * reflect the anonymity of bs_new
961 */
962 bs_new->device_name[0] = '\0';
963
8802d1fd
JC
964 /* clear the copied fields in the new backing file */
965 bdrv_detach_dev(bs_new, bs_new->dev);
966
967 qemu_co_queue_init(&bs_new->throttled_reqs);
968 memset(&bs_new->io_base, 0, sizeof(bs_new->io_base));
969 memset(&bs_new->io_limits, 0, sizeof(bs_new->io_limits));
970 bdrv_iostatus_disable(bs_new);
971
972 /* we don't use bdrv_io_limits_disable() for this, because we don't want
973 * to affect or delete the block_timer, as it has been moved to bs_top */
974 bs_new->io_limits_enabled = false;
975 bs_new->block_timer = NULL;
976 bs_new->slice_time = 0;
977 bs_new->slice_start = 0;
978 bs_new->slice_end = 0;
979}
980
b338082b
FB
981void bdrv_delete(BlockDriverState *bs)
982{
fa879d62 983 assert(!bs->dev);
3e914655
PB
984 assert(!bs->job);
985 assert(!bs->in_use);
18846dee 986
1b7bdbc1 987 /* remove from list, if necessary */
d22b2f41 988 bdrv_make_anon(bs);
34c6f050 989
b338082b 990 bdrv_close(bs);
66f82cee
KW
991 if (bs->file != NULL) {
992 bdrv_delete(bs->file);
993 }
994
f9092b10 995 assert(bs != bs_snapshots);
7267c094 996 g_free(bs);
fc01f7e7
FB
997}
998
fa879d62
MA
999int bdrv_attach_dev(BlockDriverState *bs, void *dev)
1000/* TODO change to DeviceState *dev when all users are qdevified */
18846dee 1001{
fa879d62 1002 if (bs->dev) {
18846dee
MA
1003 return -EBUSY;
1004 }
fa879d62 1005 bs->dev = dev;
28a7282a 1006 bdrv_iostatus_reset(bs);
18846dee
MA
1007 return 0;
1008}
1009
fa879d62
MA
1010/* TODO qdevified devices don't use this, remove when devices are qdevified */
1011void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
18846dee 1012{
fa879d62
MA
1013 if (bdrv_attach_dev(bs, dev) < 0) {
1014 abort();
1015 }
1016}
1017
1018void bdrv_detach_dev(BlockDriverState *bs, void *dev)
1019/* TODO change to DeviceState *dev when all users are qdevified */
1020{
1021 assert(bs->dev == dev);
1022 bs->dev = NULL;
0e49de52
MA
1023 bs->dev_ops = NULL;
1024 bs->dev_opaque = NULL;
29e05f20 1025 bs->buffer_alignment = 512;
18846dee
MA
1026}
1027
fa879d62
MA
1028/* TODO change to return DeviceState * when all users are qdevified */
1029void *bdrv_get_attached_dev(BlockDriverState *bs)
18846dee 1030{
fa879d62 1031 return bs->dev;
18846dee
MA
1032}
1033
0e49de52
MA
1034void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
1035 void *opaque)
1036{
1037 bs->dev_ops = ops;
1038 bs->dev_opaque = opaque;
2c6942fa
MA
1039 if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
1040 bs_snapshots = NULL;
1041 }
0e49de52
MA
1042}
1043
329c0a48
LC
1044void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
1045 BlockQMPEventAction action, int is_read)
1046{
1047 QObject *data;
1048 const char *action_str;
1049
1050 switch (action) {
1051 case BDRV_ACTION_REPORT:
1052 action_str = "report";
1053 break;
1054 case BDRV_ACTION_IGNORE:
1055 action_str = "ignore";
1056 break;
1057 case BDRV_ACTION_STOP:
1058 action_str = "stop";
1059 break;
1060 default:
1061 abort();
1062 }
1063
1064 data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1065 bdrv->device_name,
1066 action_str,
1067 is_read ? "read" : "write");
1068 monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);
1069
1070 qobject_decref(data);
1071}
1072
6f382ed2
LC
1073static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
1074{
1075 QObject *data;
1076
1077 data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
1078 bdrv_get_device_name(bs), ejected);
1079 monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
1080
1081 qobject_decref(data);
1082}
1083
7d4b4ba5 1084static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
0e49de52 1085{
145feb17 1086 if (bs->dev_ops && bs->dev_ops->change_media_cb) {
6f382ed2 1087 bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
7d4b4ba5 1088 bs->dev_ops->change_media_cb(bs->dev_opaque, load);
6f382ed2
LC
1089 if (tray_was_closed) {
1090 /* tray open */
1091 bdrv_emit_qmp_eject_event(bs, true);
1092 }
1093 if (load) {
1094 /* tray close */
1095 bdrv_emit_qmp_eject_event(bs, false);
1096 }
145feb17
MA
1097 }
1098}
1099
2c6942fa
MA
1100bool bdrv_dev_has_removable_media(BlockDriverState *bs)
1101{
1102 return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
1103}
1104
025ccaa7
PB
1105void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
1106{
1107 if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
1108 bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
1109 }
1110}
1111
e4def80b
MA
1112bool bdrv_dev_is_tray_open(BlockDriverState *bs)
1113{
1114 if (bs->dev_ops && bs->dev_ops->is_tray_open) {
1115 return bs->dev_ops->is_tray_open(bs->dev_opaque);
1116 }
1117 return false;
1118}
1119
145feb17
MA
1120static void bdrv_dev_resize_cb(BlockDriverState *bs)
1121{
1122 if (bs->dev_ops && bs->dev_ops->resize_cb) {
1123 bs->dev_ops->resize_cb(bs->dev_opaque);
0e49de52
MA
1124 }
1125}
1126
f107639a
MA
1127bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
1128{
1129 if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
1130 return bs->dev_ops->is_medium_locked(bs->dev_opaque);
1131 }
1132 return false;
1133}
1134
e97fc193
AL
1135/*
1136 * Run consistency checks on an image
1137 *
e076f338 1138 * Returns 0 if the check could be completed (it doesn't mean that the image is
a1c7273b 1139 * free of errors) or -errno when an internal error occurred. The results of the
e076f338 1140 * check are stored in res.
e97fc193 1141 */
e076f338 1142int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res)
e97fc193
AL
1143{
1144 if (bs->drv->bdrv_check == NULL) {
1145 return -ENOTSUP;
1146 }
1147
e076f338 1148 memset(res, 0, sizeof(*res));
9ac228e0 1149 return bs->drv->bdrv_check(bs, res);
e97fc193
AL
1150}
1151
8a426614
KW
1152#define COMMIT_BUF_SECTORS 2048
1153
33e3963e
FB
1154/* commit COW file into the raw image */
1155int bdrv_commit(BlockDriverState *bs)
1156{
19cb3738 1157 BlockDriver *drv = bs->drv;
ee181196 1158 BlockDriver *backing_drv;
8a426614
KW
1159 int64_t sector, total_sectors;
1160 int n, ro, open_flags;
4dca4b63 1161 int ret = 0, rw_ret = 0;
8a426614 1162 uint8_t *buf;
4dca4b63
NS
1163 char filename[1024];
1164 BlockDriverState *bs_rw, *bs_ro;
33e3963e 1165
19cb3738
FB
1166 if (!drv)
1167 return -ENOMEDIUM;
4dca4b63
NS
1168
1169 if (!bs->backing_hd) {
1170 return -ENOTSUP;
33e3963e
FB
1171 }
1172
4dca4b63
NS
1173 if (bs->backing_hd->keep_read_only) {
1174 return -EACCES;
1175 }
ee181196 1176
2d3735d3
SH
1177 if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
1178 return -EBUSY;
1179 }
1180
ee181196 1181 backing_drv = bs->backing_hd->drv;
4dca4b63
NS
1182 ro = bs->backing_hd->read_only;
1183 strncpy(filename, bs->backing_hd->filename, sizeof(filename));
1184 open_flags = bs->backing_hd->open_flags;
1185
1186 if (ro) {
1187 /* re-open as RW */
1188 bdrv_delete(bs->backing_hd);
1189 bs->backing_hd = NULL;
1190 bs_rw = bdrv_new("");
ee181196
KW
1191 rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
1192 backing_drv);
4dca4b63
NS
1193 if (rw_ret < 0) {
1194 bdrv_delete(bs_rw);
1195 /* try to re-open read-only */
1196 bs_ro = bdrv_new("");
ee181196
KW
1197 ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1198 backing_drv);
4dca4b63
NS
1199 if (ret < 0) {
1200 bdrv_delete(bs_ro);
1201 /* drive not functional anymore */
1202 bs->drv = NULL;
1203 return ret;
1204 }
1205 bs->backing_hd = bs_ro;
1206 return rw_ret;
1207 }
1208 bs->backing_hd = bs_rw;
ea2384d3 1209 }
33e3963e 1210
6ea44308 1211 total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
7267c094 1212 buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
8a426614
KW
1213
1214 for (sector = 0; sector < total_sectors; sector += n) {
05c4af54 1215 if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
8a426614
KW
1216
1217 if (bdrv_read(bs, sector, buf, n) != 0) {
1218 ret = -EIO;
1219 goto ro_cleanup;
1220 }
1221
1222 if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
1223 ret = -EIO;
1224 goto ro_cleanup;
1225 }
ea2384d3 1226 }
33e3963e 1227 }
95389c86 1228
1d44952f
CH
1229 if (drv->bdrv_make_empty) {
1230 ret = drv->bdrv_make_empty(bs);
1231 bdrv_flush(bs);
1232 }
95389c86 1233
3f5075ae
CH
1234 /*
1235 * Make sure all data we wrote to the backing device is actually
1236 * stable on disk.
1237 */
1238 if (bs->backing_hd)
1239 bdrv_flush(bs->backing_hd);
4dca4b63
NS
1240
1241ro_cleanup:
7267c094 1242 g_free(buf);
4dca4b63
NS
1243
1244 if (ro) {
1245 /* re-open as RO */
1246 bdrv_delete(bs->backing_hd);
1247 bs->backing_hd = NULL;
1248 bs_ro = bdrv_new("");
ee181196
KW
1249 ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1250 backing_drv);
4dca4b63
NS
1251 if (ret < 0) {
1252 bdrv_delete(bs_ro);
1253 /* drive not functional anymore */
1254 bs->drv = NULL;
1255 return ret;
1256 }
1257 bs->backing_hd = bs_ro;
1258 bs->backing_hd->keep_read_only = 0;
1259 }
1260
1d44952f 1261 return ret;
33e3963e
FB
1262}
1263
e8877497 1264int bdrv_commit_all(void)
6ab4b5ab
MA
1265{
1266 BlockDriverState *bs;
1267
1268 QTAILQ_FOREACH(bs, &bdrv_states, list) {
e8877497
SH
1269 int ret = bdrv_commit(bs);
1270 if (ret < 0) {
1271 return ret;
1272 }
6ab4b5ab 1273 }
e8877497 1274 return 0;
6ab4b5ab
MA
1275}
1276
dbffbdcf
SH
/*
 * An in-flight read or write request, linked into the owning
 * BlockDriverState's tracked_requests list between
 * tracked_request_begin() and tracked_request_end().
 */
struct BdrvTrackedRequest {
    BlockDriverState *bs;       /* device the request was issued on */
    int64_t sector_num;         /* first sector of the request */
    int nb_sectors;             /* number of sectors in the request */
    bool is_write;              /* true for writes, false for reads */
    QLIST_ENTRY(BdrvTrackedRequest) list; /* link in bs->tracked_requests */
    Coroutine *co; /* owner, used for deadlock detection */
    CoQueue wait_queue; /* coroutines blocked on this request */
};
1286
1287/**
1288 * Remove an active request from the tracked requests list
1289 *
1290 * This function should be called when a tracked request is completing.
1291 */
1292static void tracked_request_end(BdrvTrackedRequest *req)
1293{
1294 QLIST_REMOVE(req, list);
f4658285 1295 qemu_co_queue_restart_all(&req->wait_queue);
dbffbdcf
SH
1296}
1297
1298/**
1299 * Add an active request to the tracked requests list
1300 */
1301static void tracked_request_begin(BdrvTrackedRequest *req,
1302 BlockDriverState *bs,
1303 int64_t sector_num,
1304 int nb_sectors, bool is_write)
1305{
1306 *req = (BdrvTrackedRequest){
1307 .bs = bs,
1308 .sector_num = sector_num,
1309 .nb_sectors = nb_sectors,
1310 .is_write = is_write,
5f8b6491 1311 .co = qemu_coroutine_self(),
dbffbdcf
SH
1312 };
1313
f4658285
SH
1314 qemu_co_queue_init(&req->wait_queue);
1315
dbffbdcf
SH
1316 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
1317}
1318
d83947ac
SH
1319/**
1320 * Round a region to cluster boundaries
1321 */
1322static void round_to_clusters(BlockDriverState *bs,
1323 int64_t sector_num, int nb_sectors,
1324 int64_t *cluster_sector_num,
1325 int *cluster_nb_sectors)
1326{
1327 BlockDriverInfo bdi;
1328
1329 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
1330 *cluster_sector_num = sector_num;
1331 *cluster_nb_sectors = nb_sectors;
1332 } else {
1333 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
1334 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
1335 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
1336 nb_sectors, c);
1337 }
1338}
1339
f4658285
SH
1340static bool tracked_request_overlaps(BdrvTrackedRequest *req,
1341 int64_t sector_num, int nb_sectors) {
d83947ac
SH
1342 /* aaaa bbbb */
1343 if (sector_num >= req->sector_num + req->nb_sectors) {
1344 return false;
1345 }
1346 /* bbbb aaaa */
1347 if (req->sector_num >= sector_num + nb_sectors) {
1348 return false;
1349 }
1350 return true;
f4658285
SH
1351}
1352
1353static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
1354 int64_t sector_num, int nb_sectors)
1355{
1356 BdrvTrackedRequest *req;
d83947ac
SH
1357 int64_t cluster_sector_num;
1358 int cluster_nb_sectors;
f4658285
SH
1359 bool retry;
1360
d83947ac
SH
1361 /* If we touch the same cluster it counts as an overlap. This guarantees
1362 * that allocating writes will be serialized and not race with each other
1363 * for the same cluster. For example, in copy-on-read it ensures that the
1364 * CoR read and write operations are atomic and guest writes cannot
1365 * interleave between them.
1366 */
1367 round_to_clusters(bs, sector_num, nb_sectors,
1368 &cluster_sector_num, &cluster_nb_sectors);
1369
f4658285
SH
1370 do {
1371 retry = false;
1372 QLIST_FOREACH(req, &bs->tracked_requests, list) {
d83947ac
SH
1373 if (tracked_request_overlaps(req, cluster_sector_num,
1374 cluster_nb_sectors)) {
5f8b6491
SH
1375 /* Hitting this means there was a reentrant request, for
1376 * example, a block driver issuing nested requests. This must
1377 * never happen since it means deadlock.
1378 */
1379 assert(qemu_coroutine_self() != req->co);
1380
f4658285
SH
1381 qemu_co_queue_wait(&req->wait_queue);
1382 retry = true;
1383 break;
1384 }
1385 }
1386 } while (retry);
1387}
1388
756e6736
KW
1389/*
1390 * Return values:
1391 * 0 - success
1392 * -EINVAL - backing format specified, but no file
1393 * -ENOSPC - can't update the backing file because no space is left in the
1394 * image file header
1395 * -ENOTSUP - format driver doesn't support changing the backing file
1396 */
1397int bdrv_change_backing_file(BlockDriverState *bs,
1398 const char *backing_file, const char *backing_fmt)
1399{
1400 BlockDriver *drv = bs->drv;
1401
1402 if (drv->bdrv_change_backing_file != NULL) {
1403 return drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
1404 } else {
1405 return -ENOTSUP;
1406 }
1407}
1408
71d0770c
AL
1409static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
1410 size_t size)
1411{
1412 int64_t len;
1413
1414 if (!bdrv_is_inserted(bs))
1415 return -ENOMEDIUM;
1416
1417 if (bs->growable)
1418 return 0;
1419
1420 len = bdrv_getlength(bs);
1421
fbb7b4e0
KW
1422 if (offset < 0)
1423 return -EIO;
1424
1425 if ((offset > len) || (len - offset < size))
71d0770c
AL
1426 return -EIO;
1427
1428 return 0;
1429}
1430
1431static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
1432 int nb_sectors)
1433{
eb5a3165
JS
1434 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
1435 nb_sectors * BDRV_SECTOR_SIZE);
71d0770c
AL
1436}
1437
1c9805a3
SH
/*
 * Arguments and result of a synchronous read/write that is executed in a
 * coroutine by bdrv_rw_co_entry().
 */
typedef struct RwCo {
    BlockDriverState *bs;   /* device to operate on */
    int64_t sector_num;     /* first sector of the request */
    int nb_sectors;         /* number of sectors to transfer */
    QEMUIOVector *qiov;     /* data buffer description */
    bool is_write;          /* true: write request, false: read request */
    int ret;                /* result; NOT_DONE until the request completes */
} RwCo;
1446
1447static void coroutine_fn bdrv_rw_co_entry(void *opaque)
fc01f7e7 1448{
1c9805a3 1449 RwCo *rwco = opaque;
ea2384d3 1450
1c9805a3
SH
1451 if (!rwco->is_write) {
1452 rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
470c0504 1453 rwco->nb_sectors, rwco->qiov, 0);
1c9805a3
SH
1454 } else {
1455 rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
f08f2dda 1456 rwco->nb_sectors, rwco->qiov, 0);
1c9805a3
SH
1457 }
1458}
e7a8a783 1459
1c9805a3
SH
1460/*
1461 * Process a synchronous request using coroutines
1462 */
1463static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
1464 int nb_sectors, bool is_write)
1465{
1466 QEMUIOVector qiov;
1467 struct iovec iov = {
1468 .iov_base = (void *)buf,
1469 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
1470 };
1471 Coroutine *co;
1472 RwCo rwco = {
1473 .bs = bs,
1474 .sector_num = sector_num,
1475 .nb_sectors = nb_sectors,
1476 .qiov = &qiov,
1477 .is_write = is_write,
1478 .ret = NOT_DONE,
1479 };
e7a8a783 1480
1c9805a3 1481 qemu_iovec_init_external(&qiov, &iov, 1);
e7a8a783 1482
498e386c
ZYW
1483 /**
1484 * In sync call context, when the vcpu is blocked, this throttling timer
1485 * will not fire; so the I/O throttling function has to be disabled here
1486 * if it has been enabled.
1487 */
1488 if (bs->io_limits_enabled) {
1489 fprintf(stderr, "Disabling I/O throttling on '%s' due "
1490 "to synchronous I/O.\n", bdrv_get_device_name(bs));
1491 bdrv_io_limits_disable(bs);
1492 }
1493
1c9805a3
SH
1494 if (qemu_in_coroutine()) {
1495 /* Fast-path if already in coroutine context */
1496 bdrv_rw_co_entry(&rwco);
1497 } else {
1498 co = qemu_coroutine_create(bdrv_rw_co_entry);
1499 qemu_coroutine_enter(co, &rwco);
1500 while (rwco.ret == NOT_DONE) {
1501 qemu_aio_wait();
1502 }
1503 }
1504 return rwco.ret;
1505}
b338082b 1506
1c9805a3
SH
1507/* return < 0 if error. See bdrv_write() for the return codes */
1508int bdrv_read(BlockDriverState *bs, int64_t sector_num,
1509 uint8_t *buf, int nb_sectors)
1510{
1511 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
fc01f7e7
FB
1512}
1513
7cd1e32a 1514static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
a55eb92c 1515 int nb_sectors, int dirty)
7cd1e32a 1516{
1517 int64_t start, end;
c6d22830 1518 unsigned long val, idx, bit;
a55eb92c 1519
6ea44308 1520 start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
c6d22830 1521 end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
a55eb92c
JK
1522
1523 for (; start <= end; start++) {
c6d22830
JK
1524 idx = start / (sizeof(unsigned long) * 8);
1525 bit = start % (sizeof(unsigned long) * 8);
1526 val = bs->dirty_bitmap[idx];
1527 if (dirty) {
6d59fec1 1528 if (!(val & (1UL << bit))) {
aaa0eb75 1529 bs->dirty_count++;
6d59fec1 1530 val |= 1UL << bit;
aaa0eb75 1531 }
c6d22830 1532 } else {
6d59fec1 1533 if (val & (1UL << bit)) {
aaa0eb75 1534 bs->dirty_count--;
6d59fec1 1535 val &= ~(1UL << bit);
aaa0eb75 1536 }
c6d22830
JK
1537 }
1538 bs->dirty_bitmap[idx] = val;
7cd1e32a 1539 }
1540}
1541
5fafdf24 1542/* Return < 0 if error. Important errors are:
19cb3738
FB
1543 -EIO generic I/O error (may happen for all errors)
1544 -ENOMEDIUM No media inserted.
1545 -EINVAL Invalid sector number or nb_sectors
1546 -EACCES Trying to write a read-only device
1547*/
5fafdf24 1548int bdrv_write(BlockDriverState *bs, int64_t sector_num,
fc01f7e7
FB
1549 const uint8_t *buf, int nb_sectors)
1550{
1c9805a3 1551 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
83f64091
FB
1552}
1553
eda578e5
AL
1554int bdrv_pread(BlockDriverState *bs, int64_t offset,
1555 void *buf, int count1)
83f64091 1556{
6ea44308 1557 uint8_t tmp_buf[BDRV_SECTOR_SIZE];
83f64091
FB
1558 int len, nb_sectors, count;
1559 int64_t sector_num;
9a8c4cce 1560 int ret;
83f64091
FB
1561
1562 count = count1;
1563 /* first read to align to sector start */
6ea44308 1564 len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
83f64091
FB
1565 if (len > count)
1566 len = count;
6ea44308 1567 sector_num = offset >> BDRV_SECTOR_BITS;
83f64091 1568 if (len > 0) {
9a8c4cce
KW
1569 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1570 return ret;
6ea44308 1571 memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
83f64091
FB
1572 count -= len;
1573 if (count == 0)
1574 return count1;
1575 sector_num++;
1576 buf += len;
1577 }
1578
1579 /* read the sectors "in place" */
6ea44308 1580 nb_sectors = count >> BDRV_SECTOR_BITS;
83f64091 1581 if (nb_sectors > 0) {
9a8c4cce
KW
1582 if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
1583 return ret;
83f64091 1584 sector_num += nb_sectors;
6ea44308 1585 len = nb_sectors << BDRV_SECTOR_BITS;
83f64091
FB
1586 buf += len;
1587 count -= len;
1588 }
1589
1590 /* add data from the last sector */
1591 if (count > 0) {
9a8c4cce
KW
1592 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1593 return ret;
83f64091
FB
1594 memcpy(buf, tmp_buf, count);
1595 }
1596 return count1;
1597}
1598
eda578e5
AL
1599int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
1600 const void *buf, int count1)
83f64091 1601{
6ea44308 1602 uint8_t tmp_buf[BDRV_SECTOR_SIZE];
83f64091
FB
1603 int len, nb_sectors, count;
1604 int64_t sector_num;
9a8c4cce 1605 int ret;
83f64091
FB
1606
1607 count = count1;
1608 /* first write to align to sector start */
6ea44308 1609 len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
83f64091
FB
1610 if (len > count)
1611 len = count;
6ea44308 1612 sector_num = offset >> BDRV_SECTOR_BITS;
83f64091 1613 if (len > 0) {
9a8c4cce
KW
1614 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1615 return ret;
6ea44308 1616 memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
9a8c4cce
KW
1617 if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1618 return ret;
83f64091
FB
1619 count -= len;
1620 if (count == 0)
1621 return count1;
1622 sector_num++;
1623 buf += len;
1624 }
1625
1626 /* write the sectors "in place" */
6ea44308 1627 nb_sectors = count >> BDRV_SECTOR_BITS;
83f64091 1628 if (nb_sectors > 0) {
9a8c4cce
KW
1629 if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
1630 return ret;
83f64091 1631 sector_num += nb_sectors;
6ea44308 1632 len = nb_sectors << BDRV_SECTOR_BITS;
83f64091
FB
1633 buf += len;
1634 count -= len;
1635 }
1636
1637 /* add data from the last sector */
1638 if (count > 0) {
9a8c4cce
KW
1639 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1640 return ret;
83f64091 1641 memcpy(tmp_buf, buf, count);
9a8c4cce
KW
1642 if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1643 return ret;
83f64091
FB
1644 }
1645 return count1;
1646}
83f64091 1647
f08145fe
KW
1648/*
1649 * Writes to the file and ensures that no writes are reordered across this
1650 * request (acts as a barrier)
1651 *
1652 * Returns 0 on success, -errno in error cases.
1653 */
1654int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
1655 const void *buf, int count)
1656{
1657 int ret;
1658
1659 ret = bdrv_pwrite(bs, offset, buf, count);
1660 if (ret < 0) {
1661 return ret;
1662 }
1663
92196b2f
SH
1664 /* No flush needed for cache modes that use O_DSYNC */
1665 if ((bs->open_flags & BDRV_O_CACHE_WB) != 0) {
f08145fe
KW
1666 bdrv_flush(bs);
1667 }
1668
1669 return 0;
1670}
1671
470c0504 1672static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
ab185921
SH
1673 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1674{
1675 /* Perform I/O through a temporary buffer so that users who scribble over
1676 * their read buffer while the operation is in progress do not end up
1677 * modifying the image file. This is critical for zero-copy guest I/O
1678 * where anything might happen inside guest memory.
1679 */
1680 void *bounce_buffer;
1681
79c053bd 1682 BlockDriver *drv = bs->drv;
ab185921
SH
1683 struct iovec iov;
1684 QEMUIOVector bounce_qiov;
1685 int64_t cluster_sector_num;
1686 int cluster_nb_sectors;
1687 size_t skip_bytes;
1688 int ret;
1689
1690 /* Cover entire cluster so no additional backing file I/O is required when
1691 * allocating cluster in the image file.
1692 */
1693 round_to_clusters(bs, sector_num, nb_sectors,
1694 &cluster_sector_num, &cluster_nb_sectors);
1695
470c0504
SH
1696 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
1697 cluster_sector_num, cluster_nb_sectors);
ab185921
SH
1698
1699 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
1700 iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
1701 qemu_iovec_init_external(&bounce_qiov, &iov, 1);
1702
79c053bd
SH
1703 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
1704 &bounce_qiov);
ab185921
SH
1705 if (ret < 0) {
1706 goto err;
1707 }
1708
79c053bd
SH
1709 if (drv->bdrv_co_write_zeroes &&
1710 buffer_is_zero(bounce_buffer, iov.iov_len)) {
1711 ret = drv->bdrv_co_write_zeroes(bs, cluster_sector_num,
1712 cluster_nb_sectors);
1713 } else {
1714 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
ab185921 1715 &bounce_qiov);
79c053bd
SH
1716 }
1717
ab185921
SH
1718 if (ret < 0) {
1719 /* It might be okay to ignore write errors for guest requests. If this
1720 * is a deliberate copy-on-read then we don't want to ignore the error.
1721 * Simply report it in all cases.
1722 */
1723 goto err;
1724 }
1725
1726 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
1727 qemu_iovec_from_buffer(qiov, bounce_buffer + skip_bytes,
1728 nb_sectors * BDRV_SECTOR_SIZE);
1729
1730err:
1731 qemu_vfree(bounce_buffer);
1732 return ret;
1733}
1734
c5fbe571
SH
1735/*
1736 * Handle a read request in coroutine context
1737 */
1738static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
470c0504
SH
1739 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1740 BdrvRequestFlags flags)
da1fa91d
KW
1741{
1742 BlockDriver *drv = bs->drv;
dbffbdcf
SH
1743 BdrvTrackedRequest req;
1744 int ret;
da1fa91d 1745
da1fa91d
KW
1746 if (!drv) {
1747 return -ENOMEDIUM;
1748 }
1749 if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1750 return -EIO;
1751 }
1752
98f90dba
ZYW
1753 /* throttling disk read I/O */
1754 if (bs->io_limits_enabled) {
1755 bdrv_io_limits_intercept(bs, false, nb_sectors);
1756 }
1757
f4658285 1758 if (bs->copy_on_read) {
470c0504
SH
1759 flags |= BDRV_REQ_COPY_ON_READ;
1760 }
1761 if (flags & BDRV_REQ_COPY_ON_READ) {
1762 bs->copy_on_read_in_flight++;
1763 }
1764
1765 if (bs->copy_on_read_in_flight) {
f4658285
SH
1766 wait_for_overlapping_requests(bs, sector_num, nb_sectors);
1767 }
1768
dbffbdcf 1769 tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
ab185921 1770
470c0504 1771 if (flags & BDRV_REQ_COPY_ON_READ) {
ab185921
SH
1772 int pnum;
1773
1774 ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
1775 if (ret < 0) {
1776 goto out;
1777 }
1778
1779 if (!ret || pnum != nb_sectors) {
470c0504 1780 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
ab185921
SH
1781 goto out;
1782 }
1783 }
1784
dbffbdcf 1785 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
ab185921
SH
1786
1787out:
dbffbdcf 1788 tracked_request_end(&req);
470c0504
SH
1789
1790 if (flags & BDRV_REQ_COPY_ON_READ) {
1791 bs->copy_on_read_in_flight--;
1792 }
1793
dbffbdcf 1794 return ret;
da1fa91d
KW
1795}
1796
c5fbe571 1797int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
da1fa91d
KW
1798 int nb_sectors, QEMUIOVector *qiov)
1799{
c5fbe571 1800 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
da1fa91d 1801
470c0504
SH
1802 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
1803}
1804
1805int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
1806 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1807{
1808 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
1809
1810 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
1811 BDRV_REQ_COPY_ON_READ);
c5fbe571
SH
1812}
1813
f08f2dda
SH
1814static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
1815 int64_t sector_num, int nb_sectors)
1816{
1817 BlockDriver *drv = bs->drv;
1818 QEMUIOVector qiov;
1819 struct iovec iov;
1820 int ret;
1821
1822 /* First try the efficient write zeroes operation */
1823 if (drv->bdrv_co_write_zeroes) {
1824 return drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
1825 }
1826
1827 /* Fall back to bounce buffer if write zeroes is unsupported */
1828 iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
1829 iov.iov_base = qemu_blockalign(bs, iov.iov_len);
1830 memset(iov.iov_base, 0, iov.iov_len);
1831 qemu_iovec_init_external(&qiov, &iov, 1);
1832
1833 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);
1834
1835 qemu_vfree(iov.iov_base);
1836 return ret;
1837}
1838
c5fbe571
SH
1839/*
1840 * Handle a write request in coroutine context
1841 */
1842static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
f08f2dda
SH
1843 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1844 BdrvRequestFlags flags)
c5fbe571
SH
1845{
1846 BlockDriver *drv = bs->drv;
dbffbdcf 1847 BdrvTrackedRequest req;
6b7cb247 1848 int ret;
da1fa91d
KW
1849
1850 if (!bs->drv) {
1851 return -ENOMEDIUM;
1852 }
1853 if (bs->read_only) {
1854 return -EACCES;
1855 }
1856 if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1857 return -EIO;
1858 }
1859
98f90dba
ZYW
1860 /* throttling disk write I/O */
1861 if (bs->io_limits_enabled) {
1862 bdrv_io_limits_intercept(bs, true, nb_sectors);
1863 }
1864
470c0504 1865 if (bs->copy_on_read_in_flight) {
f4658285
SH
1866 wait_for_overlapping_requests(bs, sector_num, nb_sectors);
1867 }
1868
dbffbdcf
SH
1869 tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
1870
f08f2dda
SH
1871 if (flags & BDRV_REQ_ZERO_WRITE) {
1872 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
1873 } else {
1874 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
1875 }
6b7cb247 1876
da1fa91d
KW
1877 if (bs->dirty_bitmap) {
1878 set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
1879 }
1880
1881 if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
1882 bs->wr_highest_sector = sector_num + nb_sectors - 1;
1883 }
1884
dbffbdcf
SH
1885 tracked_request_end(&req);
1886
6b7cb247 1887 return ret;
da1fa91d
KW
1888}
1889
c5fbe571
SH
1890int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
1891 int nb_sectors, QEMUIOVector *qiov)
1892{
1893 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
1894
f08f2dda
SH
1895 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
1896}
1897
1898int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
1899 int64_t sector_num, int nb_sectors)
1900{
1901 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
1902
1903 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
1904 BDRV_REQ_ZERO_WRITE);
c5fbe571
SH
1905}
1906
83f64091
FB
1907/**
1908 * Truncate file to 'offset' bytes (needed only for file protocols)
1909 */
1910int bdrv_truncate(BlockDriverState *bs, int64_t offset)
1911{
1912 BlockDriver *drv = bs->drv;
51762288 1913 int ret;
83f64091 1914 if (!drv)
19cb3738 1915 return -ENOMEDIUM;
83f64091
FB
1916 if (!drv->bdrv_truncate)
1917 return -ENOTSUP;
59f2689d
NS
1918 if (bs->read_only)
1919 return -EACCES;
8591675f
MT
1920 if (bdrv_in_use(bs))
1921 return -EBUSY;
51762288
SH
1922 ret = drv->bdrv_truncate(bs, offset);
1923 if (ret == 0) {
1924 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
145feb17 1925 bdrv_dev_resize_cb(bs);
51762288
SH
1926 }
1927 return ret;
83f64091
FB
1928}
1929
4a1d5e1f
FZ
1930/**
1931 * Length of a allocated file in bytes. Sparse files are counted by actual
1932 * allocated space. Return < 0 if error or unknown.
1933 */
1934int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
1935{
1936 BlockDriver *drv = bs->drv;
1937 if (!drv) {
1938 return -ENOMEDIUM;
1939 }
1940 if (drv->bdrv_get_allocated_file_size) {
1941 return drv->bdrv_get_allocated_file_size(bs);
1942 }
1943 if (bs->file) {
1944 return bdrv_get_allocated_file_size(bs->file);
1945 }
1946 return -ENOTSUP;
1947}
1948
83f64091
FB
1949/**
1950 * Length of a file in bytes. Return < 0 if error or unknown.
1951 */
1952int64_t bdrv_getlength(BlockDriverState *bs)
1953{
1954 BlockDriver *drv = bs->drv;
1955 if (!drv)
19cb3738 1956 return -ENOMEDIUM;
51762288 1957
2c6942fa 1958 if (bs->growable || bdrv_dev_has_removable_media(bs)) {
46a4e4e6
SH
1959 if (drv->bdrv_getlength) {
1960 return drv->bdrv_getlength(bs);
1961 }
83f64091 1962 }
46a4e4e6 1963 return bs->total_sectors * BDRV_SECTOR_SIZE;
fc01f7e7
FB
1964}
1965
19cb3738 1966/* return 0 as number of sectors if no device present or error */
96b8f136 1967void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
fc01f7e7 1968{
19cb3738
FB
1969 int64_t length;
1970 length = bdrv_getlength(bs);
1971 if (length < 0)
1972 length = 0;
1973 else
6ea44308 1974 length = length >> BDRV_SECTOR_BITS;
19cb3738 1975 *nb_sectors_ptr = length;
fc01f7e7 1976}
cf98951b 1977
f3d54fc4
AL
/*
 * One entry of the MSDOS partition table, as read from sector 0 by
 * guess_disk_lchs().  The struct is packed; nr_sects is converted with
 * le32_to_cpu() before use.
 */
struct partition {
        uint8_t boot_ind;           /* 0x80 - active */
        uint8_t head;               /* starting head */
        uint8_t sector;             /* starting sector */
        uint8_t cyl;                /* starting cylinder */
        uint8_t sys_ind;            /* What partition type */
        uint8_t end_head;           /* end head */
        uint8_t end_sector;         /* end sector */
        uint8_t end_cyl;            /* end cylinder */
        uint32_t start_sect;        /* starting sector counting from 0 */
        uint32_t nr_sects;          /* nr of sectors in partition */
} QEMU_PACKED;
f3d54fc4
AL
1990
1991/* try to guess the disk logical geometry from the MSDOS partition table. Return 0 if OK, -1 if could not guess */
1992static int guess_disk_lchs(BlockDriverState *bs,
1993 int *pcylinders, int *pheads, int *psectors)
1994{
eb5a3165 1995 uint8_t buf[BDRV_SECTOR_SIZE];
f3d54fc4
AL
1996 int ret, i, heads, sectors, cylinders;
1997 struct partition *p;
1998 uint32_t nr_sects;
a38131b6 1999 uint64_t nb_sectors;
498e386c 2000 bool enabled;
f3d54fc4
AL
2001
2002 bdrv_get_geometry(bs, &nb_sectors);
2003
498e386c
ZYW
2004 /**
2005 * The function will be invoked during startup not only in sync I/O mode,
2006 * but also in async I/O mode. So the I/O throttling function has to
2007 * be disabled temporarily here, not permanently.
2008 */
2009 enabled = bs->io_limits_enabled;
2010 bs->io_limits_enabled = false;
f3d54fc4 2011 ret = bdrv_read(bs, 0, buf, 1);
498e386c 2012 bs->io_limits_enabled = enabled;
f3d54fc4
AL
2013 if (ret < 0)
2014 return -1;
2015 /* test msdos magic */
2016 if (buf[510] != 0x55 || buf[511] != 0xaa)
2017 return -1;
2018 for(i = 0; i < 4; i++) {
2019 p = ((struct partition *)(buf + 0x1be)) + i;
2020 nr_sects = le32_to_cpu(p->nr_sects);
2021 if (nr_sects && p->end_head) {
2022 /* We make the assumption that the partition terminates on
2023 a cylinder boundary */
2024 heads = p->end_head + 1;
2025 sectors = p->end_sector & 63;
2026 if (sectors == 0)
2027 continue;
2028 cylinders = nb_sectors / (heads * sectors);
2029 if (cylinders < 1 || cylinders > 16383)
2030 continue;
2031 *pheads = heads;
2032 *psectors = sectors;
2033 *pcylinders = cylinders;
2034#if 0
2035 printf("guessed geometry: LCHS=%d %d %d\n",
2036 cylinders, heads, sectors);
2037#endif
2038 return 0;
2039 }
2040 }
2041 return -1;
2042}
2043
2044void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
2045{
2046 int translation, lba_detected = 0;
2047 int cylinders, heads, secs;
a38131b6 2048 uint64_t nb_sectors;
f3d54fc4
AL
2049
2050 /* if a geometry hint is available, use it */
2051 bdrv_get_geometry(bs, &nb_sectors);
2052 bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
2053 translation = bdrv_get_translation_hint(bs);
2054 if (cylinders != 0) {
2055 *pcyls = cylinders;
2056 *pheads = heads;
2057 *psecs = secs;
2058 } else {
2059 if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
2060 if (heads > 16) {
2061 /* if heads > 16, it means that a BIOS LBA
2062 translation was active, so the default
2063 hardware geometry is OK */
2064 lba_detected = 1;
2065 goto default_geometry;
2066 } else {
2067 *pcyls = cylinders;
2068 *pheads = heads;
2069 *psecs = secs;
2070 /* disable any translation to be in sync with
2071 the logical geometry */
2072 if (translation == BIOS_ATA_TRANSLATION_AUTO) {
2073 bdrv_set_translation_hint(bs,
2074 BIOS_ATA_TRANSLATION_NONE);
2075 }
2076 }
2077 } else {
2078 default_geometry:
2079 /* if no geometry, use a standard physical disk geometry */
2080 cylinders = nb_sectors / (16 * 63);
2081
2082 if (cylinders > 16383)
2083 cylinders = 16383;
2084 else if (cylinders < 2)
2085 cylinders = 2;
2086 *pcyls = cylinders;
2087 *pheads = 16;
2088 *psecs = 63;
2089 if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
2090 if ((*pcyls * *pheads) <= 131072) {
2091 bdrv_set_translation_hint(bs,
2092 BIOS_ATA_TRANSLATION_LARGE);
2093 } else {
2094 bdrv_set_translation_hint(bs,
2095 BIOS_ATA_TRANSLATION_LBA);
2096 }
2097 }
2098 }
2099 bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
2100 }
2101}
2102
5fafdf24 2103void bdrv_set_geometry_hint(BlockDriverState *bs,
b338082b
FB
2104 int cyls, int heads, int secs)
2105{
2106 bs->cyls = cyls;
2107 bs->heads = heads;
2108 bs->secs = secs;
2109}
2110
46d4767d
FB
2111void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
2112{
2113 bs->translation = translation;
2114}
2115
5fafdf24 2116void bdrv_get_geometry_hint(BlockDriverState *bs,
b338082b
FB
2117 int *pcyls, int *pheads, int *psecs)
2118{
2119 *pcyls = bs->cyls;
2120 *pheads = bs->heads;
2121 *psecs = bs->secs;
2122}
2123
0563e191
ZYW
2124/* throttling disk io limits */
2125void bdrv_set_io_limits(BlockDriverState *bs,
2126 BlockIOLimit *io_limits)
2127{
2128 bs->io_limits = *io_limits;
2129 bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
2130}
2131
5bbdbb46
BS
2132/* Recognize floppy formats */
2133typedef struct FDFormat {
2134 FDriveType drive;
2135 uint8_t last_sect;
2136 uint8_t max_track;
2137 uint8_t max_head;
f8d3d128 2138 FDriveRate rate;
5bbdbb46
BS
2139} FDFormat;
2140
2141static const FDFormat fd_formats[] = {
2142 /* First entry is default format */
2143 /* 1.44 MB 3"1/2 floppy disks */
f8d3d128
HP
2144 { FDRIVE_DRV_144, 18, 80, 1, FDRIVE_RATE_500K, },
2145 { FDRIVE_DRV_144, 20, 80, 1, FDRIVE_RATE_500K, },
2146 { FDRIVE_DRV_144, 21, 80, 1, FDRIVE_RATE_500K, },
2147 { FDRIVE_DRV_144, 21, 82, 1, FDRIVE_RATE_500K, },
2148 { FDRIVE_DRV_144, 21, 83, 1, FDRIVE_RATE_500K, },
2149 { FDRIVE_DRV_144, 22, 80, 1, FDRIVE_RATE_500K, },
2150 { FDRIVE_DRV_144, 23, 80, 1, FDRIVE_RATE_500K, },
2151 { FDRIVE_DRV_144, 24, 80, 1, FDRIVE_RATE_500K, },
5bbdbb46 2152 /* 2.88 MB 3"1/2 floppy disks */
f8d3d128
HP
2153 { FDRIVE_DRV_288, 36, 80, 1, FDRIVE_RATE_1M, },
2154 { FDRIVE_DRV_288, 39, 80, 1, FDRIVE_RATE_1M, },
2155 { FDRIVE_DRV_288, 40, 80, 1, FDRIVE_RATE_1M, },
2156 { FDRIVE_DRV_288, 44, 80, 1, FDRIVE_RATE_1M, },
2157 { FDRIVE_DRV_288, 48, 80, 1, FDRIVE_RATE_1M, },
5bbdbb46 2158 /* 720 kB 3"1/2 floppy disks */
f8d3d128
HP
2159 { FDRIVE_DRV_144, 9, 80, 1, FDRIVE_RATE_250K, },
2160 { FDRIVE_DRV_144, 10, 80, 1, FDRIVE_RATE_250K, },
2161 { FDRIVE_DRV_144, 10, 82, 1, FDRIVE_RATE_250K, },
2162 { FDRIVE_DRV_144, 10, 83, 1, FDRIVE_RATE_250K, },
2163 { FDRIVE_DRV_144, 13, 80, 1, FDRIVE_RATE_250K, },
2164 { FDRIVE_DRV_144, 14, 80, 1, FDRIVE_RATE_250K, },
5bbdbb46 2165 /* 1.2 MB 5"1/4 floppy disks */
f8d3d128
HP
2166 { FDRIVE_DRV_120, 15, 80, 1, FDRIVE_RATE_500K, },
2167 { FDRIVE_DRV_120, 18, 80, 1, FDRIVE_RATE_500K, },
2168 { FDRIVE_DRV_120, 18, 82, 1, FDRIVE_RATE_500K, },
2169 { FDRIVE_DRV_120, 18, 83, 1, FDRIVE_RATE_500K, },
2170 { FDRIVE_DRV_120, 20, 80, 1, FDRIVE_RATE_500K, },
5bbdbb46 2171 /* 720 kB 5"1/4 floppy disks */
f8d3d128
HP
2172 { FDRIVE_DRV_120, 9, 80, 1, FDRIVE_RATE_250K, },
2173 { FDRIVE_DRV_120, 11, 80, 1, FDRIVE_RATE_250K, },
5bbdbb46 2174 /* 360 kB 5"1/4 floppy disks */
f8d3d128
HP
2175 { FDRIVE_DRV_120, 9, 40, 1, FDRIVE_RATE_300K, },
2176 { FDRIVE_DRV_120, 9, 40, 0, FDRIVE_RATE_300K, },
2177 { FDRIVE_DRV_120, 10, 41, 1, FDRIVE_RATE_300K, },
2178 { FDRIVE_DRV_120, 10, 42, 1, FDRIVE_RATE_300K, },
5bbdbb46 2179 /* 320 kB 5"1/4 floppy disks */
f8d3d128
HP
2180 { FDRIVE_DRV_120, 8, 40, 1, FDRIVE_RATE_250K, },
2181 { FDRIVE_DRV_120, 8, 40, 0, FDRIVE_RATE_250K, },
5bbdbb46 2182 /* 360 kB must match 5"1/4 better than 3"1/2... */
f8d3d128 2183 { FDRIVE_DRV_144, 9, 80, 0, FDRIVE_RATE_250K, },
5bbdbb46 2184 /* end */
f8d3d128 2185 { FDRIVE_DRV_NONE, -1, -1, 0, 0, },
5bbdbb46
BS
2186};
2187
2188void bdrv_get_floppy_geometry_hint(BlockDriverState *bs, int *nb_heads,
2189 int *max_track, int *last_sect,
f8d3d128
HP
2190 FDriveType drive_in, FDriveType *drive,
2191 FDriveRate *rate)
5bbdbb46
BS
2192{
2193 const FDFormat *parse;
2194 uint64_t nb_sectors, size;
2195 int i, first_match, match;
2196
2197 bdrv_get_geometry_hint(bs, nb_heads, max_track, last_sect);
2198 if (*nb_heads != 0 && *max_track != 0 && *last_sect != 0) {
2199 /* User defined disk */
f8d3d128 2200 *rate = FDRIVE_RATE_500K;
5bbdbb46
BS
2201 } else {
2202 bdrv_get_geometry(bs, &nb_sectors);
2203 match = -1;
2204 first_match = -1;
2205 for (i = 0; ; i++) {
2206 parse = &fd_formats[i];
2207 if (parse->drive == FDRIVE_DRV_NONE) {
2208 break;
2209 }
2210 if (drive_in == parse->drive ||
2211 drive_in == FDRIVE_DRV_NONE) {
2212 size = (parse->max_head + 1) * parse->max_track *
2213 parse->last_sect;
2214 if (nb_sectors == size) {
2215 match = i;
2216 break;
2217 }
2218 if (first_match == -1) {
2219 first_match = i;
2220 }
2221 }
2222 }
2223 if (match == -1) {
2224 if (first_match == -1) {
2225 match = 1;
2226 } else {
2227 match = first_match;
2228 }
2229 parse = &fd_formats[match];
2230 }
2231 *nb_heads = parse->max_head + 1;
2232 *max_track = parse->max_track;
2233 *last_sect = parse->last_sect;
2234 *drive = parse->drive;
f8d3d128 2235 *rate = parse->rate;
5bbdbb46
BS
2236 }
2237}
2238
46d4767d
FB
2239int bdrv_get_translation_hint(BlockDriverState *bs)
2240{
2241 return bs->translation;
2242}
2243
abd7f68d
MA
2244void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
2245 BlockErrorAction on_write_error)
2246{
2247 bs->on_read_error = on_read_error;
2248 bs->on_write_error = on_write_error;
2249}
2250
2251BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
2252{
2253 return is_read ? bs->on_read_error : bs->on_write_error;
2254}
2255
b338082b
FB
2256int bdrv_is_read_only(BlockDriverState *bs)
2257{
2258 return bs->read_only;
2259}
2260
985a03b0
TS
2261int bdrv_is_sg(BlockDriverState *bs)
2262{
2263 return bs->sg;
2264}
2265
e900a7b7
CH
2266int bdrv_enable_write_cache(BlockDriverState *bs)
2267{
2268 return bs->enable_write_cache;
2269}
2270
ea2384d3
FB
2271int bdrv_is_encrypted(BlockDriverState *bs)
2272{
2273 if (bs->backing_hd && bs->backing_hd->encrypted)
2274 return 1;
2275 return bs->encrypted;
2276}
2277
c0f4ce77
AL
2278int bdrv_key_required(BlockDriverState *bs)
2279{
2280 BlockDriverState *backing_hd = bs->backing_hd;
2281
2282 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
2283 return 1;
2284 return (bs->encrypted && !bs->valid_key);
2285}
2286
ea2384d3
FB
2287int bdrv_set_key(BlockDriverState *bs, const char *key)
2288{
2289 int ret;
2290 if (bs->backing_hd && bs->backing_hd->encrypted) {
2291 ret = bdrv_set_key(bs->backing_hd, key);
2292 if (ret < 0)
2293 return ret;
2294 if (!bs->encrypted)
2295 return 0;
2296 }
fd04a2ae
SH
2297 if (!bs->encrypted) {
2298 return -EINVAL;
2299 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
2300 return -ENOMEDIUM;
2301 }
c0f4ce77 2302 ret = bs->drv->bdrv_set_key(bs, key);
bb5fc20f
AL
2303 if (ret < 0) {
2304 bs->valid_key = 0;
2305 } else if (!bs->valid_key) {
2306 bs->valid_key = 1;
2307 /* call the change callback now, we skipped it on open */
7d4b4ba5 2308 bdrv_dev_change_media_cb(bs, true);
bb5fc20f 2309 }
c0f4ce77 2310 return ret;
ea2384d3
FB
2311}
2312
2313void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
2314{
19cb3738 2315 if (!bs->drv) {
ea2384d3
FB
2316 buf[0] = '\0';
2317 } else {
2318 pstrcpy(buf, buf_size, bs->drv->format_name);
2319 }
2320}
2321
5fafdf24 2322void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
ea2384d3
FB
2323 void *opaque)
2324{
2325 BlockDriver *drv;
2326
8a22f02a 2327 QLIST_FOREACH(drv, &bdrv_drivers, list) {
ea2384d3
FB
2328 it(opaque, drv->format_name);
2329 }
2330}
2331
b338082b
FB
2332BlockDriverState *bdrv_find(const char *name)
2333{
2334 BlockDriverState *bs;
2335
1b7bdbc1
SH
2336 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2337 if (!strcmp(name, bs->device_name)) {
b338082b 2338 return bs;
1b7bdbc1 2339 }
b338082b
FB
2340 }
2341 return NULL;
2342}
2343
2f399b0a
MA
2344BlockDriverState *bdrv_next(BlockDriverState *bs)
2345{
2346 if (!bs) {
2347 return QTAILQ_FIRST(&bdrv_states);
2348 }
2349 return QTAILQ_NEXT(bs, list);
2350}
2351
51de9760 2352void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
81d0912d
FB
2353{
2354 BlockDriverState *bs;
2355
1b7bdbc1 2356 QTAILQ_FOREACH(bs, &bdrv_states, list) {
51de9760 2357 it(opaque, bs);
81d0912d
FB
2358 }
2359}
2360
ea2384d3
FB
2361const char *bdrv_get_device_name(BlockDriverState *bs)
2362{
2363 return bs->device_name;
2364}
2365
c6ca28d6
AL
2366void bdrv_flush_all(void)
2367{
2368 BlockDriverState *bs;
2369
1b7bdbc1 2370 QTAILQ_FOREACH(bs, &bdrv_states, list) {
29cdb251 2371 bdrv_flush(bs);
1b7bdbc1 2372 }
c6ca28d6
AL
2373}
2374
f2feebbd
KW
2375int bdrv_has_zero_init(BlockDriverState *bs)
2376{
2377 assert(bs->drv);
2378
336c1c12
KW
2379 if (bs->drv->bdrv_has_zero_init) {
2380 return bs->drv->bdrv_has_zero_init(bs);
f2feebbd
KW
2381 }
2382
2383 return 1;
2384}
2385
376ae3f1
SH
2386typedef struct BdrvCoIsAllocatedData {
2387 BlockDriverState *bs;
2388 int64_t sector_num;
2389 int nb_sectors;
2390 int *pnum;
2391 int ret;
2392 bool done;
2393} BdrvCoIsAllocatedData;
2394
f58c7b35
TS
2395/*
2396 * Returns true iff the specified sector is present in the disk image. Drivers
2397 * not implementing the functionality are assumed to not support backing files,
2398 * hence all their sectors are reported as allocated.
2399 *
bd9533e3
SH
2400 * If 'sector_num' is beyond the end of the disk image the return value is 0
2401 * and 'pnum' is set to 0.
2402 *
f58c7b35
TS
2403 * 'pnum' is set to the number of sectors (including and immediately following
2404 * the specified sector) that are known to be in the same
2405 * allocated/unallocated state.
2406 *
bd9533e3
SH
2407 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
2408 * beyond the end of the disk image it will be clamped.
f58c7b35 2409 */
060f51c9
SH
2410int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
2411 int nb_sectors, int *pnum)
f58c7b35 2412{
bd9533e3
SH
2413 int64_t n;
2414
2415 if (sector_num >= bs->total_sectors) {
2416 *pnum = 0;
2417 return 0;
2418 }
2419
2420 n = bs->total_sectors - sector_num;
2421 if (n < nb_sectors) {
2422 nb_sectors = n;
2423 }
2424
6aebab14 2425 if (!bs->drv->bdrv_co_is_allocated) {
bd9533e3 2426 *pnum = nb_sectors;
f58c7b35
TS
2427 return 1;
2428 }
6aebab14 2429
060f51c9
SH
2430 return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
2431}
2432
2433/* Coroutine wrapper for bdrv_is_allocated() */
2434static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
2435{
2436 BdrvCoIsAllocatedData *data = opaque;
2437 BlockDriverState *bs = data->bs;
2438
2439 data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
2440 data->pnum);
2441 data->done = true;
2442}
2443
2444/*
2445 * Synchronous wrapper around bdrv_co_is_allocated().
2446 *
2447 * See bdrv_co_is_allocated() for details.
2448 */
2449int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
2450 int *pnum)
2451{
6aebab14
SH
2452 Coroutine *co;
2453 BdrvCoIsAllocatedData data = {
2454 .bs = bs,
2455 .sector_num = sector_num,
2456 .nb_sectors = nb_sectors,
2457 .pnum = pnum,
2458 .done = false,
2459 };
2460
2461 co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
2462 qemu_coroutine_enter(co, &data);
2463 while (!data.done) {
2464 qemu_aio_wait();
2465 }
2466 return data.ret;
f58c7b35
TS
2467}
2468
b2023818 2469BlockInfoList *qmp_query_block(Error **errp)
b338082b 2470{
b2023818 2471 BlockInfoList *head = NULL, *cur_item = NULL;
b338082b
FB
2472 BlockDriverState *bs;
2473
1b7bdbc1 2474 QTAILQ_FOREACH(bs, &bdrv_states, list) {
b2023818 2475 BlockInfoList *info = g_malloc0(sizeof(*info));
d15e5465 2476
b2023818
LC
2477 info->value = g_malloc0(sizeof(*info->value));
2478 info->value->device = g_strdup(bs->device_name);
2479 info->value->type = g_strdup("unknown");
2480 info->value->locked = bdrv_dev_is_medium_locked(bs);
2481 info->value->removable = bdrv_dev_has_removable_media(bs);
d15e5465 2482
e4def80b 2483 if (bdrv_dev_has_removable_media(bs)) {
b2023818
LC
2484 info->value->has_tray_open = true;
2485 info->value->tray_open = bdrv_dev_is_tray_open(bs);
e4def80b 2486 }
f04ef601
LC
2487
2488 if (bdrv_iostatus_is_enabled(bs)) {
b2023818
LC
2489 info->value->has_io_status = true;
2490 info->value->io_status = bs->iostatus;
f04ef601
LC
2491 }
2492
19cb3738 2493 if (bs->drv) {
b2023818
LC
2494 info->value->has_inserted = true;
2495 info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
2496 info->value->inserted->file = g_strdup(bs->filename);
2497 info->value->inserted->ro = bs->read_only;
2498 info->value->inserted->drv = g_strdup(bs->drv->format_name);
2499 info->value->inserted->encrypted = bs->encrypted;
2500 if (bs->backing_file[0]) {
2501 info->value->inserted->has_backing_file = true;
2502 info->value->inserted->backing_file = g_strdup(bs->backing_file);
376253ec 2503 }
727f005e
ZYW
2504
2505 if (bs->io_limits_enabled) {
2506 info->value->inserted->bps =
2507 bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
2508 info->value->inserted->bps_rd =
2509 bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
2510 info->value->inserted->bps_wr =
2511 bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
2512 info->value->inserted->iops =
2513 bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
2514 info->value->inserted->iops_rd =
2515 bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
2516 info->value->inserted->iops_wr =
2517 bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
2518 }
b2023818 2519 }
d15e5465 2520
b2023818
LC
2521 /* XXX: waiting for the qapi to support GSList */
2522 if (!cur_item) {
2523 head = cur_item = info;
2524 } else {
2525 cur_item->next = info;
2526 cur_item = info;
b338082b 2527 }
b338082b 2528 }
d15e5465 2529
b2023818 2530 return head;
b338082b 2531}
a36e69dd 2532
f11f57e4
LC
2533/* Consider exposing this as a full fledged QMP command */
2534static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
2535{
2536 BlockStats *s;
2537
2538 s = g_malloc0(sizeof(*s));
2539
2540 if (bs->device_name[0]) {
2541 s->has_device = true;
2542 s->device = g_strdup(bs->device_name);
294cc35f
KW
2543 }
2544
f11f57e4
LC
2545 s->stats = g_malloc0(sizeof(*s->stats));
2546 s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
2547 s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
2548 s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
2549 s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
2550 s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
2551 s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
2552 s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
2553 s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
2554 s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];
2555
294cc35f 2556 if (bs->file) {
f11f57e4
LC
2557 s->has_parent = true;
2558 s->parent = qmp_query_blockstat(bs->file, NULL);
294cc35f
KW
2559 }
2560
f11f57e4 2561 return s;
294cc35f
KW
2562}
2563
f11f57e4 2564BlockStatsList *qmp_query_blockstats(Error **errp)
218a536a 2565{
f11f57e4 2566 BlockStatsList *head = NULL, *cur_item = NULL;
a36e69dd
TS
2567 BlockDriverState *bs;
2568
1b7bdbc1 2569 QTAILQ_FOREACH(bs, &bdrv_states, list) {
f11f57e4
LC
2570 BlockStatsList *info = g_malloc0(sizeof(*info));
2571 info->value = qmp_query_blockstat(bs, NULL);
2572
2573 /* XXX: waiting for the qapi to support GSList */
2574 if (!cur_item) {
2575 head = cur_item = info;
2576 } else {
2577 cur_item->next = info;
2578 cur_item = info;
2579 }
a36e69dd 2580 }
218a536a 2581
f11f57e4 2582 return head;
a36e69dd 2583}
ea2384d3 2584
045df330
AL
2585const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
2586{
2587 if (bs->backing_hd && bs->backing_hd->encrypted)
2588 return bs->backing_file;
2589 else if (bs->encrypted)
2590 return bs->filename;
2591 else
2592 return NULL;
2593}
2594
5fafdf24 2595void bdrv_get_backing_filename(BlockDriverState *bs,
83f64091
FB
2596 char *filename, int filename_size)
2597{
3574c608 2598 pstrcpy(filename, filename_size, bs->backing_file);
83f64091
FB
2599}
2600
5fafdf24 2601int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
faea38e7
FB
2602 const uint8_t *buf, int nb_sectors)
2603{
2604 BlockDriver *drv = bs->drv;
2605 if (!drv)
19cb3738 2606 return -ENOMEDIUM;
faea38e7
FB
2607 if (!drv->bdrv_write_compressed)
2608 return -ENOTSUP;
fbb7b4e0
KW
2609 if (bdrv_check_request(bs, sector_num, nb_sectors))
2610 return -EIO;
a55eb92c 2611
c6d22830 2612 if (bs->dirty_bitmap) {
7cd1e32a 2613 set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2614 }
a55eb92c 2615
faea38e7
FB
2616 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
2617}
3b46e624 2618
faea38e7
FB
2619int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2620{
2621 BlockDriver *drv = bs->drv;
2622 if (!drv)
19cb3738 2623 return -ENOMEDIUM;
faea38e7
FB
2624 if (!drv->bdrv_get_info)
2625 return -ENOTSUP;
2626 memset(bdi, 0, sizeof(*bdi));
2627 return drv->bdrv_get_info(bs, bdi);
2628}
2629
45566e9c
CH
2630int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2631 int64_t pos, int size)
178e08a5
AL
2632{
2633 BlockDriver *drv = bs->drv;
2634 if (!drv)
2635 return -ENOMEDIUM;
7cdb1f6d
MK
2636 if (drv->bdrv_save_vmstate)
2637 return drv->bdrv_save_vmstate(bs, buf, pos, size);
2638 if (bs->file)
2639 return bdrv_save_vmstate(bs->file, buf, pos, size);
2640 return -ENOTSUP;
178e08a5
AL
2641}
2642
45566e9c
CH
2643int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2644 int64_t pos, int size)
178e08a5
AL
2645{
2646 BlockDriver *drv = bs->drv;
2647 if (!drv)
2648 return -ENOMEDIUM;
7cdb1f6d
MK
2649 if (drv->bdrv_load_vmstate)
2650 return drv->bdrv_load_vmstate(bs, buf, pos, size);
2651 if (bs->file)
2652 return bdrv_load_vmstate(bs->file, buf, pos, size);
2653 return -ENOTSUP;
178e08a5
AL
2654}
2655
8b9b0cc2
KW
2656void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
2657{
2658 BlockDriver *drv = bs->drv;
2659
2660 if (!drv || !drv->bdrv_debug_event) {
2661 return;
2662 }
2663
2664 return drv->bdrv_debug_event(bs, event);
2665
2666}
2667
faea38e7
FB
2668/**************************************************************/
2669/* handling of snapshots */
2670
feeee5ac
MDCF
2671int bdrv_can_snapshot(BlockDriverState *bs)
2672{
2673 BlockDriver *drv = bs->drv;
07b70bfb 2674 if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
feeee5ac
MDCF
2675 return 0;
2676 }
2677
2678 if (!drv->bdrv_snapshot_create) {
2679 if (bs->file != NULL) {
2680 return bdrv_can_snapshot(bs->file);
2681 }
2682 return 0;
2683 }
2684
2685 return 1;
2686}
2687
199630b6
BS
2688int bdrv_is_snapshot(BlockDriverState *bs)
2689{
2690 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
2691}
2692
f9092b10
MA
2693BlockDriverState *bdrv_snapshots(void)
2694{
2695 BlockDriverState *bs;
2696
3ac906f7 2697 if (bs_snapshots) {
f9092b10 2698 return bs_snapshots;
3ac906f7 2699 }
f9092b10
MA
2700
2701 bs = NULL;
2702 while ((bs = bdrv_next(bs))) {
2703 if (bdrv_can_snapshot(bs)) {
3ac906f7
MA
2704 bs_snapshots = bs;
2705 return bs;
f9092b10
MA
2706 }
2707 }
2708 return NULL;
f9092b10
MA
2709}
2710
5fafdf24 2711int bdrv_snapshot_create(BlockDriverState *bs,
faea38e7
FB
2712 QEMUSnapshotInfo *sn_info)
2713{
2714 BlockDriver *drv = bs->drv;
2715 if (!drv)
19cb3738 2716 return -ENOMEDIUM;
7cdb1f6d
MK
2717 if (drv->bdrv_snapshot_create)
2718 return drv->bdrv_snapshot_create(bs, sn_info);
2719 if (bs->file)
2720 return bdrv_snapshot_create(bs->file, sn_info);
2721 return -ENOTSUP;
faea38e7
FB
2722}
2723
5fafdf24 2724int bdrv_snapshot_goto(BlockDriverState *bs,
faea38e7
FB
2725 const char *snapshot_id)
2726{
2727 BlockDriver *drv = bs->drv;
7cdb1f6d
MK
2728 int ret, open_ret;
2729
faea38e7 2730 if (!drv)
19cb3738 2731 return -ENOMEDIUM;
7cdb1f6d
MK
2732 if (drv->bdrv_snapshot_goto)
2733 return drv->bdrv_snapshot_goto(bs, snapshot_id);
2734
2735 if (bs->file) {
2736 drv->bdrv_close(bs);
2737 ret = bdrv_snapshot_goto(bs->file, snapshot_id);
2738 open_ret = drv->bdrv_open(bs, bs->open_flags);
2739 if (open_ret < 0) {
2740 bdrv_delete(bs->file);
2741 bs->drv = NULL;
2742 return open_ret;
2743 }
2744 return ret;
2745 }
2746
2747 return -ENOTSUP;
faea38e7
FB
2748}
2749
2750int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
2751{
2752 BlockDriver *drv = bs->drv;
2753 if (!drv)
19cb3738 2754 return -ENOMEDIUM;
7cdb1f6d
MK
2755 if (drv->bdrv_snapshot_delete)
2756 return drv->bdrv_snapshot_delete(bs, snapshot_id);
2757 if (bs->file)
2758 return bdrv_snapshot_delete(bs->file, snapshot_id);
2759 return -ENOTSUP;
faea38e7
FB
2760}
2761
5fafdf24 2762int bdrv_snapshot_list(BlockDriverState *bs,
faea38e7
FB
2763 QEMUSnapshotInfo **psn_info)
2764{
2765 BlockDriver *drv = bs->drv;
2766 if (!drv)
19cb3738 2767 return -ENOMEDIUM;
7cdb1f6d
MK
2768 if (drv->bdrv_snapshot_list)
2769 return drv->bdrv_snapshot_list(bs, psn_info);
2770 if (bs->file)
2771 return bdrv_snapshot_list(bs->file, psn_info);
2772 return -ENOTSUP;
faea38e7
FB
2773}
2774
51ef6727 2775int bdrv_snapshot_load_tmp(BlockDriverState *bs,
2776 const char *snapshot_name)
2777{
2778 BlockDriver *drv = bs->drv;
2779 if (!drv) {
2780 return -ENOMEDIUM;
2781 }
2782 if (!bs->read_only) {
2783 return -EINVAL;
2784 }
2785 if (drv->bdrv_snapshot_load_tmp) {
2786 return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
2787 }
2788 return -ENOTSUP;
2789}
2790
e8a6bb9c
MT
2791BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
2792 const char *backing_file)
2793{
2794 if (!bs->drv) {
2795 return NULL;
2796 }
2797
2798 if (bs->backing_hd) {
2799 if (strcmp(bs->backing_file, backing_file) == 0) {
2800 return bs->backing_hd;
2801 } else {
2802 return bdrv_find_backing_image(bs->backing_hd, backing_file);
2803 }
2804 }
2805
2806 return NULL;
2807}
2808
faea38e7
FB
#define NB_SUFFIXES 4

/*
 * Format @size into @buf (at most @buf_size bytes) in a short
 * human-readable form and return @buf.
 *
 * Values up to 999 are printed as plain integers.  Larger values are
 * scaled by powers of 1024 and suffixed with K, M, G or T: one decimal
 * is shown while the scaled value is below 10, otherwise it is rounded
 * to the nearest integer.  Values too large for "T" are still printed
 * with the T suffix.
 */
char *get_human_readable_size(char *buf, int buf_size, int64_t size)
{
    static const char suffixes[NB_SUFFIXES] = "KMGT";
    int64_t unit = 1024;
    int idx;

    if (size <= 999) {
        snprintf(buf, buf_size, "%" PRId64, size);
        return buf;
    }

    for (idx = 0; idx < NB_SUFFIXES; idx++, unit *= 1024) {
        if (size < (10 * unit)) {
            snprintf(buf, buf_size, "%0.1f%c",
                     (double)size / unit,
                     suffixes[idx]);
            break;
        }
        if (size < (1000 * unit) || idx == (NB_SUFFIXES - 1)) {
            snprintf(buf, buf_size, "%" PRId64 "%c",
                     ((size + (unit >> 1)) / unit),
                     suffixes[idx]);
            break;
        }
    }
    return buf;
}
2838
2839char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
2840{
2841 char buf1[128], date_buf[128], clock_buf[128];
3b9f94e1
FB
2842#ifdef _WIN32
2843 struct tm *ptm;
2844#else
faea38e7 2845 struct tm tm;
3b9f94e1 2846#endif
faea38e7
FB
2847 time_t ti;
2848 int64_t secs;
2849
2850 if (!sn) {
5fafdf24
TS
2851 snprintf(buf, buf_size,
2852 "%-10s%-20s%7s%20s%15s",
faea38e7
FB
2853 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
2854 } else {
2855 ti = sn->date_sec;
3b9f94e1
FB
2856#ifdef _WIN32
2857 ptm = localtime(&ti);
2858 strftime(date_buf, sizeof(date_buf),
2859 "%Y-%m-%d %H:%M:%S", ptm);
2860#else
faea38e7
FB
2861 localtime_r(&ti, &tm);
2862 strftime(date_buf, sizeof(date_buf),
2863 "%Y-%m-%d %H:%M:%S", &tm);
3b9f94e1 2864#endif
faea38e7
FB
2865 secs = sn->vm_clock_nsec / 1000000000;
2866 snprintf(clock_buf, sizeof(clock_buf),
2867 "%02d:%02d:%02d.%03d",
2868 (int)(secs / 3600),
2869 (int)((secs / 60) % 60),
5fafdf24 2870 (int)(secs % 60),
faea38e7
FB
2871 (int)((sn->vm_clock_nsec / 1000000) % 1000));
2872 snprintf(buf, buf_size,
5fafdf24 2873 "%-10s%-20s%7s%20s%15s",
faea38e7
FB
2874 sn->id_str, sn->name,
2875 get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
2876 date_buf,
2877 clock_buf);
2878 }
2879 return buf;
2880}
2881
ea2384d3 2882/**************************************************************/
83f64091 2883/* async I/Os */
ea2384d3 2884
3b69e4b9 2885BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
f141eafe 2886 QEMUIOVector *qiov, int nb_sectors,
3b69e4b9 2887 BlockDriverCompletionFunc *cb, void *opaque)
83f64091 2888{
bbf0a440
SH
2889 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
2890
b2a61371 2891 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
8c5873d6 2892 cb, opaque, false);
ea2384d3
FB
2893}
2894
f141eafe
AL
2895BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
2896 QEMUIOVector *qiov, int nb_sectors,
2897 BlockDriverCompletionFunc *cb, void *opaque)
ea2384d3 2898{
bbf0a440
SH
2899 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
2900
1a6e115b 2901 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
8c5873d6 2902 cb, opaque, true);
83f64091
FB
2903}
2904
40b4f539
KW
2905
2906typedef struct MultiwriteCB {
2907 int error;
2908 int num_requests;
2909 int num_callbacks;
2910 struct {
2911 BlockDriverCompletionFunc *cb;
2912 void *opaque;
2913 QEMUIOVector *free_qiov;
40b4f539
KW
2914 } callbacks[];
2915} MultiwriteCB;
2916
2917static void multiwrite_user_cb(MultiwriteCB *mcb)
2918{
2919 int i;
2920
2921 for (i = 0; i < mcb->num_callbacks; i++) {
2922 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
1e1ea48d
SH
2923 if (mcb->callbacks[i].free_qiov) {
2924 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
2925 }
7267c094 2926 g_free(mcb->callbacks[i].free_qiov);
40b4f539
KW
2927 }
2928}
2929
2930static void multiwrite_cb(void *opaque, int ret)
2931{
2932 MultiwriteCB *mcb = opaque;
2933
6d519a5f
SH
2934 trace_multiwrite_cb(mcb, ret);
2935
cb6d3ca0 2936 if (ret < 0 && !mcb->error) {
40b4f539 2937 mcb->error = ret;
40b4f539
KW
2938 }
2939
2940 mcb->num_requests--;
2941 if (mcb->num_requests == 0) {
de189a1b 2942 multiwrite_user_cb(mcb);
7267c094 2943 g_free(mcb);
40b4f539
KW
2944 }
2945}
2946
2947static int multiwrite_req_compare(const void *a, const void *b)
2948{
77be4366
CH
2949 const BlockRequest *req1 = a, *req2 = b;
2950
2951 /*
2952 * Note that we can't simply subtract req2->sector from req1->sector
2953 * here as that could overflow the return value.
2954 */
2955 if (req1->sector > req2->sector) {
2956 return 1;
2957 } else if (req1->sector < req2->sector) {
2958 return -1;
2959 } else {
2960 return 0;
2961 }
40b4f539
KW
2962}
2963
2964/*
2965 * Takes a bunch of requests and tries to merge them. Returns the number of
2966 * requests that remain after merging.
2967 */
2968static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
2969 int num_reqs, MultiwriteCB *mcb)
2970{
2971 int i, outidx;
2972
2973 // Sort requests by start sector
2974 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
2975
2976 // Check if adjacent requests touch the same clusters. If so, combine them,
2977 // filling up gaps with zero sectors.
2978 outidx = 0;
2979 for (i = 1; i < num_reqs; i++) {
2980 int merge = 0;
2981 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
2982
b6a127a1 2983 // Handle exactly sequential writes and overlapping writes.
40b4f539
KW
2984 if (reqs[i].sector <= oldreq_last) {
2985 merge = 1;
2986 }
2987
e2a305fb
CH
2988 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
2989 merge = 0;
2990 }
2991
40b4f539
KW
2992 if (merge) {
2993 size_t size;
7267c094 2994 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
40b4f539
KW
2995 qemu_iovec_init(qiov,
2996 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
2997
2998 // Add the first request to the merged one. If the requests are
2999 // overlapping, drop the last sectors of the first request.
3000 size = (reqs[i].sector - reqs[outidx].sector) << 9;
3001 qemu_iovec_concat(qiov, reqs[outidx].qiov, size);
3002
b6a127a1
PB
3003 // We should need to add any zeros between the two requests
3004 assert (reqs[i].sector <= oldreq_last);
40b4f539
KW
3005
3006 // Add the second request
3007 qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);
3008
cbf1dff2 3009 reqs[outidx].nb_sectors = qiov->size >> 9;
40b4f539
KW
3010 reqs[outidx].qiov = qiov;
3011
3012 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
3013 } else {
3014 outidx++;
3015 reqs[outidx].sector = reqs[i].sector;
3016 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
3017 reqs[outidx].qiov = reqs[i].qiov;
3018 }
3019 }
3020
3021 return outidx + 1;
3022}
3023
3024/*
3025 * Submit multiple AIO write requests at once.
3026 *
3027 * On success, the function returns 0 and all requests in the reqs array have
3028 * been submitted. In error case this function returns -1, and any of the
3029 * requests may or may not be submitted yet. In particular, this means that the
3030 * callback will be called for some of the requests, for others it won't. The
3031 * caller must check the error field of the BlockRequest to wait for the right
3032 * callbacks (if error != 0, no callback will be called).
3033 *
3034 * The implementation may modify the contents of the reqs array, e.g. to merge
3035 * requests. However, the fields opaque and error are left unmodified as they
3036 * are used to signal failure for a single request to the caller.
3037 */
3038int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
3039{
40b4f539
KW
3040 MultiwriteCB *mcb;
3041 int i;
3042
301db7c2
RH
3043 /* don't submit writes if we don't have a medium */
3044 if (bs->drv == NULL) {
3045 for (i = 0; i < num_reqs; i++) {
3046 reqs[i].error = -ENOMEDIUM;
3047 }
3048 return -1;
3049 }
3050
40b4f539
KW
3051 if (num_reqs == 0) {
3052 return 0;
3053 }
3054
3055 // Create MultiwriteCB structure
7267c094 3056 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
40b4f539
KW
3057 mcb->num_requests = 0;
3058 mcb->num_callbacks = num_reqs;
3059
3060 for (i = 0; i < num_reqs; i++) {
3061 mcb->callbacks[i].cb = reqs[i].cb;
3062 mcb->callbacks[i].opaque = reqs[i].opaque;
3063 }
3064
3065 // Check for mergable requests
3066 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
3067
6d519a5f
SH
3068 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
3069
df9309fb
PB
3070 /* Run the aio requests. */
3071 mcb->num_requests = num_reqs;
40b4f539 3072 for (i = 0; i < num_reqs; i++) {
ad54ae80 3073 bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
40b4f539 3074 reqs[i].nb_sectors, multiwrite_cb, mcb);
40b4f539
KW
3075 }
3076
3077 return 0;
40b4f539
KW
3078}
3079
83f64091 3080void bdrv_aio_cancel(BlockDriverAIOCB *acb)
83f64091 3081{
6bbff9a0 3082 acb->pool->cancel(acb);
83f64091
FB
3083}
3084
98f90dba
ZYW
3085/* block I/O throttling */
3086static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
3087 bool is_write, double elapsed_time, uint64_t *wait)
3088{
3089 uint64_t bps_limit = 0;
3090 double bytes_limit, bytes_base, bytes_res;
3091 double slice_time, wait_time;
3092
3093 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3094 bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
3095 } else if (bs->io_limits.bps[is_write]) {
3096 bps_limit = bs->io_limits.bps[is_write];
3097 } else {
3098 if (wait) {
3099 *wait = 0;
3100 }
3101
3102 return false;
3103 }
3104
3105 slice_time = bs->slice_end - bs->slice_start;
3106 slice_time /= (NANOSECONDS_PER_SECOND);
3107 bytes_limit = bps_limit * slice_time;
3108 bytes_base = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
3109 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3110 bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
3111 }
3112
3113 /* bytes_base: the bytes of data which have been read/written; and
3114 * it is obtained from the history statistic info.
3115 * bytes_res: the remaining bytes of data which need to be read/written.
3116 * (bytes_base + bytes_res) / bps_limit: used to calcuate
3117 * the total time for completing reading/writting all data.
3118 */
3119 bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
3120
3121 if (bytes_base + bytes_res <= bytes_limit) {
3122 if (wait) {
3123 *wait = 0;
3124 }
3125
3126 return false;
3127 }
3128
3129 /* Calc approx time to dispatch */
3130 wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
3131
3132 /* When the I/O rate at runtime exceeds the limits,
3133 * bs->slice_end need to be extended in order that the current statistic
3134 * info can be kept until the timer fire, so it is increased and tuned
3135 * based on the result of experiment.
3136 */
3137 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3138 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3139 if (wait) {
3140 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3141 }
3142
3143 return true;
3144}
3145
3146static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
3147 double elapsed_time, uint64_t *wait)
3148{
3149 uint64_t iops_limit = 0;
3150 double ios_limit, ios_base;
3151 double slice_time, wait_time;
3152
3153 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3154 iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
3155 } else if (bs->io_limits.iops[is_write]) {
3156 iops_limit = bs->io_limits.iops[is_write];
3157 } else {
3158 if (wait) {
3159 *wait = 0;
3160 }
3161
3162 return false;
3163 }
3164
3165 slice_time = bs->slice_end - bs->slice_start;
3166 slice_time /= (NANOSECONDS_PER_SECOND);
3167 ios_limit = iops_limit * slice_time;
3168 ios_base = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
3169 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3170 ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
3171 }
3172
3173 if (ios_base + 1 <= ios_limit) {
3174 if (wait) {
3175 *wait = 0;
3176 }
3177
3178 return false;
3179 }
3180
3181 /* Calc approx time to dispatch */
3182 wait_time = (ios_base + 1) / iops_limit;
3183 if (wait_time > elapsed_time) {
3184 wait_time = wait_time - elapsed_time;
3185 } else {
3186 wait_time = 0;
3187 }
3188
3189 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3190 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3191 if (wait) {
3192 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3193 }
3194
3195 return true;
3196}
3197
3198static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
3199 bool is_write, int64_t *wait)
3200{
3201 int64_t now, max_wait;
3202 uint64_t bps_wait = 0, iops_wait = 0;
3203 double elapsed_time;
3204 int bps_ret, iops_ret;
3205
3206 now = qemu_get_clock_ns(vm_clock);
3207 if ((bs->slice_start < now)
3208 && (bs->slice_end > now)) {
3209 bs->slice_end = now + bs->slice_time;
3210 } else {
3211 bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
3212 bs->slice_start = now;
3213 bs->slice_end = now + bs->slice_time;
3214
3215 bs->io_base.bytes[is_write] = bs->nr_bytes[is_write];
3216 bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];
3217
3218 bs->io_base.ios[is_write] = bs->nr_ops[is_write];
3219 bs->io_base.ios[!is_write] = bs->nr_ops[!is_write];
3220 }
3221
3222 elapsed_time = now - bs->slice_start;
3223 elapsed_time /= (NANOSECONDS_PER_SECOND);
3224
3225 bps_ret = bdrv_exceed_bps_limits(bs, nb_sectors,
3226 is_write, elapsed_time, &bps_wait);
3227 iops_ret = bdrv_exceed_iops_limits(bs, is_write,
3228 elapsed_time, &iops_wait);
3229 if (bps_ret || iops_ret) {
3230 max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
3231 if (wait) {
3232 *wait = max_wait;
3233 }
3234
3235 now = qemu_get_clock_ns(vm_clock);
3236 if (bs->slice_end < now + max_wait) {
3237 bs->slice_end = now + max_wait;
3238 }
3239
3240 return true;
3241 }
3242
3243 if (wait) {
3244 *wait = 0;
3245 }
3246
3247 return false;
3248}
ce1a14dc 3249
83f64091
FB
3250/**************************************************************/
3251/* async block device emulation */
3252
c16b5a2c
CH
3253typedef struct BlockDriverAIOCBSync {
3254 BlockDriverAIOCB common;
3255 QEMUBH *bh;
3256 int ret;
3257 /* vector translation state */
3258 QEMUIOVector *qiov;
3259 uint8_t *bounce;
3260 int is_write;
3261} BlockDriverAIOCBSync;
3262
3263static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
3264{
b666d239
KW
3265 BlockDriverAIOCBSync *acb =
3266 container_of(blockacb, BlockDriverAIOCBSync, common);
6a7ad299 3267 qemu_bh_delete(acb->bh);
36afc451 3268 acb->bh = NULL;
c16b5a2c
CH
3269 qemu_aio_release(acb);
3270}
3271
3272static AIOPool bdrv_em_aio_pool = {
3273 .aiocb_size = sizeof(BlockDriverAIOCBSync),
3274 .cancel = bdrv_aio_cancel_em,
3275};
3276
ce1a14dc 3277static void bdrv_aio_bh_cb(void *opaque)
83f64091 3278{
ce1a14dc 3279 BlockDriverAIOCBSync *acb = opaque;
f141eafe 3280
f141eafe
AL
3281 if (!acb->is_write)
3282 qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
ceb42de8 3283 qemu_vfree(acb->bounce);
ce1a14dc 3284 acb->common.cb(acb->common.opaque, acb->ret);
6a7ad299 3285 qemu_bh_delete(acb->bh);
36afc451 3286 acb->bh = NULL;
ce1a14dc 3287 qemu_aio_release(acb);
83f64091 3288}
beac80cd 3289
f141eafe
AL
3290static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
3291 int64_t sector_num,
3292 QEMUIOVector *qiov,
3293 int nb_sectors,
3294 BlockDriverCompletionFunc *cb,
3295 void *opaque,
3296 int is_write)
3297
83f64091 3298{
ce1a14dc 3299 BlockDriverAIOCBSync *acb;
ce1a14dc 3300
c16b5a2c 3301 acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
f141eafe
AL
3302 acb->is_write = is_write;
3303 acb->qiov = qiov;
e268ca52 3304 acb->bounce = qemu_blockalign(bs, qiov->size);
3f3aace8 3305 acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
f141eafe
AL
3306
3307 if (is_write) {
3308 qemu_iovec_to_buffer(acb->qiov, acb->bounce);
1ed20acf 3309 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
f141eafe 3310 } else {
1ed20acf 3311 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
f141eafe
AL
3312 }
3313
ce1a14dc 3314 qemu_bh_schedule(acb->bh);
f141eafe 3315
ce1a14dc 3316 return &acb->common;
beac80cd
FB
3317}
3318
f141eafe
AL
3319static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
3320 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
ce1a14dc 3321 BlockDriverCompletionFunc *cb, void *opaque)
beac80cd 3322{
f141eafe
AL
3323 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
3324}
83f64091 3325
f141eafe
AL
3326static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
3327 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3328 BlockDriverCompletionFunc *cb, void *opaque)
3329{
3330 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
beac80cd 3331}
beac80cd 3332
68485420
KW
3333
3334typedef struct BlockDriverAIOCBCoroutine {
3335 BlockDriverAIOCB common;
3336 BlockRequest req;
3337 bool is_write;
3338 QEMUBH* bh;
3339} BlockDriverAIOCBCoroutine;
3340
3341static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
3342{
3343 qemu_aio_flush();
3344}
3345
3346static AIOPool bdrv_em_co_aio_pool = {
3347 .aiocb_size = sizeof(BlockDriverAIOCBCoroutine),
3348 .cancel = bdrv_aio_co_cancel_em,
3349};
3350
35246a68 3351static void bdrv_co_em_bh(void *opaque)
68485420
KW
3352{
3353 BlockDriverAIOCBCoroutine *acb = opaque;
3354
3355 acb->common.cb(acb->common.opaque, acb->req.error);
3356 qemu_bh_delete(acb->bh);
3357 qemu_aio_release(acb);
3358}
3359
b2a61371
SH
3360/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
3361static void coroutine_fn bdrv_co_do_rw(void *opaque)
3362{
3363 BlockDriverAIOCBCoroutine *acb = opaque;
3364 BlockDriverState *bs = acb->common.bs;
3365
3366 if (!acb->is_write) {
3367 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
470c0504 3368 acb->req.nb_sectors, acb->req.qiov, 0);
b2a61371
SH
3369 } else {
3370 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
f08f2dda 3371 acb->req.nb_sectors, acb->req.qiov, 0);
b2a61371
SH
3372 }
3373
35246a68 3374 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
b2a61371
SH
3375 qemu_bh_schedule(acb->bh);
3376}
3377
68485420
KW
3378static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
3379 int64_t sector_num,
3380 QEMUIOVector *qiov,
3381 int nb_sectors,
3382 BlockDriverCompletionFunc *cb,
3383 void *opaque,
8c5873d6 3384 bool is_write)
68485420
KW
3385{
3386 Coroutine *co;
3387 BlockDriverAIOCBCoroutine *acb;
3388
3389 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3390 acb->req.sector = sector_num;
3391 acb->req.nb_sectors = nb_sectors;
3392 acb->req.qiov = qiov;
3393 acb->is_write = is_write;
3394
8c5873d6 3395 co = qemu_coroutine_create(bdrv_co_do_rw);
68485420
KW
3396 qemu_coroutine_enter(co, acb);
3397
3398 return &acb->common;
3399}
3400
07f07615 3401static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
b2e12bc6 3402{
07f07615
PB
3403 BlockDriverAIOCBCoroutine *acb = opaque;
3404 BlockDriverState *bs = acb->common.bs;
b2e12bc6 3405
07f07615
PB
3406 acb->req.error = bdrv_co_flush(bs);
3407 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
b2e12bc6 3408 qemu_bh_schedule(acb->bh);
b2e12bc6
CH
3409}
3410
07f07615 3411BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
016f5cf6
AG
3412 BlockDriverCompletionFunc *cb, void *opaque)
3413{
07f07615 3414 trace_bdrv_aio_flush(bs, opaque);
016f5cf6 3415
07f07615
PB
3416 Coroutine *co;
3417 BlockDriverAIOCBCoroutine *acb;
016f5cf6 3418
07f07615
PB
3419 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3420 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
3421 qemu_coroutine_enter(co, acb);
016f5cf6 3422
016f5cf6
AG
3423 return &acb->common;
3424}
3425
4265d620
PB
3426static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
3427{
3428 BlockDriverAIOCBCoroutine *acb = opaque;
3429 BlockDriverState *bs = acb->common.bs;
3430
3431 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
3432 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3433 qemu_bh_schedule(acb->bh);
3434}
3435
3436BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
3437 int64_t sector_num, int nb_sectors,
3438 BlockDriverCompletionFunc *cb, void *opaque)
3439{
3440 Coroutine *co;
3441 BlockDriverAIOCBCoroutine *acb;
3442
3443 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
3444
3445 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3446 acb->req.sector = sector_num;
3447 acb->req.nb_sectors = nb_sectors;
3448 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
3449 qemu_coroutine_enter(co, acb);
3450
3451 return &acb->common;
3452}
3453
ea2384d3
FB
3454void bdrv_init(void)
3455{
5efa9d5a 3456 module_call_init(MODULE_INIT_BLOCK);
ea2384d3 3457}
ce1a14dc 3458
eb852011
MA
3459void bdrv_init_with_whitelist(void)
3460{
3461 use_bdrv_whitelist = 1;
3462 bdrv_init();
3463}
3464
c16b5a2c
CH
3465void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
3466 BlockDriverCompletionFunc *cb, void *opaque)
ce1a14dc 3467{
ce1a14dc
PB
3468 BlockDriverAIOCB *acb;
3469
6bbff9a0
AL
3470 if (pool->free_aiocb) {
3471 acb = pool->free_aiocb;
3472 pool->free_aiocb = acb->next;
ce1a14dc 3473 } else {
7267c094 3474 acb = g_malloc0(pool->aiocb_size);
6bbff9a0 3475 acb->pool = pool;
ce1a14dc
PB
3476 }
3477 acb->bs = bs;
3478 acb->cb = cb;
3479 acb->opaque = opaque;
3480 return acb;
3481}
3482
3483void qemu_aio_release(void *p)
3484{
6bbff9a0
AL
3485 BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
3486 AIOPool *pool = acb->pool;
3487 acb->next = pool->free_aiocb;
3488 pool->free_aiocb = acb;
ce1a14dc 3489}
19cb3738 3490
f9f05dc5
KW
3491/**************************************************************/
3492/* Coroutine block device emulation */
3493
3494typedef struct CoroutineIOCompletion {
3495 Coroutine *coroutine;
3496 int ret;
3497} CoroutineIOCompletion;
3498
3499static void bdrv_co_io_em_complete(void *opaque, int ret)
3500{
3501 CoroutineIOCompletion *co = opaque;
3502
3503 co->ret = ret;
3504 qemu_coroutine_enter(co->coroutine, NULL);
3505}
3506
3507static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
3508 int nb_sectors, QEMUIOVector *iov,
3509 bool is_write)
3510{
3511 CoroutineIOCompletion co = {
3512 .coroutine = qemu_coroutine_self(),
3513 };
3514 BlockDriverAIOCB *acb;
3515
3516 if (is_write) {
a652d160
SH
3517 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
3518 bdrv_co_io_em_complete, &co);
f9f05dc5 3519 } else {
a652d160
SH
3520 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
3521 bdrv_co_io_em_complete, &co);
f9f05dc5
KW
3522 }
3523
59370aaa 3524 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
f9f05dc5
KW
3525 if (!acb) {
3526 return -EIO;
3527 }
3528 qemu_coroutine_yield();
3529
3530 return co.ret;
3531}
3532
3533static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
3534 int64_t sector_num, int nb_sectors,
3535 QEMUIOVector *iov)
3536{
3537 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
3538}
3539
3540static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
3541 int64_t sector_num, int nb_sectors,
3542 QEMUIOVector *iov)
3543{
3544 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
3545}
3546
07f07615 3547static void coroutine_fn bdrv_flush_co_entry(void *opaque)
e7a8a783 3548{
07f07615
PB
3549 RwCo *rwco = opaque;
3550
3551 rwco->ret = bdrv_co_flush(rwco->bs);
3552}
3553
3554int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
3555{
eb489bb1
KW
3556 int ret;
3557
29cdb251 3558 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
07f07615 3559 return 0;
eb489bb1
KW
3560 }
3561
ca716364 3562 /* Write back cached data to the OS even with cache=unsafe */
eb489bb1
KW
3563 if (bs->drv->bdrv_co_flush_to_os) {
3564 ret = bs->drv->bdrv_co_flush_to_os(bs);
3565 if (ret < 0) {
3566 return ret;
3567 }
3568 }
3569
ca716364
KW
3570 /* But don't actually force it to the disk with cache=unsafe */
3571 if (bs->open_flags & BDRV_O_NO_FLUSH) {
3572 return 0;
3573 }
3574
eb489bb1 3575 if (bs->drv->bdrv_co_flush_to_disk) {
29cdb251 3576 ret = bs->drv->bdrv_co_flush_to_disk(bs);
07f07615
PB
3577 } else if (bs->drv->bdrv_aio_flush) {
3578 BlockDriverAIOCB *acb;
3579 CoroutineIOCompletion co = {
3580 .coroutine = qemu_coroutine_self(),
3581 };
3582
3583 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
3584 if (acb == NULL) {
29cdb251 3585 ret = -EIO;
07f07615
PB
3586 } else {
3587 qemu_coroutine_yield();
29cdb251 3588 ret = co.ret;
07f07615 3589 }
07f07615
PB
3590 } else {
3591 /*
3592 * Some block drivers always operate in either writethrough or unsafe
3593 * mode and don't support bdrv_flush therefore. Usually qemu doesn't
3594 * know how the server works (because the behaviour is hardcoded or
3595 * depends on server-side configuration), so we can't ensure that
3596 * everything is safe on disk. Returning an error doesn't work because
3597 * that would break guests even if the server operates in writethrough
3598 * mode.
3599 *
3600 * Let's hope the user knows what he's doing.
3601 */
29cdb251 3602 ret = 0;
07f07615 3603 }
29cdb251
PB
3604 if (ret < 0) {
3605 return ret;
3606 }
3607
3608 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
3609 * in the case of cache=unsafe, so there are no useless flushes.
3610 */
3611 return bdrv_co_flush(bs->file);
07f07615
PB
3612}
3613
0f15423c
AL
3614void bdrv_invalidate_cache(BlockDriverState *bs)
3615{
3616 if (bs->drv && bs->drv->bdrv_invalidate_cache) {
3617 bs->drv->bdrv_invalidate_cache(bs);
3618 }
3619}
3620
3621void bdrv_invalidate_cache_all(void)
3622{
3623 BlockDriverState *bs;
3624
3625 QTAILQ_FOREACH(bs, &bdrv_states, list) {
3626 bdrv_invalidate_cache(bs);
3627 }
3628}
3629
07789269
BC
3630void bdrv_clear_incoming_migration_all(void)
3631{
3632 BlockDriverState *bs;
3633
3634 QTAILQ_FOREACH(bs, &bdrv_states, list) {
3635 bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
3636 }
3637}
3638
07f07615
PB
3639int bdrv_flush(BlockDriverState *bs)
3640{
3641 Coroutine *co;
3642 RwCo rwco = {
3643 .bs = bs,
3644 .ret = NOT_DONE,
e7a8a783 3645 };
e7a8a783 3646
07f07615
PB
3647 if (qemu_in_coroutine()) {
3648 /* Fast-path if already in coroutine context */
3649 bdrv_flush_co_entry(&rwco);
3650 } else {
3651 co = qemu_coroutine_create(bdrv_flush_co_entry);
3652 qemu_coroutine_enter(co, &rwco);
3653 while (rwco.ret == NOT_DONE) {
3654 qemu_aio_wait();
3655 }
e7a8a783 3656 }
07f07615
PB
3657
3658 return rwco.ret;
e7a8a783
KW
3659}
3660
4265d620
PB
3661static void coroutine_fn bdrv_discard_co_entry(void *opaque)
3662{
3663 RwCo *rwco = opaque;
3664
3665 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
3666}
3667
3668int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
3669 int nb_sectors)
3670{
3671 if (!bs->drv) {
3672 return -ENOMEDIUM;
3673 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
3674 return -EIO;
3675 } else if (bs->read_only) {
3676 return -EROFS;
3677 } else if (bs->drv->bdrv_co_discard) {
3678 return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
3679 } else if (bs->drv->bdrv_aio_discard) {
3680 BlockDriverAIOCB *acb;
3681 CoroutineIOCompletion co = {
3682 .coroutine = qemu_coroutine_self(),
3683 };
3684
3685 acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
3686 bdrv_co_io_em_complete, &co);
3687 if (acb == NULL) {
3688 return -EIO;
3689 } else {
3690 qemu_coroutine_yield();
3691 return co.ret;
3692 }
4265d620
PB
3693 } else {
3694 return 0;
3695 }
3696}
3697
3698int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
3699{
3700 Coroutine *co;
3701 RwCo rwco = {
3702 .bs = bs,
3703 .sector_num = sector_num,
3704 .nb_sectors = nb_sectors,
3705 .ret = NOT_DONE,
3706 };
3707
3708 if (qemu_in_coroutine()) {
3709 /* Fast-path if already in coroutine context */
3710 bdrv_discard_co_entry(&rwco);
3711 } else {
3712 co = qemu_coroutine_create(bdrv_discard_co_entry);
3713 qemu_coroutine_enter(co, &rwco);
3714 while (rwco.ret == NOT_DONE) {
3715 qemu_aio_wait();
3716 }
3717 }
3718
3719 return rwco.ret;
3720}
3721
19cb3738
FB
3722/**************************************************************/
3723/* removable device support */
3724
3725/**
3726 * Return TRUE if the media is present
3727 */
3728int bdrv_is_inserted(BlockDriverState *bs)
3729{
3730 BlockDriver *drv = bs->drv;
a1aff5bf 3731
19cb3738
FB
3732 if (!drv)
3733 return 0;
3734 if (!drv->bdrv_is_inserted)
a1aff5bf
MA
3735 return 1;
3736 return drv->bdrv_is_inserted(bs);
19cb3738
FB
3737}
3738
3739/**
8e49ca46
MA
3740 * Return whether the media changed since the last call to this
3741 * function, or -ENOTSUP if we don't know. Most drivers don't know.
19cb3738
FB
3742 */
3743int bdrv_media_changed(BlockDriverState *bs)
3744{
3745 BlockDriver *drv = bs->drv;
19cb3738 3746
8e49ca46
MA
3747 if (drv && drv->bdrv_media_changed) {
3748 return drv->bdrv_media_changed(bs);
3749 }
3750 return -ENOTSUP;
19cb3738
FB
3751}
3752
3753/**
3754 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
3755 */
f36f3949 3756void bdrv_eject(BlockDriverState *bs, bool eject_flag)
19cb3738
FB
3757{
3758 BlockDriver *drv = bs->drv;
19cb3738 3759
822e1cd1
MA
3760 if (drv && drv->bdrv_eject) {
3761 drv->bdrv_eject(bs, eject_flag);
19cb3738 3762 }
6f382ed2
LC
3763
3764 if (bs->device_name[0] != '\0') {
3765 bdrv_emit_qmp_eject_event(bs, eject_flag);
3766 }
19cb3738
FB
3767}
3768
19cb3738
FB
3769/**
3770 * Lock or unlock the media (if it is locked, the user won't be able
3771 * to eject it manually).
3772 */
025e849a 3773void bdrv_lock_medium(BlockDriverState *bs, bool locked)
19cb3738
FB
3774{
3775 BlockDriver *drv = bs->drv;
3776
025e849a 3777 trace_bdrv_lock_medium(bs, locked);
b8c6d095 3778
025e849a
MA
3779 if (drv && drv->bdrv_lock_medium) {
3780 drv->bdrv_lock_medium(bs, locked);
19cb3738
FB
3781 }
3782}
985a03b0
TS
3783
3784/* needed for generic scsi interface */
3785
3786int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
3787{
3788 BlockDriver *drv = bs->drv;
3789
3790 if (drv && drv->bdrv_ioctl)
3791 return drv->bdrv_ioctl(bs, req, buf);
3792 return -ENOTSUP;
3793}
7d780669 3794
221f715d
AL
3795BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
3796 unsigned long int req, void *buf,
3797 BlockDriverCompletionFunc *cb, void *opaque)
7d780669 3798{
221f715d 3799 BlockDriver *drv = bs->drv;
7d780669 3800
221f715d
AL
3801 if (drv && drv->bdrv_aio_ioctl)
3802 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
3803 return NULL;
7d780669 3804}
e268ca52 3805
7b6f9300
MA
3806void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
3807{
3808 bs->buffer_alignment = align;
3809}
7cd1e32a 3810
e268ca52
AL
3811void *qemu_blockalign(BlockDriverState *bs, size_t size)
3812{
3813 return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
3814}
7cd1e32a 3815
3816void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
3817{
3818 int64_t bitmap_size;
a55eb92c 3819
aaa0eb75 3820 bs->dirty_count = 0;
a55eb92c 3821 if (enable) {
c6d22830
JK
3822 if (!bs->dirty_bitmap) {
3823 bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
3824 BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
3825 bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
a55eb92c 3826
7267c094 3827 bs->dirty_bitmap = g_malloc0(bitmap_size);
a55eb92c 3828 }
7cd1e32a 3829 } else {
c6d22830 3830 if (bs->dirty_bitmap) {
7267c094 3831 g_free(bs->dirty_bitmap);
c6d22830 3832 bs->dirty_bitmap = NULL;
a55eb92c 3833 }
7cd1e32a 3834 }
3835}
3836
3837int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
3838{
6ea44308 3839 int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
a55eb92c 3840
c6d22830
JK
3841 if (bs->dirty_bitmap &&
3842 (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
6d59fec1
MT
3843 return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
3844 (1UL << (chunk % (sizeof(unsigned long) * 8))));
7cd1e32a 3845 } else {
3846 return 0;
3847 }
3848}
3849
a55eb92c
JK
3850void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
3851 int nr_sectors)
7cd1e32a 3852{
3853 set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
3854}
aaa0eb75
LS
3855
3856int64_t bdrv_get_dirty_count(BlockDriverState *bs)
3857{
3858 return bs->dirty_count;
3859}
f88e1a42 3860
db593f25
MT
3861void bdrv_set_in_use(BlockDriverState *bs, int in_use)
3862{
3863 assert(bs->in_use != in_use);
3864 bs->in_use = in_use;
3865}
3866
3867int bdrv_in_use(BlockDriverState *bs)
3868{
3869 return bs->in_use;
3870}
3871
28a7282a
LC
3872void bdrv_iostatus_enable(BlockDriverState *bs)
3873{
d6bf279e 3874 bs->iostatus_enabled = true;
58e21ef5 3875 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
28a7282a
LC
3876}
3877
3878/* The I/O status is only enabled if the drive explicitly
3879 * enables it _and_ the VM is configured to stop on errors */
3880bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
3881{
d6bf279e 3882 return (bs->iostatus_enabled &&
28a7282a
LC
3883 (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
3884 bs->on_write_error == BLOCK_ERR_STOP_ANY ||
3885 bs->on_read_error == BLOCK_ERR_STOP_ANY));
3886}
3887
3888void bdrv_iostatus_disable(BlockDriverState *bs)
3889{
d6bf279e 3890 bs->iostatus_enabled = false;
28a7282a
LC
3891}
3892
3893void bdrv_iostatus_reset(BlockDriverState *bs)
3894{
3895 if (bdrv_iostatus_is_enabled(bs)) {
58e21ef5 3896 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
28a7282a
LC
3897 }
3898}
3899
3900/* XXX: Today this is set by device models because it makes the implementation
3901 quite simple. However, the block layer knows about the error, so it's
3902 possible to implement this without device models being involved */
3903void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
3904{
58e21ef5
LC
3905 if (bdrv_iostatus_is_enabled(bs) &&
3906 bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
28a7282a 3907 assert(error >= 0);
58e21ef5
LC
3908 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
3909 BLOCK_DEVICE_IO_STATUS_FAILED;
28a7282a
LC
3910 }
3911}
3912
a597e79c
CH
3913void
3914bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
3915 enum BlockAcctType type)
3916{
3917 assert(type < BDRV_MAX_IOTYPE);
3918
3919 cookie->bytes = bytes;
c488c7f6 3920 cookie->start_time_ns = get_clock();
a597e79c
CH
3921 cookie->type = type;
3922}
3923
3924void
3925bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
3926{
3927 assert(cookie->type < BDRV_MAX_IOTYPE);
3928
3929 bs->nr_bytes[cookie->type] += cookie->bytes;
3930 bs->nr_ops[cookie->type]++;
c488c7f6 3931 bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
a597e79c
CH
3932}
3933
f88e1a42
JS
3934int bdrv_img_create(const char *filename, const char *fmt,
3935 const char *base_filename, const char *base_fmt,
3936 char *options, uint64_t img_size, int flags)
3937{
3938 QEMUOptionParameter *param = NULL, *create_options = NULL;
d220894e 3939 QEMUOptionParameter *backing_fmt, *backing_file, *size;
f88e1a42
JS
3940 BlockDriverState *bs = NULL;
3941 BlockDriver *drv, *proto_drv;
96df67d1 3942 BlockDriver *backing_drv = NULL;
f88e1a42
JS
3943 int ret = 0;
3944
3945 /* Find driver and parse its options */
3946 drv = bdrv_find_format(fmt);
3947 if (!drv) {
3948 error_report("Unknown file format '%s'", fmt);
4f70f249 3949 ret = -EINVAL;
f88e1a42
JS
3950 goto out;
3951 }
3952
3953 proto_drv = bdrv_find_protocol(filename);
3954 if (!proto_drv) {
3955 error_report("Unknown protocol '%s'", filename);
4f70f249 3956 ret = -EINVAL;
f88e1a42
JS
3957 goto out;
3958 }
3959
3960 create_options = append_option_parameters(create_options,
3961 drv->create_options);
3962 create_options = append_option_parameters(create_options,
3963 proto_drv->create_options);
3964
3965 /* Create parameter list with default values */
3966 param = parse_option_parameters("", create_options, param);
3967
3968 set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
3969
3970 /* Parse -o options */
3971 if (options) {
3972 param = parse_option_parameters(options, create_options, param);
3973 if (param == NULL) {
3974 error_report("Invalid options for file format '%s'.", fmt);
4f70f249 3975 ret = -EINVAL;
f88e1a42
JS
3976 goto out;
3977 }
3978 }
3979
3980 if (base_filename) {
3981 if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
3982 base_filename)) {
3983 error_report("Backing file not supported for file format '%s'",
3984 fmt);
4f70f249 3985 ret = -EINVAL;
f88e1a42
JS
3986 goto out;
3987 }
3988 }
3989
3990 if (base_fmt) {
3991 if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
3992 error_report("Backing file format not supported for file "
3993 "format '%s'", fmt);
4f70f249 3994 ret = -EINVAL;
f88e1a42
JS
3995 goto out;
3996 }
3997 }
3998
792da93a
JS
3999 backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
4000 if (backing_file && backing_file->value.s) {
4001 if (!strcmp(filename, backing_file->value.s)) {
4002 error_report("Error: Trying to create an image with the "
4003 "same filename as the backing file");
4f70f249 4004 ret = -EINVAL;
792da93a
JS
4005 goto out;
4006 }
4007 }
4008
f88e1a42
JS
4009 backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
4010 if (backing_fmt && backing_fmt->value.s) {
96df67d1
SH
4011 backing_drv = bdrv_find_format(backing_fmt->value.s);
4012 if (!backing_drv) {
f88e1a42
JS
4013 error_report("Unknown backing file format '%s'",
4014 backing_fmt->value.s);
4f70f249 4015 ret = -EINVAL;
f88e1a42
JS
4016 goto out;
4017 }
4018 }
4019
4020 // The size for the image must always be specified, with one exception:
4021 // If we are using a backing file, we can obtain the size from there
d220894e
KW
4022 size = get_option_parameter(param, BLOCK_OPT_SIZE);
4023 if (size && size->value.n == -1) {
f88e1a42
JS
4024 if (backing_file && backing_file->value.s) {
4025 uint64_t size;
f88e1a42
JS
4026 char buf[32];
4027
f88e1a42
JS
4028 bs = bdrv_new("");
4029
96df67d1 4030 ret = bdrv_open(bs, backing_file->value.s, flags, backing_drv);
f88e1a42 4031 if (ret < 0) {
96df67d1 4032 error_report("Could not open '%s'", backing_file->value.s);
f88e1a42
JS
4033 goto out;
4034 }
4035 bdrv_get_geometry(bs, &size);
4036 size *= 512;
4037
4038 snprintf(buf, sizeof(buf), "%" PRId64, size);
4039 set_option_parameter(param, BLOCK_OPT_SIZE, buf);
4040 } else {
4041 error_report("Image creation needs a size parameter");
4f70f249 4042 ret = -EINVAL;
f88e1a42
JS
4043 goto out;
4044 }
4045 }
4046
4047 printf("Formatting '%s', fmt=%s ", filename, fmt);
4048 print_option_parameters(param);
4049 puts("");
4050
4051 ret = bdrv_create(drv, filename, param);
4052
4053 if (ret < 0) {
4054 if (ret == -ENOTSUP) {
4055 error_report("Formatting or formatting option not supported for "
4056 "file format '%s'", fmt);
4057 } else if (ret == -EFBIG) {
4058 error_report("The image size is too large for file format '%s'",
4059 fmt);
4060 } else {
4061 error_report("%s: error while creating %s: %s", filename, fmt,
4062 strerror(-ret));
4063 }
4064 }
4065
4066out:
4067 free_option_parameters(create_options);
4068 free_option_parameters(param);
4069
4070 if (bs) {
4071 bdrv_delete(bs);
4072 }
4f70f249
JS
4073
4074 return ret;
f88e1a42 4075}
eeec61f2
SH
4076
4077void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs,
4078 BlockDriverCompletionFunc *cb, void *opaque)
4079{
4080 BlockJob *job;
4081
4082 if (bs->job || bdrv_in_use(bs)) {
4083 return NULL;
4084 }
4085 bdrv_set_in_use(bs, 1);
4086
4087 job = g_malloc0(job_type->instance_size);
4088 job->job_type = job_type;
4089 job->bs = bs;
4090 job->cb = cb;
4091 job->opaque = opaque;
4092 bs->job = job;
4093 return job;
4094}
4095
4096void block_job_complete(BlockJob *job, int ret)
4097{
4098 BlockDriverState *bs = job->bs;
4099
4100 assert(bs->job == job);
4101 job->cb(job->opaque, ret);
4102 bs->job = NULL;
4103 g_free(job);
4104 bdrv_set_in_use(bs, 0);
4105}
4106
4107int block_job_set_speed(BlockJob *job, int64_t value)
4108{
9f25eccc
PB
4109 int rc;
4110
eeec61f2
SH
4111 if (!job->job_type->set_speed) {
4112 return -ENOTSUP;
4113 }
9f25eccc
PB
4114 rc = job->job_type->set_speed(job, value);
4115 if (rc == 0) {
4116 job->speed = value;
4117 }
4118 return rc;
eeec61f2
SH
4119}
4120
4121void block_job_cancel(BlockJob *job)
4122{
4123 job->cancelled = true;
4124}
4125
4126bool block_job_is_cancelled(BlockJob *job)
4127{
4128 return job->cancelled;
4129}
3e914655
PB
4130
4131void block_job_cancel_sync(BlockJob *job)
4132{
4133 BlockDriverState *bs = job->bs;
4134
4135 assert(bs->job == job);
4136 block_job_cancel(job);
4137 while (bs->job != NULL && bs->job->busy) {
4138 qemu_aio_wait();
4139 }
4140}