]> git.proxmox.com Git - mirror_qemu.git/blame - block.c
block: drop .bdrv_is_allocated() interface
[mirror_qemu.git] / block.c
CommitLineData
fc01f7e7
FB
1/*
2 * QEMU System Emulator block driver
5fafdf24 3 *
fc01f7e7 4 * Copyright (c) 2003 Fabrice Bellard
5fafdf24 5 *
fc01f7e7
FB
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
3990d09a 24#include "config-host.h"
faf07963 25#include "qemu-common.h"
6d519a5f 26#include "trace.h"
376253ec 27#include "monitor.h"
ea2384d3 28#include "block_int.h"
5efa9d5a 29#include "module.h"
f795e743 30#include "qjson.h"
68485420 31#include "qemu-coroutine.h"
b2023818 32#include "qmp-commands.h"
0563e191 33#include "qemu-timer.h"
fc01f7e7 34
71e72a19 35#ifdef CONFIG_BSD
7674e7bf
FB
36#include <sys/types.h>
37#include <sys/stat.h>
38#include <sys/ioctl.h>
72cf2d4f 39#include <sys/queue.h>
c5e97233 40#ifndef __DragonFly__
7674e7bf
FB
41#include <sys/disk.h>
42#endif
c5e97233 43#endif
7674e7bf 44
49dc768d
AL
45#ifdef _WIN32
46#include <windows.h>
47#endif
48
1c9805a3
SH
49#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
50
7d4b4ba5 51static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
f141eafe
AL
52static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
53 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
c87c0672 54 BlockDriverCompletionFunc *cb, void *opaque);
f141eafe
AL
55static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
56 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
ce1a14dc 57 BlockDriverCompletionFunc *cb, void *opaque);
f9f05dc5
KW
58static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
59 int64_t sector_num, int nb_sectors,
60 QEMUIOVector *iov);
61static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
62 int64_t sector_num, int nb_sectors,
63 QEMUIOVector *iov);
c5fbe571
SH
64static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
65 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
1c9805a3
SH
66static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
67 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
b2a61371
SH
68static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
69 int64_t sector_num,
70 QEMUIOVector *qiov,
71 int nb_sectors,
72 BlockDriverCompletionFunc *cb,
73 void *opaque,
8c5873d6 74 bool is_write);
b2a61371 75static void coroutine_fn bdrv_co_do_rw(void *opaque);
ec530c81 76
98f90dba
ZYW
77static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
78 bool is_write, double elapsed_time, uint64_t *wait);
79static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
80 double elapsed_time, uint64_t *wait);
81static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
82 bool is_write, int64_t *wait);
83
1b7bdbc1
SH
84static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
85 QTAILQ_HEAD_INITIALIZER(bdrv_states);
7ee930d0 86
8a22f02a
SH
87static QLIST_HEAD(, BlockDriver) bdrv_drivers =
88 QLIST_HEAD_INITIALIZER(bdrv_drivers);
ea2384d3 89
f9092b10
MA
90/* The device to use for VM snapshots */
91static BlockDriverState *bs_snapshots;
92
eb852011
MA
93/* If non-zero, use only whitelisted block drivers */
94static int use_bdrv_whitelist;
95
9e0b22f4
SH
96#ifdef _WIN32
/*
 * Return non-zero when filename begins with an ASCII letter immediately
 * followed by ':' (e.g. "c:"), zero otherwise.
 */
static int is_windows_drive_prefix(const char *filename)
{
    const char letter = filename[0];
    const int lower = (letter >= 'a' && letter <= 'z');
    const int upper = (letter >= 'A' && letter <= 'Z');

    if (!lower && !upper) {
        /* second character is deliberately not inspected here */
        return 0;
    }
    return filename[1] == ':';
}
103
/*
 * Return 1 when filename is exactly a drive spec ("x:") or starts with the
 * device prefix "\\.\" or "//./"; return 0 otherwise.
 */
int is_windows_drive(const char *filename)
{
    /* bare drive letter plus colon, nothing after it */
    if (is_windows_drive_prefix(filename) && filename[2] == '\0') {
        return 1;
    }

    if (strstart(filename, "\\\\.\\", NULL)) {
        return 1;
    }
    if (strstart(filename, "//./", NULL)) {
        return 1;
    }
    return 0;
}
114#endif
115
0563e191 116/* throttling disk I/O limits */
98f90dba
ZYW
117void bdrv_io_limits_disable(BlockDriverState *bs)
118{
119 bs->io_limits_enabled = false;
120
121 while (qemu_co_queue_next(&bs->throttled_reqs));
122
123 if (bs->block_timer) {
124 qemu_del_timer(bs->block_timer);
125 qemu_free_timer(bs->block_timer);
126 bs->block_timer = NULL;
127 }
128
129 bs->slice_start = 0;
130 bs->slice_end = 0;
131 bs->slice_time = 0;
132 memset(&bs->io_base, 0, sizeof(bs->io_base));
133}
134
0563e191
ZYW
135static void bdrv_block_timer(void *opaque)
136{
137 BlockDriverState *bs = opaque;
138
139 qemu_co_queue_next(&bs->throttled_reqs);
140}
141
142void bdrv_io_limits_enable(BlockDriverState *bs)
143{
144 qemu_co_queue_init(&bs->throttled_reqs);
145 bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
146 bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
147 bs->slice_start = qemu_get_clock_ns(vm_clock);
148 bs->slice_end = bs->slice_start + bs->slice_time;
149 memset(&bs->io_base, 0, sizeof(bs->io_base));
150 bs->io_limits_enabled = true;
151}
152
153bool bdrv_io_limits_enabled(BlockDriverState *bs)
154{
155 BlockIOLimit *io_limits = &bs->io_limits;
156 return io_limits->bps[BLOCK_IO_LIMIT_READ]
157 || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
158 || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
159 || io_limits->iops[BLOCK_IO_LIMIT_READ]
160 || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
161 || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
162}
163
98f90dba
ZYW
164static void bdrv_io_limits_intercept(BlockDriverState *bs,
165 bool is_write, int nb_sectors)
166{
167 int64_t wait_time = -1;
168
169 if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
170 qemu_co_queue_wait(&bs->throttled_reqs);
171 }
172
173 /* In fact, we hope to keep each request's timing, in FIFO mode. The next
174 * throttled requests will not be dequeued until the current request is
175 * allowed to be serviced. So if the current request still exceeds the
176 * limits, it will be inserted to the head. All requests followed it will
177 * be still in throttled_reqs queue.
178 */
179
180 while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
181 qemu_mod_timer(bs->block_timer,
182 wait_time + qemu_get_clock_ns(vm_clock));
183 qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
184 }
185
186 qemu_co_queue_next(&bs->throttled_reqs);
187}
188
9e0b22f4
SH
/*
 * Check whether path carries a "<protocol>:" prefix.  The test is simply
 * whether the path contains a ':'; on Windows, drive specs and device
 * paths are excluded first.  Returns 1 or 0.
 */
static int path_has_protocol(const char *path)
{
#ifdef _WIN32
    /* "c:" style names contain a colon but name no protocol */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 0;
    }
#endif

    if (strchr(path, ':') == NULL) {
        return 0;
    }
    return 1;
}
201
/*
 * Return non-zero when path is absolute.  Anything up to and including the
 * first ':' is skipped; the path is absolute when the next character is
 * '/' (or '\\' on Windows).
 */
int path_is_absolute(const char *path)
{
    const char *colon;
    const char *rest;

#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (*path == '/' || *path == '\\') {
        return 1;
    }
#endif

    colon = strchr(path, ':');
    rest = colon ? colon + 1 : path;

#ifdef _WIN32
    return (*rest == '/' || *rest == '\\');
#else
    return (*rest == '/');
#endif
}
221
83f64091
FB
/*
 * Build into dest (at most dest_size bytes, always NUL-terminated when
 * dest_size > 0) the location of filename relative to base_path.
 *
 * If filename is absolute it is copied unchanged.  Otherwise the prefix of
 * base_path up to and including its last path separator (or its first ':'
 * if that comes later) is copied and filename is appended.  URLs are
 * supported.
 */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *after_colon, *after_sep;
    int prefix_len;

    if (dest_size <= 0) {
        return;
    }

    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
        return;
    }

    /* position just past the first ':' of base_path, if any */
    after_colon = strchr(base_path, ':');
    after_colon = after_colon ? after_colon + 1 : base_path;

    /* position just past the last directory separator, if any */
    after_sep = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *backslash = strrchr(base_path, '\\');
        if (!after_sep || backslash > after_sep) {
            after_sep = backslash;
        }
    }
#endif
    after_sep = after_sep ? after_sep + 1 : base_path;

    /* keep whichever prefix is longer */
    if (after_sep > after_colon) {
        after_colon = after_sep;
    }

    prefix_len = after_colon - base_path;
    if (prefix_len > dest_size - 1) {
        prefix_len = dest_size - 1;
    }
    memcpy(dest, base_path, prefix_len);
    dest[prefix_len] = '\0';
    pstrcat(dest, dest_size, filename);
}
265
5efa9d5a 266void bdrv_register(BlockDriver *bdrv)
ea2384d3 267{
8c5873d6
SH
268 /* Block drivers without coroutine functions need emulation */
269 if (!bdrv->bdrv_co_readv) {
f9f05dc5
KW
270 bdrv->bdrv_co_readv = bdrv_co_readv_em;
271 bdrv->bdrv_co_writev = bdrv_co_writev_em;
272
f8c35c1d
SH
273 /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
274 * the block driver lacks aio we need to emulate that too.
275 */
f9f05dc5
KW
276 if (!bdrv->bdrv_aio_readv) {
277 /* add AIO emulation layer */
278 bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
279 bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
f9f05dc5 280 }
83f64091 281 }
b2e12bc6 282
8a22f02a 283 QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
ea2384d3 284}
b338082b
FB
285
286/* create a new block device (by default it is empty) */
287BlockDriverState *bdrv_new(const char *device_name)
288{
1b7bdbc1 289 BlockDriverState *bs;
b338082b 290
7267c094 291 bs = g_malloc0(sizeof(BlockDriverState));
b338082b 292 pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
ea2384d3 293 if (device_name[0] != '\0') {
1b7bdbc1 294 QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
ea2384d3 295 }
28a7282a 296 bdrv_iostatus_disable(bs);
b338082b
FB
297 return bs;
298}
299
ea2384d3
FB
300BlockDriver *bdrv_find_format(const char *format_name)
301{
302 BlockDriver *drv1;
8a22f02a
SH
303 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
304 if (!strcmp(drv1->format_name, format_name)) {
ea2384d3 305 return drv1;
8a22f02a 306 }
ea2384d3
FB
307 }
308 return NULL;
309}
310
eb852011
MA
311static int bdrv_is_whitelisted(BlockDriver *drv)
312{
313 static const char *whitelist[] = {
314 CONFIG_BDRV_WHITELIST
315 };
316 const char **p;
317
318 if (!whitelist[0])
319 return 1; /* no whitelist, anything goes */
320
321 for (p = whitelist; *p; p++) {
322 if (!strcmp(drv->format_name, *p)) {
323 return 1;
324 }
325 }
326 return 0;
327}
328
329BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
330{
331 BlockDriver *drv = bdrv_find_format(format_name);
332 return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
333}
334
0e7e1989
KW
335int bdrv_create(BlockDriver *drv, const char* filename,
336 QEMUOptionParameter *options)
ea2384d3
FB
337{
338 if (!drv->bdrv_create)
339 return -ENOTSUP;
0e7e1989
KW
340
341 return drv->bdrv_create(filename, options);
ea2384d3
FB
342}
343
84a12e66
CH
344int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
345{
346 BlockDriver *drv;
347
b50cbabc 348 drv = bdrv_find_protocol(filename);
84a12e66 349 if (drv == NULL) {
16905d71 350 return -ENOENT;
84a12e66
CH
351 }
352
353 return bdrv_create(drv, filename, options);
354}
355
/*
 * Store the name of a freshly created temporary file in filename.
 *
 * POSIX: the name is "$TMPDIR/vl.XXXXXX" (or under /tmp when TMPDIR is
 * unset), formatted into at most size bytes and then instantiated by
 * mkstemp(); the descriptor is closed again right away.
 *
 * Windows: the name comes from GetTempPath()/GetTempFileName().
 * NOTE(review): size is not consulted on this path -- confirm that callers
 * always pass a buffer of at least MAX_PATH bytes.
 *
 * The function cannot report failure; on error the buffer content is
 * whatever the underlying calls left behind.
 */
#ifdef _WIN32
void get_tmp_filename(char *filename, int size)
{
    char temp_dir[MAX_PATH];

    GetTempPath(MAX_PATH, temp_dir);
    GetTempFileName(temp_dir, "qem", 0, filename);
}
#else
void get_tmp_filename(char *filename, int size)
{
    int fd;
    const char *tmpdir;

    /* XXX: race condition possible */
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/tmp";
    }
    snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);

    fd = mkstemp(filename);
    if (fd >= 0) {
        /* only a valid descriptor may be handed to close() */
        close(fd);
    }
}
#endif
fc01f7e7 378
84a12e66
CH
379/*
380 * Detect host devices. By convention, /dev/cdrom[N] is always
381 * recognized as a host CDROM.
382 */
383static BlockDriver *find_hdev_driver(const char *filename)
384{
385 int score_max = 0, score;
386 BlockDriver *drv = NULL, *d;
387
388 QLIST_FOREACH(d, &bdrv_drivers, list) {
389 if (d->bdrv_probe_device) {
390 score = d->bdrv_probe_device(filename);
391 if (score > score_max) {
392 score_max = score;
393 drv = d;
394 }
395 }
396 }
397
398 return drv;
399}
400
b50cbabc 401BlockDriver *bdrv_find_protocol(const char *filename)
83f64091
FB
402{
403 BlockDriver *drv1;
404 char protocol[128];
1cec71e3 405 int len;
83f64091 406 const char *p;
19cb3738 407
66f82cee
KW
408 /* TODO Drivers without bdrv_file_open must be specified explicitly */
409
39508e7a
CH
410 /*
411 * XXX(hch): we really should not let host device detection
412 * override an explicit protocol specification, but moving this
413 * later breaks access to device names with colons in them.
414 * Thanks to the brain-dead persistent naming schemes on udev-
415 * based Linux systems those actually are quite common.
416 */
417 drv1 = find_hdev_driver(filename);
418 if (drv1) {
419 return drv1;
420 }
421
9e0b22f4 422 if (!path_has_protocol(filename)) {
39508e7a 423 return bdrv_find_format("file");
84a12e66 424 }
9e0b22f4
SH
425 p = strchr(filename, ':');
426 assert(p != NULL);
1cec71e3
AL
427 len = p - filename;
428 if (len > sizeof(protocol) - 1)
429 len = sizeof(protocol) - 1;
430 memcpy(protocol, filename, len);
431 protocol[len] = '\0';
8a22f02a 432 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
5fafdf24 433 if (drv1->protocol_name &&
8a22f02a 434 !strcmp(drv1->protocol_name, protocol)) {
83f64091 435 return drv1;
8a22f02a 436 }
83f64091
FB
437 }
438 return NULL;
439}
440
c98ac35d 441static int find_image_format(const char *filename, BlockDriver **pdrv)
f3a5d3f8
CH
442{
443 int ret, score, score_max;
444 BlockDriver *drv1, *drv;
445 uint8_t buf[2048];
446 BlockDriverState *bs;
447
f5edb014 448 ret = bdrv_file_open(&bs, filename, 0);
c98ac35d
SW
449 if (ret < 0) {
450 *pdrv = NULL;
451 return ret;
452 }
f8ea0b00 453
08a00559
KW
454 /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
455 if (bs->sg || !bdrv_is_inserted(bs)) {
1a396859 456 bdrv_delete(bs);
c98ac35d
SW
457 drv = bdrv_find_format("raw");
458 if (!drv) {
459 ret = -ENOENT;
460 }
461 *pdrv = drv;
462 return ret;
1a396859 463 }
f8ea0b00 464
83f64091
FB
465 ret = bdrv_pread(bs, 0, buf, sizeof(buf));
466 bdrv_delete(bs);
467 if (ret < 0) {
c98ac35d
SW
468 *pdrv = NULL;
469 return ret;
83f64091
FB
470 }
471
ea2384d3 472 score_max = 0;
84a12e66 473 drv = NULL;
8a22f02a 474 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
83f64091
FB
475 if (drv1->bdrv_probe) {
476 score = drv1->bdrv_probe(buf, ret, filename);
477 if (score > score_max) {
478 score_max = score;
479 drv = drv1;
480 }
0849bf08 481 }
fc01f7e7 482 }
c98ac35d
SW
483 if (!drv) {
484 ret = -ENOENT;
485 }
486 *pdrv = drv;
487 return ret;
ea2384d3
FB
488}
489
51762288
SH
490/**
491 * Set the current 'total_sectors' value
492 */
493static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
494{
495 BlockDriver *drv = bs->drv;
496
396759ad
NB
497 /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
498 if (bs->sg)
499 return 0;
500
51762288
SH
501 /* query actual device if possible, otherwise just trust the hint */
502 if (drv->bdrv_getlength) {
503 int64_t length = drv->bdrv_getlength(bs);
504 if (length < 0) {
505 return length;
506 }
507 hint = length >> BDRV_SECTOR_BITS;
508 }
509
510 bs->total_sectors = hint;
511 return 0;
512}
513
c3993cdc
SH
514/**
515 * Set open flags for a given cache mode
516 *
517 * Return 0 on success, -1 if the cache mode was invalid.
518 */
519int bdrv_parse_cache_flags(const char *mode, int *flags)
520{
521 *flags &= ~BDRV_O_CACHE_MASK;
522
523 if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
524 *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
92196b2f
SH
525 } else if (!strcmp(mode, "directsync")) {
526 *flags |= BDRV_O_NOCACHE;
c3993cdc
SH
527 } else if (!strcmp(mode, "writeback")) {
528 *flags |= BDRV_O_CACHE_WB;
529 } else if (!strcmp(mode, "unsafe")) {
530 *flags |= BDRV_O_CACHE_WB;
531 *flags |= BDRV_O_NO_FLUSH;
532 } else if (!strcmp(mode, "writethrough")) {
533 /* this is the default */
534 } else {
535 return -1;
536 }
537
538 return 0;
539}
540
57915332
KW
541/*
542 * Common part for opening disk images and files
543 */
544static int bdrv_open_common(BlockDriverState *bs, const char *filename,
545 int flags, BlockDriver *drv)
546{
547 int ret, open_flags;
548
549 assert(drv != NULL);
550
28dcee10
SH
551 trace_bdrv_open_common(bs, filename, flags, drv->format_name);
552
66f82cee 553 bs->file = NULL;
51762288 554 bs->total_sectors = 0;
57915332
KW
555 bs->encrypted = 0;
556 bs->valid_key = 0;
03f541bd 557 bs->sg = 0;
57915332 558 bs->open_flags = flags;
03f541bd 559 bs->growable = 0;
57915332
KW
560 bs->buffer_alignment = 512;
561
562 pstrcpy(bs->filename, sizeof(bs->filename), filename);
03f541bd 563 bs->backing_file[0] = '\0';
57915332
KW
564
565 if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
566 return -ENOTSUP;
567 }
568
569 bs->drv = drv;
7267c094 570 bs->opaque = g_malloc0(drv->instance_size);
57915332 571
03f541bd 572 bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
57915332
KW
573
574 /*
575 * Clear flags that are internal to the block layer before opening the
576 * image.
577 */
578 open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
579
580 /*
ebabb67a 581 * Snapshots should be writable.
57915332
KW
582 */
583 if (bs->is_temporary) {
584 open_flags |= BDRV_O_RDWR;
585 }
586
e7c63796
SH
587 bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);
588
66f82cee
KW
589 /* Open the image, either directly or using a protocol */
590 if (drv->bdrv_file_open) {
591 ret = drv->bdrv_file_open(bs, filename, open_flags);
592 } else {
593 ret = bdrv_file_open(&bs->file, filename, open_flags);
594 if (ret >= 0) {
595 ret = drv->bdrv_open(bs, open_flags);
596 }
597 }
598
57915332
KW
599 if (ret < 0) {
600 goto free_and_fail;
601 }
602
51762288
SH
603 ret = refresh_total_sectors(bs, bs->total_sectors);
604 if (ret < 0) {
605 goto free_and_fail;
57915332 606 }
51762288 607
57915332
KW
608#ifndef _WIN32
609 if (bs->is_temporary) {
610 unlink(filename);
611 }
612#endif
613 return 0;
614
615free_and_fail:
66f82cee
KW
616 if (bs->file) {
617 bdrv_delete(bs->file);
618 bs->file = NULL;
619 }
7267c094 620 g_free(bs->opaque);
57915332
KW
621 bs->opaque = NULL;
622 bs->drv = NULL;
623 return ret;
624}
625
b6ce07aa
KW
626/*
627 * Opens a file using a protocol (file, host_device, nbd, ...)
628 */
83f64091 629int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
ea2384d3 630{
83f64091 631 BlockDriverState *bs;
6db95603 632 BlockDriver *drv;
83f64091
FB
633 int ret;
634
b50cbabc 635 drv = bdrv_find_protocol(filename);
6db95603
CH
636 if (!drv) {
637 return -ENOENT;
638 }
639
83f64091 640 bs = bdrv_new("");
b6ce07aa 641 ret = bdrv_open_common(bs, filename, flags, drv);
83f64091
FB
642 if (ret < 0) {
643 bdrv_delete(bs);
644 return ret;
3b0d4f61 645 }
71d0770c 646 bs->growable = 1;
83f64091
FB
647 *pbs = bs;
648 return 0;
649}
650
b6ce07aa
KW
651/*
652 * Opens a disk image (raw, qcow2, vmdk, ...)
653 */
d6e9098e
KW
654int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
655 BlockDriver *drv)
ea2384d3 656{
b6ce07aa 657 int ret;
2b572816 658 char tmp_filename[PATH_MAX];
712e7874 659
83f64091 660 if (flags & BDRV_O_SNAPSHOT) {
ea2384d3
FB
661 BlockDriverState *bs1;
662 int64_t total_size;
7c96d46e 663 int is_protocol = 0;
91a073a9
KW
664 BlockDriver *bdrv_qcow2;
665 QEMUOptionParameter *options;
b6ce07aa 666 char backing_filename[PATH_MAX];
3b46e624 667
ea2384d3
FB
668 /* if snapshot, we create a temporary backing file and open it
669 instead of opening 'filename' directly */
33e3963e 670
ea2384d3
FB
671 /* if there is a backing file, use it */
672 bs1 = bdrv_new("");
d6e9098e 673 ret = bdrv_open(bs1, filename, 0, drv);
51d7c00c 674 if (ret < 0) {
ea2384d3 675 bdrv_delete(bs1);
51d7c00c 676 return ret;
ea2384d3 677 }
3e82990b 678 total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
7c96d46e
AL
679
680 if (bs1->drv && bs1->drv->protocol_name)
681 is_protocol = 1;
682
ea2384d3 683 bdrv_delete(bs1);
3b46e624 684
ea2384d3 685 get_tmp_filename(tmp_filename, sizeof(tmp_filename));
7c96d46e
AL
686
687 /* Real path is meaningless for protocols */
688 if (is_protocol)
689 snprintf(backing_filename, sizeof(backing_filename),
690 "%s", filename);
114cdfa9
KS
691 else if (!realpath(filename, backing_filename))
692 return -errno;
7c96d46e 693
91a073a9
KW
694 bdrv_qcow2 = bdrv_find_format("qcow2");
695 options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);
696
3e82990b 697 set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
91a073a9
KW
698 set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
699 if (drv) {
700 set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
701 drv->format_name);
702 }
703
704 ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
d748768c 705 free_option_parameters(options);
51d7c00c
AL
706 if (ret < 0) {
707 return ret;
ea2384d3 708 }
91a073a9 709
ea2384d3 710 filename = tmp_filename;
91a073a9 711 drv = bdrv_qcow2;
ea2384d3
FB
712 bs->is_temporary = 1;
713 }
712e7874 714
b6ce07aa 715 /* Find the right image format driver */
6db95603 716 if (!drv) {
c98ac35d 717 ret = find_image_format(filename, &drv);
51d7c00c 718 }
6987307c 719
51d7c00c 720 if (!drv) {
51d7c00c 721 goto unlink_and_fail;
ea2384d3 722 }
b6ce07aa
KW
723
724 /* Open the image */
725 ret = bdrv_open_common(bs, filename, flags, drv);
726 if (ret < 0) {
6987307c
CH
727 goto unlink_and_fail;
728 }
729
b6ce07aa
KW
730 /* If there is a backing file, use it */
731 if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
732 char backing_filename[PATH_MAX];
733 int back_flags;
734 BlockDriver *back_drv = NULL;
735
736 bs->backing_hd = bdrv_new("");
df2dbb4a
SH
737
738 if (path_has_protocol(bs->backing_file)) {
739 pstrcpy(backing_filename, sizeof(backing_filename),
740 bs->backing_file);
741 } else {
742 path_combine(backing_filename, sizeof(backing_filename),
743 filename, bs->backing_file);
744 }
745
746 if (bs->backing_format[0] != '\0') {
b6ce07aa 747 back_drv = bdrv_find_format(bs->backing_format);
df2dbb4a 748 }
b6ce07aa
KW
749
750 /* backing files always opened read-only */
751 back_flags =
752 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
753
754 ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
755 if (ret < 0) {
756 bdrv_close(bs);
757 return ret;
758 }
759 if (bs->is_temporary) {
760 bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
761 } else {
762 /* base image inherits from "parent" */
763 bs->backing_hd->keep_read_only = bs->keep_read_only;
764 }
765 }
766
767 if (!bdrv_key_required(bs)) {
7d4b4ba5 768 bdrv_dev_change_media_cb(bs, true);
b6ce07aa
KW
769 }
770
98f90dba
ZYW
771 /* throttling disk I/O limits */
772 if (bs->io_limits_enabled) {
773 bdrv_io_limits_enable(bs);
774 }
775
b6ce07aa
KW
776 return 0;
777
778unlink_and_fail:
779 if (bs->is_temporary) {
780 unlink(filename);
781 }
782 return ret;
783}
784
fc01f7e7
FB
785void bdrv_close(BlockDriverState *bs)
786{
19cb3738 787 if (bs->drv) {
f9092b10
MA
788 if (bs == bs_snapshots) {
789 bs_snapshots = NULL;
790 }
557df6ac 791 if (bs->backing_hd) {
ea2384d3 792 bdrv_delete(bs->backing_hd);
557df6ac
SH
793 bs->backing_hd = NULL;
794 }
ea2384d3 795 bs->drv->bdrv_close(bs);
7267c094 796 g_free(bs->opaque);
ea2384d3
FB
797#ifdef _WIN32
798 if (bs->is_temporary) {
799 unlink(bs->filename);
800 }
67b915a5 801#endif
ea2384d3
FB
802 bs->opaque = NULL;
803 bs->drv = NULL;
b338082b 804
66f82cee
KW
805 if (bs->file != NULL) {
806 bdrv_close(bs->file);
807 }
808
7d4b4ba5 809 bdrv_dev_change_media_cb(bs, false);
b338082b 810 }
98f90dba
ZYW
811
812 /*throttling disk I/O limits*/
813 if (bs->io_limits_enabled) {
814 bdrv_io_limits_disable(bs);
815 }
b338082b
FB
816}
817
2bc93fed
MK
818void bdrv_close_all(void)
819{
820 BlockDriverState *bs;
821
822 QTAILQ_FOREACH(bs, &bdrv_states, list) {
823 bdrv_close(bs);
824 }
825}
826
d22b2f41
RH
827/* make a BlockDriverState anonymous by removing from bdrv_state list.
828 Also, NULL terminate the device_name to prevent double remove */
829void bdrv_make_anon(BlockDriverState *bs)
830{
831 if (bs->device_name[0] != '\0') {
832 QTAILQ_REMOVE(&bdrv_states, bs, list);
833 }
834 bs->device_name[0] = '\0';
835}
836
b338082b
FB
837void bdrv_delete(BlockDriverState *bs)
838{
fa879d62 839 assert(!bs->dev);
18846dee 840
1b7bdbc1 841 /* remove from list, if necessary */
d22b2f41 842 bdrv_make_anon(bs);
34c6f050 843
b338082b 844 bdrv_close(bs);
66f82cee
KW
845 if (bs->file != NULL) {
846 bdrv_delete(bs->file);
847 }
848
f9092b10 849 assert(bs != bs_snapshots);
7267c094 850 g_free(bs);
fc01f7e7
FB
851}
852
fa879d62
MA
853int bdrv_attach_dev(BlockDriverState *bs, void *dev)
854/* TODO change to DeviceState *dev when all users are qdevified */
18846dee 855{
fa879d62 856 if (bs->dev) {
18846dee
MA
857 return -EBUSY;
858 }
fa879d62 859 bs->dev = dev;
28a7282a 860 bdrv_iostatus_reset(bs);
18846dee
MA
861 return 0;
862}
863
fa879d62
MA
864/* TODO qdevified devices don't use this, remove when devices are qdevified */
865void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
18846dee 866{
fa879d62
MA
867 if (bdrv_attach_dev(bs, dev) < 0) {
868 abort();
869 }
870}
871
872void bdrv_detach_dev(BlockDriverState *bs, void *dev)
873/* TODO change to DeviceState *dev when all users are qdevified */
874{
875 assert(bs->dev == dev);
876 bs->dev = NULL;
0e49de52
MA
877 bs->dev_ops = NULL;
878 bs->dev_opaque = NULL;
29e05f20 879 bs->buffer_alignment = 512;
18846dee
MA
880}
881
fa879d62
MA
882/* TODO change to return DeviceState * when all users are qdevified */
883void *bdrv_get_attached_dev(BlockDriverState *bs)
18846dee 884{
fa879d62 885 return bs->dev;
18846dee
MA
886}
887
0e49de52
MA
888void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
889 void *opaque)
890{
891 bs->dev_ops = ops;
892 bs->dev_opaque = opaque;
2c6942fa
MA
893 if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
894 bs_snapshots = NULL;
895 }
0e49de52
MA
896}
897
7d4b4ba5 898static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
0e49de52 899{
145feb17 900 if (bs->dev_ops && bs->dev_ops->change_media_cb) {
7d4b4ba5 901 bs->dev_ops->change_media_cb(bs->dev_opaque, load);
145feb17
MA
902 }
903}
904
2c6942fa
MA
905bool bdrv_dev_has_removable_media(BlockDriverState *bs)
906{
907 return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
908}
909
025ccaa7
PB
910void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
911{
912 if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
913 bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
914 }
915}
916
e4def80b
MA
917bool bdrv_dev_is_tray_open(BlockDriverState *bs)
918{
919 if (bs->dev_ops && bs->dev_ops->is_tray_open) {
920 return bs->dev_ops->is_tray_open(bs->dev_opaque);
921 }
922 return false;
923}
924
145feb17
MA
925static void bdrv_dev_resize_cb(BlockDriverState *bs)
926{
927 if (bs->dev_ops && bs->dev_ops->resize_cb) {
928 bs->dev_ops->resize_cb(bs->dev_opaque);
0e49de52
MA
929 }
930}
931
f107639a
MA
932bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
933{
934 if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
935 return bs->dev_ops->is_medium_locked(bs->dev_opaque);
936 }
937 return false;
938}
939
e97fc193
AL
940/*
941 * Run consistency checks on an image
942 *
e076f338 943 * Returns 0 if the check could be completed (it doesn't mean that the image is
a1c7273b 944 * free of errors) or -errno when an internal error occurred. The results of the
e076f338 945 * check are stored in res.
e97fc193 946 */
e076f338 947int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res)
e97fc193
AL
948{
949 if (bs->drv->bdrv_check == NULL) {
950 return -ENOTSUP;
951 }
952
e076f338 953 memset(res, 0, sizeof(*res));
9ac228e0 954 return bs->drv->bdrv_check(bs, res);
e97fc193
AL
955}
956
8a426614
KW
957#define COMMIT_BUF_SECTORS 2048
958
33e3963e
FB
959/* commit COW file into the raw image */
960int bdrv_commit(BlockDriverState *bs)
961{
19cb3738 962 BlockDriver *drv = bs->drv;
ee181196 963 BlockDriver *backing_drv;
8a426614
KW
964 int64_t sector, total_sectors;
965 int n, ro, open_flags;
4dca4b63 966 int ret = 0, rw_ret = 0;
8a426614 967 uint8_t *buf;
4dca4b63
NS
968 char filename[1024];
969 BlockDriverState *bs_rw, *bs_ro;
33e3963e 970
19cb3738
FB
971 if (!drv)
972 return -ENOMEDIUM;
4dca4b63
NS
973
974 if (!bs->backing_hd) {
975 return -ENOTSUP;
33e3963e
FB
976 }
977
4dca4b63
NS
978 if (bs->backing_hd->keep_read_only) {
979 return -EACCES;
980 }
ee181196
KW
981
982 backing_drv = bs->backing_hd->drv;
4dca4b63
NS
983 ro = bs->backing_hd->read_only;
984 strncpy(filename, bs->backing_hd->filename, sizeof(filename));
985 open_flags = bs->backing_hd->open_flags;
986
987 if (ro) {
988 /* re-open as RW */
989 bdrv_delete(bs->backing_hd);
990 bs->backing_hd = NULL;
991 bs_rw = bdrv_new("");
ee181196
KW
992 rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
993 backing_drv);
4dca4b63
NS
994 if (rw_ret < 0) {
995 bdrv_delete(bs_rw);
996 /* try to re-open read-only */
997 bs_ro = bdrv_new("");
ee181196
KW
998 ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
999 backing_drv);
4dca4b63
NS
1000 if (ret < 0) {
1001 bdrv_delete(bs_ro);
1002 /* drive not functional anymore */
1003 bs->drv = NULL;
1004 return ret;
1005 }
1006 bs->backing_hd = bs_ro;
1007 return rw_ret;
1008 }
1009 bs->backing_hd = bs_rw;
ea2384d3 1010 }
33e3963e 1011
6ea44308 1012 total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
7267c094 1013 buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
8a426614
KW
1014
1015 for (sector = 0; sector < total_sectors; sector += n) {
05c4af54 1016 if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
8a426614
KW
1017
1018 if (bdrv_read(bs, sector, buf, n) != 0) {
1019 ret = -EIO;
1020 goto ro_cleanup;
1021 }
1022
1023 if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
1024 ret = -EIO;
1025 goto ro_cleanup;
1026 }
ea2384d3 1027 }
33e3963e 1028 }
95389c86 1029
1d44952f
CH
1030 if (drv->bdrv_make_empty) {
1031 ret = drv->bdrv_make_empty(bs);
1032 bdrv_flush(bs);
1033 }
95389c86 1034
3f5075ae
CH
1035 /*
1036 * Make sure all data we wrote to the backing device is actually
1037 * stable on disk.
1038 */
1039 if (bs->backing_hd)
1040 bdrv_flush(bs->backing_hd);
4dca4b63
NS
1041
1042ro_cleanup:
7267c094 1043 g_free(buf);
4dca4b63
NS
1044
1045 if (ro) {
1046 /* re-open as RO */
1047 bdrv_delete(bs->backing_hd);
1048 bs->backing_hd = NULL;
1049 bs_ro = bdrv_new("");
ee181196
KW
1050 ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1051 backing_drv);
4dca4b63
NS
1052 if (ret < 0) {
1053 bdrv_delete(bs_ro);
1054 /* drive not functional anymore */
1055 bs->drv = NULL;
1056 return ret;
1057 }
1058 bs->backing_hd = bs_ro;
1059 bs->backing_hd->keep_read_only = 0;
1060 }
1061
1d44952f 1062 return ret;
33e3963e
FB
1063}
1064
6ab4b5ab
MA
1065void bdrv_commit_all(void)
1066{
1067 BlockDriverState *bs;
1068
1069 QTAILQ_FOREACH(bs, &bdrv_states, list) {
1070 bdrv_commit(bs);
1071 }
1072}
1073
756e6736
KW
1074/*
1075 * Return values:
1076 * 0 - success
1077 * -EINVAL - backing format specified, but no file
1078 * -ENOSPC - can't update the backing file because no space is left in the
1079 * image file header
1080 * -ENOTSUP - format driver doesn't support changing the backing file
1081 */
1082int bdrv_change_backing_file(BlockDriverState *bs,
1083 const char *backing_file, const char *backing_fmt)
1084{
1085 BlockDriver *drv = bs->drv;
1086
1087 if (drv->bdrv_change_backing_file != NULL) {
1088 return drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
1089 } else {
1090 return -ENOTSUP;
1091 }
1092}
1093
71d0770c
AL
1094static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
1095 size_t size)
1096{
1097 int64_t len;
1098
1099 if (!bdrv_is_inserted(bs))
1100 return -ENOMEDIUM;
1101
1102 if (bs->growable)
1103 return 0;
1104
1105 len = bdrv_getlength(bs);
1106
fbb7b4e0
KW
1107 if (offset < 0)
1108 return -EIO;
1109
1110 if ((offset > len) || (len - offset < size))
71d0770c
AL
1111 return -EIO;
1112
1113 return 0;
1114}
1115
1116static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
1117 int nb_sectors)
1118{
eb5a3165
JS
1119 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
1120 nb_sectors * BDRV_SECTOR_SIZE);
71d0770c
AL
1121}
1122
1c9805a3
SH
1123typedef struct RwCo {
1124 BlockDriverState *bs;
1125 int64_t sector_num;
1126 int nb_sectors;
1127 QEMUIOVector *qiov;
1128 bool is_write;
1129 int ret;
1130} RwCo;
1131
1132static void coroutine_fn bdrv_rw_co_entry(void *opaque)
fc01f7e7 1133{
1c9805a3 1134 RwCo *rwco = opaque;
ea2384d3 1135
1c9805a3
SH
1136 if (!rwco->is_write) {
1137 rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
1138 rwco->nb_sectors, rwco->qiov);
1139 } else {
1140 rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
1141 rwco->nb_sectors, rwco->qiov);
1142 }
1143}
e7a8a783 1144
1c9805a3
SH
1145/*
1146 * Process a synchronous request using coroutines
1147 */
1148static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
1149 int nb_sectors, bool is_write)
1150{
1151 QEMUIOVector qiov;
1152 struct iovec iov = {
1153 .iov_base = (void *)buf,
1154 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
1155 };
1156 Coroutine *co;
1157 RwCo rwco = {
1158 .bs = bs,
1159 .sector_num = sector_num,
1160 .nb_sectors = nb_sectors,
1161 .qiov = &qiov,
1162 .is_write = is_write,
1163 .ret = NOT_DONE,
1164 };
e7a8a783 1165
1c9805a3 1166 qemu_iovec_init_external(&qiov, &iov, 1);
e7a8a783 1167
1c9805a3
SH
1168 if (qemu_in_coroutine()) {
1169 /* Fast-path if already in coroutine context */
1170 bdrv_rw_co_entry(&rwco);
1171 } else {
1172 co = qemu_coroutine_create(bdrv_rw_co_entry);
1173 qemu_coroutine_enter(co, &rwco);
1174 while (rwco.ret == NOT_DONE) {
1175 qemu_aio_wait();
1176 }
1177 }
1178 return rwco.ret;
1179}
b338082b 1180
1c9805a3
SH
1181/* return < 0 if error. See bdrv_write() for the return codes */
1182int bdrv_read(BlockDriverState *bs, int64_t sector_num,
1183 uint8_t *buf, int nb_sectors)
1184{
1185 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
fc01f7e7
FB
1186}
1187
7cd1e32a 1188static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
a55eb92c 1189 int nb_sectors, int dirty)
7cd1e32a
LS
1190{
1191 int64_t start, end;
c6d22830 1192 unsigned long val, idx, bit;
a55eb92c 1193
6ea44308 1194 start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
c6d22830 1195 end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
a55eb92c
JK
1196
1197 for (; start <= end; start++) {
c6d22830
JK
1198 idx = start / (sizeof(unsigned long) * 8);
1199 bit = start % (sizeof(unsigned long) * 8);
1200 val = bs->dirty_bitmap[idx];
1201 if (dirty) {
6d59fec1 1202 if (!(val & (1UL << bit))) {
aaa0eb75 1203 bs->dirty_count++;
6d59fec1 1204 val |= 1UL << bit;
aaa0eb75 1205 }
c6d22830 1206 } else {
6d59fec1 1207 if (val & (1UL << bit)) {
aaa0eb75 1208 bs->dirty_count--;
6d59fec1 1209 val &= ~(1UL << bit);
aaa0eb75 1210 }
c6d22830
JK
1211 }
1212 bs->dirty_bitmap[idx] = val;
7cd1e32a
LS
1213 }
1214}
1215
5fafdf24 1216/* Return < 0 if error. Important errors are:
19cb3738
FB
1217 -EIO generic I/O error (may happen for all errors)
1218 -ENOMEDIUM No media inserted.
1219 -EINVAL Invalid sector number or nb_sectors
1220 -EACCES Trying to write a read-only device
1221*/
5fafdf24 1222int bdrv_write(BlockDriverState *bs, int64_t sector_num,
fc01f7e7
FB
1223 const uint8_t *buf, int nb_sectors)
1224{
1c9805a3 1225 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
83f64091
FB
1226}
1227
eda578e5
AL
1228int bdrv_pread(BlockDriverState *bs, int64_t offset,
1229 void *buf, int count1)
83f64091 1230{
6ea44308 1231 uint8_t tmp_buf[BDRV_SECTOR_SIZE];
83f64091
FB
1232 int len, nb_sectors, count;
1233 int64_t sector_num;
9a8c4cce 1234 int ret;
83f64091
FB
1235
1236 count = count1;
1237 /* first read to align to sector start */
6ea44308 1238 len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
83f64091
FB
1239 if (len > count)
1240 len = count;
6ea44308 1241 sector_num = offset >> BDRV_SECTOR_BITS;
83f64091 1242 if (len > 0) {
9a8c4cce
KW
1243 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1244 return ret;
6ea44308 1245 memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
83f64091
FB
1246 count -= len;
1247 if (count == 0)
1248 return count1;
1249 sector_num++;
1250 buf += len;
1251 }
1252
1253 /* read the sectors "in place" */
6ea44308 1254 nb_sectors = count >> BDRV_SECTOR_BITS;
83f64091 1255 if (nb_sectors > 0) {
9a8c4cce
KW
1256 if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
1257 return ret;
83f64091 1258 sector_num += nb_sectors;
6ea44308 1259 len = nb_sectors << BDRV_SECTOR_BITS;
83f64091
FB
1260 buf += len;
1261 count -= len;
1262 }
1263
1264 /* add data from the last sector */
1265 if (count > 0) {
9a8c4cce
KW
1266 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1267 return ret;
83f64091
FB
1268 memcpy(buf, tmp_buf, count);
1269 }
1270 return count1;
1271}
1272
eda578e5
AL
1273int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
1274 const void *buf, int count1)
83f64091 1275{
6ea44308 1276 uint8_t tmp_buf[BDRV_SECTOR_SIZE];
83f64091
FB
1277 int len, nb_sectors, count;
1278 int64_t sector_num;
9a8c4cce 1279 int ret;
83f64091
FB
1280
1281 count = count1;
1282 /* first write to align to sector start */
6ea44308 1283 len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
83f64091
FB
1284 if (len > count)
1285 len = count;
6ea44308 1286 sector_num = offset >> BDRV_SECTOR_BITS;
83f64091 1287 if (len > 0) {
9a8c4cce
KW
1288 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1289 return ret;
6ea44308 1290 memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
9a8c4cce
KW
1291 if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1292 return ret;
83f64091
FB
1293 count -= len;
1294 if (count == 0)
1295 return count1;
1296 sector_num++;
1297 buf += len;
1298 }
1299
1300 /* write the sectors "in place" */
6ea44308 1301 nb_sectors = count >> BDRV_SECTOR_BITS;
83f64091 1302 if (nb_sectors > 0) {
9a8c4cce
KW
1303 if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
1304 return ret;
83f64091 1305 sector_num += nb_sectors;
6ea44308 1306 len = nb_sectors << BDRV_SECTOR_BITS;
83f64091
FB
1307 buf += len;
1308 count -= len;
1309 }
1310
1311 /* add data from the last sector */
1312 if (count > 0) {
9a8c4cce
KW
1313 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1314 return ret;
83f64091 1315 memcpy(tmp_buf, buf, count);
9a8c4cce
KW
1316 if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1317 return ret;
83f64091
FB
1318 }
1319 return count1;
1320}
83f64091 1321
f08145fe
KW
1322/*
1323 * Writes to the file and ensures that no writes are reordered across this
1324 * request (acts as a barrier)
1325 *
1326 * Returns 0 on success, -errno in error cases.
1327 */
1328int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
1329 const void *buf, int count)
1330{
1331 int ret;
1332
1333 ret = bdrv_pwrite(bs, offset, buf, count);
1334 if (ret < 0) {
1335 return ret;
1336 }
1337
92196b2f
SH
1338 /* No flush needed for cache modes that use O_DSYNC */
1339 if ((bs->open_flags & BDRV_O_CACHE_WB) != 0) {
f08145fe
KW
1340 bdrv_flush(bs);
1341 }
1342
1343 return 0;
1344}
1345
c5fbe571
SH
1346/*
1347 * Handle a read request in coroutine context
1348 */
1349static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
1350 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
da1fa91d
KW
1351{
1352 BlockDriver *drv = bs->drv;
1353
da1fa91d
KW
1354 if (!drv) {
1355 return -ENOMEDIUM;
1356 }
1357 if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1358 return -EIO;
1359 }
1360
98f90dba
ZYW
1361 /* throttling disk read I/O */
1362 if (bs->io_limits_enabled) {
1363 bdrv_io_limits_intercept(bs, false, nb_sectors);
1364 }
1365
da1fa91d
KW
1366 return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
1367}
1368
c5fbe571 1369int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
da1fa91d
KW
1370 int nb_sectors, QEMUIOVector *qiov)
1371{
c5fbe571 1372 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
da1fa91d 1373
c5fbe571
SH
1374 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov);
1375}
1376
1377/*
1378 * Handle a write request in coroutine context
1379 */
1380static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
1381 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1382{
1383 BlockDriver *drv = bs->drv;
6b7cb247 1384 int ret;
da1fa91d
KW
1385
1386 if (!bs->drv) {
1387 return -ENOMEDIUM;
1388 }
1389 if (bs->read_only) {
1390 return -EACCES;
1391 }
1392 if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1393 return -EIO;
1394 }
1395
98f90dba
ZYW
1396 /* throttling disk write I/O */
1397 if (bs->io_limits_enabled) {
1398 bdrv_io_limits_intercept(bs, true, nb_sectors);
1399 }
1400
6b7cb247
SH
1401 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
1402
da1fa91d
KW
1403 if (bs->dirty_bitmap) {
1404 set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
1405 }
1406
1407 if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
1408 bs->wr_highest_sector = sector_num + nb_sectors - 1;
1409 }
1410
6b7cb247 1411 return ret;
da1fa91d
KW
1412}
1413
c5fbe571
SH
1414int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
1415 int nb_sectors, QEMUIOVector *qiov)
1416{
1417 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
1418
1419 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov);
1420}
1421
83f64091
FB
1422/**
1423 * Truncate file to 'offset' bytes (needed only for file protocols)
1424 */
1425int bdrv_truncate(BlockDriverState *bs, int64_t offset)
1426{
1427 BlockDriver *drv = bs->drv;
51762288 1428 int ret;
83f64091 1429 if (!drv)
19cb3738 1430 return -ENOMEDIUM;
83f64091
FB
1431 if (!drv->bdrv_truncate)
1432 return -ENOTSUP;
59f2689d
NS
1433 if (bs->read_only)
1434 return -EACCES;
8591675f
MT
1435 if (bdrv_in_use(bs))
1436 return -EBUSY;
51762288
SH
1437 ret = drv->bdrv_truncate(bs, offset);
1438 if (ret == 0) {
1439 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
145feb17 1440 bdrv_dev_resize_cb(bs);
51762288
SH
1441 }
1442 return ret;
83f64091
FB
1443}
1444
4a1d5e1f
FZ
1445/**
1446 * Length of a allocated file in bytes. Sparse files are counted by actual
1447 * allocated space. Return < 0 if error or unknown.
1448 */
1449int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
1450{
1451 BlockDriver *drv = bs->drv;
1452 if (!drv) {
1453 return -ENOMEDIUM;
1454 }
1455 if (drv->bdrv_get_allocated_file_size) {
1456 return drv->bdrv_get_allocated_file_size(bs);
1457 }
1458 if (bs->file) {
1459 return bdrv_get_allocated_file_size(bs->file);
1460 }
1461 return -ENOTSUP;
1462}
1463
83f64091
FB
1464/**
1465 * Length of a file in bytes. Return < 0 if error or unknown.
1466 */
1467int64_t bdrv_getlength(BlockDriverState *bs)
1468{
1469 BlockDriver *drv = bs->drv;
1470 if (!drv)
19cb3738 1471 return -ENOMEDIUM;
51762288 1472
2c6942fa 1473 if (bs->growable || bdrv_dev_has_removable_media(bs)) {
46a4e4e6
SH
1474 if (drv->bdrv_getlength) {
1475 return drv->bdrv_getlength(bs);
1476 }
83f64091 1477 }
46a4e4e6 1478 return bs->total_sectors * BDRV_SECTOR_SIZE;
fc01f7e7
FB
1479}
1480
19cb3738 1481/* return 0 as number of sectors if no device present or error */
96b8f136 1482void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
fc01f7e7 1483{
19cb3738
FB
1484 int64_t length;
1485 length = bdrv_getlength(bs);
1486 if (length < 0)
1487 length = 0;
1488 else
6ea44308 1489 length = length >> BDRV_SECTOR_BITS;
19cb3738 1490 *nb_sectors_ptr = length;
fc01f7e7 1491}
cf98951b 1492
f3d54fc4
AL
/* One entry of the MSDOS partition table as parsed by guess_disk_lchs() */
struct partition {
    uint8_t boot_ind;       /* 0x80 - active */
    uint8_t head;           /* starting head */
    uint8_t sector;         /* starting sector */
    uint8_t cyl;            /* starting cylinder */
    uint8_t sys_ind;        /* What partition type */
    uint8_t end_head;       /* end head */
    uint8_t end_sector;     /* end sector */
    uint8_t end_cyl;        /* end cylinder */
    uint32_t start_sect;    /* starting sector counting from 0 */
    uint32_t nr_sects;      /* nr of sectors in partition */
} QEMU_PACKED;
f3d54fc4
AL
1505
1506/* try to guess the disk logical geometry from the MSDOS partition table. Return 0 if OK, -1 if could not guess */
1507static int guess_disk_lchs(BlockDriverState *bs,
1508 int *pcylinders, int *pheads, int *psectors)
1509{
eb5a3165 1510 uint8_t buf[BDRV_SECTOR_SIZE];
f3d54fc4
AL
1511 int ret, i, heads, sectors, cylinders;
1512 struct partition *p;
1513 uint32_t nr_sects;
a38131b6 1514 uint64_t nb_sectors;
f3d54fc4
AL
1515
1516 bdrv_get_geometry(bs, &nb_sectors);
1517
1518 ret = bdrv_read(bs, 0, buf, 1);
1519 if (ret < 0)
1520 return -1;
1521 /* test msdos magic */
1522 if (buf[510] != 0x55 || buf[511] != 0xaa)
1523 return -1;
1524 for(i = 0; i < 4; i++) {
1525 p = ((struct partition *)(buf + 0x1be)) + i;
1526 nr_sects = le32_to_cpu(p->nr_sects);
1527 if (nr_sects && p->end_head) {
1528 /* We make the assumption that the partition terminates on
1529 a cylinder boundary */
1530 heads = p->end_head + 1;
1531 sectors = p->end_sector & 63;
1532 if (sectors == 0)
1533 continue;
1534 cylinders = nb_sectors / (heads * sectors);
1535 if (cylinders < 1 || cylinders > 16383)
1536 continue;
1537 *pheads = heads;
1538 *psectors = sectors;
1539 *pcylinders = cylinders;
1540#if 0
1541 printf("guessed geometry: LCHS=%d %d %d\n",
1542 cylinders, heads, sectors);
1543#endif
1544 return 0;
1545 }
1546 }
1547 return -1;
1548}
1549
1550void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
1551{
1552 int translation, lba_detected = 0;
1553 int cylinders, heads, secs;
a38131b6 1554 uint64_t nb_sectors;
f3d54fc4
AL
1555
1556 /* if a geometry hint is available, use it */
1557 bdrv_get_geometry(bs, &nb_sectors);
1558 bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
1559 translation = bdrv_get_translation_hint(bs);
1560 if (cylinders != 0) {
1561 *pcyls = cylinders;
1562 *pheads = heads;
1563 *psecs = secs;
1564 } else {
1565 if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
1566 if (heads > 16) {
1567 /* if heads > 16, it means that a BIOS LBA
1568 translation was active, so the default
1569 hardware geometry is OK */
1570 lba_detected = 1;
1571 goto default_geometry;
1572 } else {
1573 *pcyls = cylinders;
1574 *pheads = heads;
1575 *psecs = secs;
1576 /* disable any translation to be in sync with
1577 the logical geometry */
1578 if (translation == BIOS_ATA_TRANSLATION_AUTO) {
1579 bdrv_set_translation_hint(bs,
1580 BIOS_ATA_TRANSLATION_NONE);
1581 }
1582 }
1583 } else {
1584 default_geometry:
1585 /* if no geometry, use a standard physical disk geometry */
1586 cylinders = nb_sectors / (16 * 63);
1587
1588 if (cylinders > 16383)
1589 cylinders = 16383;
1590 else if (cylinders < 2)
1591 cylinders = 2;
1592 *pcyls = cylinders;
1593 *pheads = 16;
1594 *psecs = 63;
1595 if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
1596 if ((*pcyls * *pheads) <= 131072) {
1597 bdrv_set_translation_hint(bs,
1598 BIOS_ATA_TRANSLATION_LARGE);
1599 } else {
1600 bdrv_set_translation_hint(bs,
1601 BIOS_ATA_TRANSLATION_LBA);
1602 }
1603 }
1604 }
1605 bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
1606 }
1607}
1608
5fafdf24 1609void bdrv_set_geometry_hint(BlockDriverState *bs,
b338082b
FB
1610 int cyls, int heads, int secs)
1611{
1612 bs->cyls = cyls;
1613 bs->heads = heads;
1614 bs->secs = secs;
1615}
1616
46d4767d
FB
1617void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
1618{
1619 bs->translation = translation;
1620}
1621
5fafdf24 1622void bdrv_get_geometry_hint(BlockDriverState *bs,
b338082b
FB
1623 int *pcyls, int *pheads, int *psecs)
1624{
1625 *pcyls = bs->cyls;
1626 *pheads = bs->heads;
1627 *psecs = bs->secs;
1628}
1629
0563e191
ZYW
1630/* throttling disk io limits */
1631void bdrv_set_io_limits(BlockDriverState *bs,
1632 BlockIOLimit *io_limits)
1633{
1634 bs->io_limits = *io_limits;
1635 bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
1636}
1637
5bbdbb46
BS
1638/* Recognize floppy formats */
1639typedef struct FDFormat {
1640 FDriveType drive;
1641 uint8_t last_sect;
1642 uint8_t max_track;
1643 uint8_t max_head;
1644} FDFormat;
1645
1646static const FDFormat fd_formats[] = {
1647 /* First entry is default format */
1648 /* 1.44 MB 3"1/2 floppy disks */
1649 { FDRIVE_DRV_144, 18, 80, 1, },
1650 { FDRIVE_DRV_144, 20, 80, 1, },
1651 { FDRIVE_DRV_144, 21, 80, 1, },
1652 { FDRIVE_DRV_144, 21, 82, 1, },
1653 { FDRIVE_DRV_144, 21, 83, 1, },
1654 { FDRIVE_DRV_144, 22, 80, 1, },
1655 { FDRIVE_DRV_144, 23, 80, 1, },
1656 { FDRIVE_DRV_144, 24, 80, 1, },
1657 /* 2.88 MB 3"1/2 floppy disks */
1658 { FDRIVE_DRV_288, 36, 80, 1, },
1659 { FDRIVE_DRV_288, 39, 80, 1, },
1660 { FDRIVE_DRV_288, 40, 80, 1, },
1661 { FDRIVE_DRV_288, 44, 80, 1, },
1662 { FDRIVE_DRV_288, 48, 80, 1, },
1663 /* 720 kB 3"1/2 floppy disks */
1664 { FDRIVE_DRV_144, 9, 80, 1, },
1665 { FDRIVE_DRV_144, 10, 80, 1, },
1666 { FDRIVE_DRV_144, 10, 82, 1, },
1667 { FDRIVE_DRV_144, 10, 83, 1, },
1668 { FDRIVE_DRV_144, 13, 80, 1, },
1669 { FDRIVE_DRV_144, 14, 80, 1, },
1670 /* 1.2 MB 5"1/4 floppy disks */
1671 { FDRIVE_DRV_120, 15, 80, 1, },
1672 { FDRIVE_DRV_120, 18, 80, 1, },
1673 { FDRIVE_DRV_120, 18, 82, 1, },
1674 { FDRIVE_DRV_120, 18, 83, 1, },
1675 { FDRIVE_DRV_120, 20, 80, 1, },
1676 /* 720 kB 5"1/4 floppy disks */
1677 { FDRIVE_DRV_120, 9, 80, 1, },
1678 { FDRIVE_DRV_120, 11, 80, 1, },
1679 /* 360 kB 5"1/4 floppy disks */
1680 { FDRIVE_DRV_120, 9, 40, 1, },
1681 { FDRIVE_DRV_120, 9, 40, 0, },
1682 { FDRIVE_DRV_120, 10, 41, 1, },
1683 { FDRIVE_DRV_120, 10, 42, 1, },
1684 /* 320 kB 5"1/4 floppy disks */
1685 { FDRIVE_DRV_120, 8, 40, 1, },
1686 { FDRIVE_DRV_120, 8, 40, 0, },
1687 /* 360 kB must match 5"1/4 better than 3"1/2... */
1688 { FDRIVE_DRV_144, 9, 80, 0, },
1689 /* end */
1690 { FDRIVE_DRV_NONE, -1, -1, 0, },
1691};
1692
1693void bdrv_get_floppy_geometry_hint(BlockDriverState *bs, int *nb_heads,
1694 int *max_track, int *last_sect,
1695 FDriveType drive_in, FDriveType *drive)
1696{
1697 const FDFormat *parse;
1698 uint64_t nb_sectors, size;
1699 int i, first_match, match;
1700
1701 bdrv_get_geometry_hint(bs, nb_heads, max_track, last_sect);
1702 if (*nb_heads != 0 && *max_track != 0 && *last_sect != 0) {
1703 /* User defined disk */
1704 } else {
1705 bdrv_get_geometry(bs, &nb_sectors);
1706 match = -1;
1707 first_match = -1;
1708 for (i = 0; ; i++) {
1709 parse = &fd_formats[i];
1710 if (parse->drive == FDRIVE_DRV_NONE) {
1711 break;
1712 }
1713 if (drive_in == parse->drive ||
1714 drive_in == FDRIVE_DRV_NONE) {
1715 size = (parse->max_head + 1) * parse->max_track *
1716 parse->last_sect;
1717 if (nb_sectors == size) {
1718 match = i;
1719 break;
1720 }
1721 if (first_match == -1) {
1722 first_match = i;
1723 }
1724 }
1725 }
1726 if (match == -1) {
1727 if (first_match == -1) {
1728 match = 1;
1729 } else {
1730 match = first_match;
1731 }
1732 parse = &fd_formats[match];
1733 }
1734 *nb_heads = parse->max_head + 1;
1735 *max_track = parse->max_track;
1736 *last_sect = parse->last_sect;
1737 *drive = parse->drive;
1738 }
1739}
1740
46d4767d
FB
1741int bdrv_get_translation_hint(BlockDriverState *bs)
1742{
1743 return bs->translation;
1744}
1745
abd7f68d
MA
1746void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
1747 BlockErrorAction on_write_error)
1748{
1749 bs->on_read_error = on_read_error;
1750 bs->on_write_error = on_write_error;
1751}
1752
1753BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
1754{
1755 return is_read ? bs->on_read_error : bs->on_write_error;
1756}
1757
b338082b
FB
1758int bdrv_is_read_only(BlockDriverState *bs)
1759{
1760 return bs->read_only;
1761}
1762
985a03b0
TS
1763int bdrv_is_sg(BlockDriverState *bs)
1764{
1765 return bs->sg;
1766}
1767
e900a7b7
CH
1768int bdrv_enable_write_cache(BlockDriverState *bs)
1769{
1770 return bs->enable_write_cache;
1771}
1772
ea2384d3
FB
1773int bdrv_is_encrypted(BlockDriverState *bs)
1774{
1775 if (bs->backing_hd && bs->backing_hd->encrypted)
1776 return 1;
1777 return bs->encrypted;
1778}
1779
c0f4ce77
AL
1780int bdrv_key_required(BlockDriverState *bs)
1781{
1782 BlockDriverState *backing_hd = bs->backing_hd;
1783
1784 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
1785 return 1;
1786 return (bs->encrypted && !bs->valid_key);
1787}
1788
ea2384d3
FB
1789int bdrv_set_key(BlockDriverState *bs, const char *key)
1790{
1791 int ret;
1792 if (bs->backing_hd && bs->backing_hd->encrypted) {
1793 ret = bdrv_set_key(bs->backing_hd, key);
1794 if (ret < 0)
1795 return ret;
1796 if (!bs->encrypted)
1797 return 0;
1798 }
fd04a2ae
SH
1799 if (!bs->encrypted) {
1800 return -EINVAL;
1801 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
1802 return -ENOMEDIUM;
1803 }
c0f4ce77 1804 ret = bs->drv->bdrv_set_key(bs, key);
bb5fc20f
AL
1805 if (ret < 0) {
1806 bs->valid_key = 0;
1807 } else if (!bs->valid_key) {
1808 bs->valid_key = 1;
1809 /* call the change callback now, we skipped it on open */
7d4b4ba5 1810 bdrv_dev_change_media_cb(bs, true);
bb5fc20f 1811 }
c0f4ce77 1812 return ret;
ea2384d3
FB
1813}
1814
1815void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
1816{
19cb3738 1817 if (!bs->drv) {
ea2384d3
FB
1818 buf[0] = '\0';
1819 } else {
1820 pstrcpy(buf, buf_size, bs->drv->format_name);
1821 }
1822}
1823
5fafdf24 1824void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
ea2384d3
FB
1825 void *opaque)
1826{
1827 BlockDriver *drv;
1828
8a22f02a 1829 QLIST_FOREACH(drv, &bdrv_drivers, list) {
ea2384d3
FB
1830 it(opaque, drv->format_name);
1831 }
1832}
1833
b338082b
FB
1834BlockDriverState *bdrv_find(const char *name)
1835{
1836 BlockDriverState *bs;
1837
1b7bdbc1
SH
1838 QTAILQ_FOREACH(bs, &bdrv_states, list) {
1839 if (!strcmp(name, bs->device_name)) {
b338082b 1840 return bs;
1b7bdbc1 1841 }
b338082b
FB
1842 }
1843 return NULL;
1844}
1845
2f399b0a
MA
1846BlockDriverState *bdrv_next(BlockDriverState *bs)
1847{
1848 if (!bs) {
1849 return QTAILQ_FIRST(&bdrv_states);
1850 }
1851 return QTAILQ_NEXT(bs, list);
1852}
1853
51de9760 1854void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
81d0912d
FB
1855{
1856 BlockDriverState *bs;
1857
1b7bdbc1 1858 QTAILQ_FOREACH(bs, &bdrv_states, list) {
51de9760 1859 it(opaque, bs);
81d0912d
FB
1860 }
1861}
1862
ea2384d3
FB
1863const char *bdrv_get_device_name(BlockDriverState *bs)
1864{
1865 return bs->device_name;
1866}
1867
c6ca28d6
AL
1868void bdrv_flush_all(void)
1869{
1870 BlockDriverState *bs;
1871
1b7bdbc1 1872 QTAILQ_FOREACH(bs, &bdrv_states, list) {
c602a489 1873 if (!bdrv_is_read_only(bs) && bdrv_is_inserted(bs)) {
c6ca28d6 1874 bdrv_flush(bs);
1b7bdbc1
SH
1875 }
1876 }
c6ca28d6
AL
1877}
1878
f2feebbd
KW
1879int bdrv_has_zero_init(BlockDriverState *bs)
1880{
1881 assert(bs->drv);
1882
336c1c12
KW
1883 if (bs->drv->bdrv_has_zero_init) {
1884 return bs->drv->bdrv_has_zero_init(bs);
f2feebbd
KW
1885 }
1886
1887 return 1;
1888}
1889
376ae3f1
SH
1890typedef struct BdrvCoIsAllocatedData {
1891 BlockDriverState *bs;
1892 int64_t sector_num;
1893 int nb_sectors;
1894 int *pnum;
1895 int ret;
1896 bool done;
1897} BdrvCoIsAllocatedData;
1898
1899/* Coroutine wrapper for bdrv_is_allocated() */
1900static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
1901{
1902 BdrvCoIsAllocatedData *data = opaque;
1903 BlockDriverState *bs = data->bs;
1904
1905 data->ret = bs->drv->bdrv_co_is_allocated(bs, data->sector_num,
1906 data->nb_sectors, data->pnum);
1907 data->done = true;
1908}
1909
f58c7b35
TS
1910/*
1911 * Returns true iff the specified sector is present in the disk image. Drivers
1912 * not implementing the functionality are assumed to not support backing files,
1913 * hence all their sectors are reported as allocated.
1914 *
1915 * 'pnum' is set to the number of sectors (including and immediately following
1916 * the specified sector) that are known to be in the same
1917 * allocated/unallocated state.
1918 *
1919 * 'nb_sectors' is the max value 'pnum' should be set to.
1920 */
1921int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
1922 int *pnum)
1923{
6aebab14
SH
1924 if (!bs->drv->bdrv_co_is_allocated) {
1925 int64_t n;
f58c7b35
TS
1926 if (sector_num >= bs->total_sectors) {
1927 *pnum = 0;
1928 return 0;
1929 }
1930 n = bs->total_sectors - sector_num;
1931 *pnum = (n < nb_sectors) ? (n) : (nb_sectors);
1932 return 1;
1933 }
6aebab14
SH
1934
1935 Coroutine *co;
1936 BdrvCoIsAllocatedData data = {
1937 .bs = bs,
1938 .sector_num = sector_num,
1939 .nb_sectors = nb_sectors,
1940 .pnum = pnum,
1941 .done = false,
1942 };
1943
1944 co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
1945 qemu_coroutine_enter(co, &data);
1946 while (!data.done) {
1947 qemu_aio_wait();
1948 }
1949 return data.ret;
f58c7b35
TS
1950}
1951
2582bfed
LC
1952void bdrv_mon_event(const BlockDriverState *bdrv,
1953 BlockMonEventAction action, int is_read)
1954{
1955 QObject *data;
1956 const char *action_str;
1957
1958 switch (action) {
1959 case BDRV_ACTION_REPORT:
1960 action_str = "report";
1961 break;
1962 case BDRV_ACTION_IGNORE:
1963 action_str = "ignore";
1964 break;
1965 case BDRV_ACTION_STOP:
1966 action_str = "stop";
1967 break;
1968 default:
1969 abort();
1970 }
1971
1972 data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1973 bdrv->device_name,
1974 action_str,
1975 is_read ? "read" : "write");
1976 monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);
1977
1978 qobject_decref(data);
1979}
1980
b2023818 1981BlockInfoList *qmp_query_block(Error **errp)
b338082b 1982{
b2023818 1983 BlockInfoList *head = NULL, *cur_item = NULL;
b338082b
FB
1984 BlockDriverState *bs;
1985
1b7bdbc1 1986 QTAILQ_FOREACH(bs, &bdrv_states, list) {
b2023818 1987 BlockInfoList *info = g_malloc0(sizeof(*info));
d15e5465 1988
b2023818
LC
1989 info->value = g_malloc0(sizeof(*info->value));
1990 info->value->device = g_strdup(bs->device_name);
1991 info->value->type = g_strdup("unknown");
1992 info->value->locked = bdrv_dev_is_medium_locked(bs);
1993 info->value->removable = bdrv_dev_has_removable_media(bs);
d15e5465 1994
e4def80b 1995 if (bdrv_dev_has_removable_media(bs)) {
b2023818
LC
1996 info->value->has_tray_open = true;
1997 info->value->tray_open = bdrv_dev_is_tray_open(bs);
e4def80b 1998 }
f04ef601
LC
1999
2000 if (bdrv_iostatus_is_enabled(bs)) {
b2023818
LC
2001 info->value->has_io_status = true;
2002 info->value->io_status = bs->iostatus;
f04ef601
LC
2003 }
2004
19cb3738 2005 if (bs->drv) {
b2023818
LC
2006 info->value->has_inserted = true;
2007 info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
2008 info->value->inserted->file = g_strdup(bs->filename);
2009 info->value->inserted->ro = bs->read_only;
2010 info->value->inserted->drv = g_strdup(bs->drv->format_name);
2011 info->value->inserted->encrypted = bs->encrypted;
2012 if (bs->backing_file[0]) {
2013 info->value->inserted->has_backing_file = true;
2014 info->value->inserted->backing_file = g_strdup(bs->backing_file);
376253ec 2015 }
727f005e
ZYW
2016
2017 if (bs->io_limits_enabled) {
2018 info->value->inserted->bps =
2019 bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
2020 info->value->inserted->bps_rd =
2021 bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
2022 info->value->inserted->bps_wr =
2023 bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
2024 info->value->inserted->iops =
2025 bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
2026 info->value->inserted->iops_rd =
2027 bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
2028 info->value->inserted->iops_wr =
2029 bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
2030 }
b2023818 2031 }
d15e5465 2032
b2023818
LC
2033 /* XXX: waiting for the qapi to support GSList */
2034 if (!cur_item) {
2035 head = cur_item = info;
2036 } else {
2037 cur_item->next = info;
2038 cur_item = info;
b338082b 2039 }
b338082b 2040 }
d15e5465 2041
b2023818 2042 return head;
b338082b 2043}
a36e69dd 2044
f11f57e4
LC
2045/* Consider exposing this as a full fledged QMP command */
2046static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
2047{
2048 BlockStats *s;
2049
2050 s = g_malloc0(sizeof(*s));
2051
2052 if (bs->device_name[0]) {
2053 s->has_device = true;
2054 s->device = g_strdup(bs->device_name);
294cc35f
KW
2055 }
2056
f11f57e4
LC
2057 s->stats = g_malloc0(sizeof(*s->stats));
2058 s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
2059 s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
2060 s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
2061 s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
2062 s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
2063 s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
2064 s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
2065 s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
2066 s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];
2067
294cc35f 2068 if (bs->file) {
f11f57e4
LC
2069 s->has_parent = true;
2070 s->parent = qmp_query_blockstat(bs->file, NULL);
294cc35f
KW
2071 }
2072
f11f57e4 2073 return s;
294cc35f
KW
2074}
2075
f11f57e4 2076BlockStatsList *qmp_query_blockstats(Error **errp)
218a536a 2077{
f11f57e4 2078 BlockStatsList *head = NULL, *cur_item = NULL;
a36e69dd
TS
2079 BlockDriverState *bs;
2080
1b7bdbc1 2081 QTAILQ_FOREACH(bs, &bdrv_states, list) {
f11f57e4
LC
2082 BlockStatsList *info = g_malloc0(sizeof(*info));
2083 info->value = qmp_query_blockstat(bs, NULL);
2084
2085 /* XXX: waiting for the qapi to support GSList */
2086 if (!cur_item) {
2087 head = cur_item = info;
2088 } else {
2089 cur_item->next = info;
2090 cur_item = info;
2091 }
a36e69dd 2092 }
218a536a 2093
f11f57e4 2094 return head;
a36e69dd 2095}
ea2384d3 2096
045df330
AL
2097const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
2098{
2099 if (bs->backing_hd && bs->backing_hd->encrypted)
2100 return bs->backing_file;
2101 else if (bs->encrypted)
2102 return bs->filename;
2103 else
2104 return NULL;
2105}
2106
5fafdf24 2107void bdrv_get_backing_filename(BlockDriverState *bs,
83f64091
FB
2108 char *filename, int filename_size)
2109{
3574c608 2110 pstrcpy(filename, filename_size, bs->backing_file);
83f64091
FB
2111}
2112
5fafdf24 2113int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
faea38e7
FB
2114 const uint8_t *buf, int nb_sectors)
2115{
2116 BlockDriver *drv = bs->drv;
2117 if (!drv)
19cb3738 2118 return -ENOMEDIUM;
faea38e7
FB
2119 if (!drv->bdrv_write_compressed)
2120 return -ENOTSUP;
fbb7b4e0
KW
2121 if (bdrv_check_request(bs, sector_num, nb_sectors))
2122 return -EIO;
a55eb92c 2123
c6d22830 2124 if (bs->dirty_bitmap) {
7cd1e32a
LS
2125 set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2126 }
a55eb92c 2127
faea38e7
FB
2128 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
2129}
3b46e624 2130
faea38e7
FB
2131int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2132{
2133 BlockDriver *drv = bs->drv;
2134 if (!drv)
19cb3738 2135 return -ENOMEDIUM;
faea38e7
FB
2136 if (!drv->bdrv_get_info)
2137 return -ENOTSUP;
2138 memset(bdi, 0, sizeof(*bdi));
2139 return drv->bdrv_get_info(bs, bdi);
2140}
2141
45566e9c
CH
2142int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2143 int64_t pos, int size)
178e08a5
AL
2144{
2145 BlockDriver *drv = bs->drv;
2146 if (!drv)
2147 return -ENOMEDIUM;
7cdb1f6d
MK
2148 if (drv->bdrv_save_vmstate)
2149 return drv->bdrv_save_vmstate(bs, buf, pos, size);
2150 if (bs->file)
2151 return bdrv_save_vmstate(bs->file, buf, pos, size);
2152 return -ENOTSUP;
178e08a5
AL
2153}
2154
45566e9c
CH
2155int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2156 int64_t pos, int size)
178e08a5
AL
2157{
2158 BlockDriver *drv = bs->drv;
2159 if (!drv)
2160 return -ENOMEDIUM;
7cdb1f6d
MK
2161 if (drv->bdrv_load_vmstate)
2162 return drv->bdrv_load_vmstate(bs, buf, pos, size);
2163 if (bs->file)
2164 return bdrv_load_vmstate(bs->file, buf, pos, size);
2165 return -ENOTSUP;
178e08a5
AL
2166}
2167
8b9b0cc2
KW
2168void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
2169{
2170 BlockDriver *drv = bs->drv;
2171
2172 if (!drv || !drv->bdrv_debug_event) {
2173 return;
2174 }
2175
2176 return drv->bdrv_debug_event(bs, event);
2177
2178}
2179
faea38e7
FB
2180/**************************************************************/
2181/* handling of snapshots */
2182
feeee5ac
MDCF
2183int bdrv_can_snapshot(BlockDriverState *bs)
2184{
2185 BlockDriver *drv = bs->drv;
07b70bfb 2186 if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
feeee5ac
MDCF
2187 return 0;
2188 }
2189
2190 if (!drv->bdrv_snapshot_create) {
2191 if (bs->file != NULL) {
2192 return bdrv_can_snapshot(bs->file);
2193 }
2194 return 0;
2195 }
2196
2197 return 1;
2198}
2199
199630b6
BS
2200int bdrv_is_snapshot(BlockDriverState *bs)
2201{
2202 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
2203}
2204
f9092b10
MA
2205BlockDriverState *bdrv_snapshots(void)
2206{
2207 BlockDriverState *bs;
2208
3ac906f7 2209 if (bs_snapshots) {
f9092b10 2210 return bs_snapshots;
3ac906f7 2211 }
f9092b10
MA
2212
2213 bs = NULL;
2214 while ((bs = bdrv_next(bs))) {
2215 if (bdrv_can_snapshot(bs)) {
3ac906f7
MA
2216 bs_snapshots = bs;
2217 return bs;
f9092b10
MA
2218 }
2219 }
2220 return NULL;
f9092b10
MA
2221}
2222
5fafdf24 2223int bdrv_snapshot_create(BlockDriverState *bs,
faea38e7
FB
2224 QEMUSnapshotInfo *sn_info)
2225{
2226 BlockDriver *drv = bs->drv;
2227 if (!drv)
19cb3738 2228 return -ENOMEDIUM;
7cdb1f6d
MK
2229 if (drv->bdrv_snapshot_create)
2230 return drv->bdrv_snapshot_create(bs, sn_info);
2231 if (bs->file)
2232 return bdrv_snapshot_create(bs->file, sn_info);
2233 return -ENOTSUP;
faea38e7
FB
2234}
2235
5fafdf24 2236int bdrv_snapshot_goto(BlockDriverState *bs,
faea38e7
FB
2237 const char *snapshot_id)
2238{
2239 BlockDriver *drv = bs->drv;
7cdb1f6d
MK
2240 int ret, open_ret;
2241
faea38e7 2242 if (!drv)
19cb3738 2243 return -ENOMEDIUM;
7cdb1f6d
MK
2244 if (drv->bdrv_snapshot_goto)
2245 return drv->bdrv_snapshot_goto(bs, snapshot_id);
2246
2247 if (bs->file) {
2248 drv->bdrv_close(bs);
2249 ret = bdrv_snapshot_goto(bs->file, snapshot_id);
2250 open_ret = drv->bdrv_open(bs, bs->open_flags);
2251 if (open_ret < 0) {
2252 bdrv_delete(bs->file);
2253 bs->drv = NULL;
2254 return open_ret;
2255 }
2256 return ret;
2257 }
2258
2259 return -ENOTSUP;
faea38e7
FB
2260}
2261
2262int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
2263{
2264 BlockDriver *drv = bs->drv;
2265 if (!drv)
19cb3738 2266 return -ENOMEDIUM;
7cdb1f6d
MK
2267 if (drv->bdrv_snapshot_delete)
2268 return drv->bdrv_snapshot_delete(bs, snapshot_id);
2269 if (bs->file)
2270 return bdrv_snapshot_delete(bs->file, snapshot_id);
2271 return -ENOTSUP;
faea38e7
FB
2272}
2273
5fafdf24 2274int bdrv_snapshot_list(BlockDriverState *bs,
faea38e7
FB
2275 QEMUSnapshotInfo **psn_info)
2276{
2277 BlockDriver *drv = bs->drv;
2278 if (!drv)
19cb3738 2279 return -ENOMEDIUM;
7cdb1f6d
MK
2280 if (drv->bdrv_snapshot_list)
2281 return drv->bdrv_snapshot_list(bs, psn_info);
2282 if (bs->file)
2283 return bdrv_snapshot_list(bs->file, psn_info);
2284 return -ENOTSUP;
faea38e7
FB
2285}
2286
51ef6727 2287int bdrv_snapshot_load_tmp(BlockDriverState *bs,
2288 const char *snapshot_name)
2289{
2290 BlockDriver *drv = bs->drv;
2291 if (!drv) {
2292 return -ENOMEDIUM;
2293 }
2294 if (!bs->read_only) {
2295 return -EINVAL;
2296 }
2297 if (drv->bdrv_snapshot_load_tmp) {
2298 return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
2299 }
2300 return -ENOTSUP;
2301}
2302
faea38e7
FB
#define NB_SUFFIXES 4

/*
 * Format @size into @buf (of @buf_size bytes) as a short human readable
 * string and return @buf.
 *
 * Values up to 999 are printed as a plain integer.  Larger values are
 * scaled by powers of 1024 and get one of the suffixes K, M, G or T:
 * values below 10 units are printed with one decimal, larger ones are
 * rounded to the nearest integer.  Anything beyond the T range is still
 * expressed in T.
 */
char *get_human_readable_size(char *buf, int buf_size, int64_t size)
{
    static const char suffixes[NB_SUFFIXES] = "KMGT";
    int64_t base;
    int i;

    if (size <= 999) {
        snprintf(buf, buf_size, "%" PRId64, size);
    } else {
        base = 1024;
        for (i = 0; i < NB_SUFFIXES; i++) {
            if (size < (10 * base)) {
                snprintf(buf, buf_size, "%0.1f%c",
                         (double)size / base,
                         suffixes[i]);
                break;
            } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) {
                /*
                 * Round to nearest.  Computing (size + base / 2) / base
                 * would overflow for sizes close to INT64_MAX, so derive
                 * the same result from quotient and remainder instead
                 * (size is positive here).
                 */
                int64_t rounded = size / base +
                                  ((size % base) >= (base >> 1) ? 1 : 0);

                snprintf(buf, buf_size, "%" PRId64 "%c",
                         rounded,
                         suffixes[i]);
                break;
            }
            base = base * 1024;
        }
    }
    return buf;
}
2332
2333char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
2334{
2335 char buf1[128], date_buf[128], clock_buf[128];
3b9f94e1
FB
2336#ifdef _WIN32
2337 struct tm *ptm;
2338#else
faea38e7 2339 struct tm tm;
3b9f94e1 2340#endif
faea38e7
FB
2341 time_t ti;
2342 int64_t secs;
2343
2344 if (!sn) {
5fafdf24
TS
2345 snprintf(buf, buf_size,
2346 "%-10s%-20s%7s%20s%15s",
faea38e7
FB
2347 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
2348 } else {
2349 ti = sn->date_sec;
3b9f94e1
FB
2350#ifdef _WIN32
2351 ptm = localtime(&ti);
2352 strftime(date_buf, sizeof(date_buf),
2353 "%Y-%m-%d %H:%M:%S", ptm);
2354#else
faea38e7
FB
2355 localtime_r(&ti, &tm);
2356 strftime(date_buf, sizeof(date_buf),
2357 "%Y-%m-%d %H:%M:%S", &tm);
3b9f94e1 2358#endif
faea38e7
FB
2359 secs = sn->vm_clock_nsec / 1000000000;
2360 snprintf(clock_buf, sizeof(clock_buf),
2361 "%02d:%02d:%02d.%03d",
2362 (int)(secs / 3600),
2363 (int)((secs / 60) % 60),
5fafdf24 2364 (int)(secs % 60),
faea38e7
FB
2365 (int)((sn->vm_clock_nsec / 1000000) % 1000));
2366 snprintf(buf, buf_size,
5fafdf24 2367 "%-10s%-20s%7s%20s%15s",
faea38e7
FB
2368 sn->id_str, sn->name,
2369 get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
2370 date_buf,
2371 clock_buf);
2372 }
2373 return buf;
2374}
2375
ea2384d3 2376/**************************************************************/
83f64091 2377/* async I/Os */
ea2384d3 2378
3b69e4b9 2379BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
f141eafe 2380 QEMUIOVector *qiov, int nb_sectors,
3b69e4b9 2381 BlockDriverCompletionFunc *cb, void *opaque)
83f64091 2382{
bbf0a440
SH
2383 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
2384
b2a61371 2385 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
8c5873d6 2386 cb, opaque, false);
ea2384d3
FB
2387}
2388
f141eafe
AL
2389BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
2390 QEMUIOVector *qiov, int nb_sectors,
2391 BlockDriverCompletionFunc *cb, void *opaque)
ea2384d3 2392{
bbf0a440
SH
2393 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
2394
1a6e115b 2395 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
8c5873d6 2396 cb, opaque, true);
83f64091
FB
2397}
2398
40b4f539
KW
2399
2400typedef struct MultiwriteCB {
2401 int error;
2402 int num_requests;
2403 int num_callbacks;
2404 struct {
2405 BlockDriverCompletionFunc *cb;
2406 void *opaque;
2407 QEMUIOVector *free_qiov;
2408 void *free_buf;
2409 } callbacks[];
2410} MultiwriteCB;
2411
2412static void multiwrite_user_cb(MultiwriteCB *mcb)
2413{
2414 int i;
2415
2416 for (i = 0; i < mcb->num_callbacks; i++) {
2417 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
1e1ea48d
SH
2418 if (mcb->callbacks[i].free_qiov) {
2419 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
2420 }
7267c094 2421 g_free(mcb->callbacks[i].free_qiov);
f8a83245 2422 qemu_vfree(mcb->callbacks[i].free_buf);
40b4f539
KW
2423 }
2424}
2425
2426static void multiwrite_cb(void *opaque, int ret)
2427{
2428 MultiwriteCB *mcb = opaque;
2429
6d519a5f
SH
2430 trace_multiwrite_cb(mcb, ret);
2431
cb6d3ca0 2432 if (ret < 0 && !mcb->error) {
40b4f539 2433 mcb->error = ret;
40b4f539
KW
2434 }
2435
2436 mcb->num_requests--;
2437 if (mcb->num_requests == 0) {
de189a1b 2438 multiwrite_user_cb(mcb);
7267c094 2439 g_free(mcb);
40b4f539
KW
2440 }
2441}
2442
2443static int multiwrite_req_compare(const void *a, const void *b)
2444{
77be4366
CH
2445 const BlockRequest *req1 = a, *req2 = b;
2446
2447 /*
2448 * Note that we can't simply subtract req2->sector from req1->sector
2449 * here as that could overflow the return value.
2450 */
2451 if (req1->sector > req2->sector) {
2452 return 1;
2453 } else if (req1->sector < req2->sector) {
2454 return -1;
2455 } else {
2456 return 0;
2457 }
40b4f539
KW
2458}
2459
2460/*
2461 * Takes a bunch of requests and tries to merge them. Returns the number of
2462 * requests that remain after merging.
2463 */
2464static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
2465 int num_reqs, MultiwriteCB *mcb)
2466{
2467 int i, outidx;
2468
2469 // Sort requests by start sector
2470 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
2471
2472 // Check if adjacent requests touch the same clusters. If so, combine them,
2473 // filling up gaps with zero sectors.
2474 outidx = 0;
2475 for (i = 1; i < num_reqs; i++) {
2476 int merge = 0;
2477 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
2478
2479 // This handles the cases that are valid for all block drivers, namely
2480 // exactly sequential writes and overlapping writes.
2481 if (reqs[i].sector <= oldreq_last) {
2482 merge = 1;
2483 }
2484
2485 // The block driver may decide that it makes sense to combine requests
2486 // even if there is a gap of some sectors between them. In this case,
2487 // the gap is filled with zeros (therefore only applicable for yet
2488 // unused space in format like qcow2).
2489 if (!merge && bs->drv->bdrv_merge_requests) {
2490 merge = bs->drv->bdrv_merge_requests(bs, &reqs[outidx], &reqs[i]);
2491 }
2492
e2a305fb
CH
2493 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
2494 merge = 0;
2495 }
2496
40b4f539
KW
2497 if (merge) {
2498 size_t size;
7267c094 2499 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
40b4f539
KW
2500 qemu_iovec_init(qiov,
2501 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
2502
2503 // Add the first request to the merged one. If the requests are
2504 // overlapping, drop the last sectors of the first request.
2505 size = (reqs[i].sector - reqs[outidx].sector) << 9;
2506 qemu_iovec_concat(qiov, reqs[outidx].qiov, size);
2507
2508 // We might need to add some zeros between the two requests
2509 if (reqs[i].sector > oldreq_last) {
2510 size_t zero_bytes = (reqs[i].sector - oldreq_last) << 9;
2511 uint8_t *buf = qemu_blockalign(bs, zero_bytes);
2512 memset(buf, 0, zero_bytes);
2513 qemu_iovec_add(qiov, buf, zero_bytes);
2514 mcb->callbacks[i].free_buf = buf;
2515 }
2516
2517 // Add the second request
2518 qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);
2519
cbf1dff2 2520 reqs[outidx].nb_sectors = qiov->size >> 9;
40b4f539
KW
2521 reqs[outidx].qiov = qiov;
2522
2523 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
2524 } else {
2525 outidx++;
2526 reqs[outidx].sector = reqs[i].sector;
2527 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
2528 reqs[outidx].qiov = reqs[i].qiov;
2529 }
2530 }
2531
2532 return outidx + 1;
2533}
2534
2535/*
2536 * Submit multiple AIO write requests at once.
2537 *
2538 * On success, the function returns 0 and all requests in the reqs array have
2539 * been submitted. In error case this function returns -1, and any of the
2540 * requests may or may not be submitted yet. In particular, this means that the
2541 * callback will be called for some of the requests, for others it won't. The
2542 * caller must check the error field of the BlockRequest to wait for the right
2543 * callbacks (if error != 0, no callback will be called).
2544 *
2545 * The implementation may modify the contents of the reqs array, e.g. to merge
2546 * requests. However, the fields opaque and error are left unmodified as they
2547 * are used to signal failure for a single request to the caller.
2548 */
2549int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
2550{
2551 BlockDriverAIOCB *acb;
2552 MultiwriteCB *mcb;
2553 int i;
2554
301db7c2
RH
2555 /* don't submit writes if we don't have a medium */
2556 if (bs->drv == NULL) {
2557 for (i = 0; i < num_reqs; i++) {
2558 reqs[i].error = -ENOMEDIUM;
2559 }
2560 return -1;
2561 }
2562
40b4f539
KW
2563 if (num_reqs == 0) {
2564 return 0;
2565 }
2566
2567 // Create MultiwriteCB structure
7267c094 2568 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
40b4f539
KW
2569 mcb->num_requests = 0;
2570 mcb->num_callbacks = num_reqs;
2571
2572 for (i = 0; i < num_reqs; i++) {
2573 mcb->callbacks[i].cb = reqs[i].cb;
2574 mcb->callbacks[i].opaque = reqs[i].opaque;
2575 }
2576
2577 // Check for mergable requests
2578 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
2579
6d519a5f
SH
2580 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
2581
453f9a16
KW
2582 /*
2583 * Run the aio requests. As soon as one request can't be submitted
2584 * successfully, fail all requests that are not yet submitted (we must
2585 * return failure for all requests anyway)
2586 *
2587 * num_requests cannot be set to the right value immediately: If
2588 * bdrv_aio_writev fails for some request, num_requests would be too high
2589 * and therefore multiwrite_cb() would never recognize the multiwrite
2590 * request as completed. We also cannot use the loop variable i to set it
2591 * when the first request fails because the callback may already have been
2592 * called for previously submitted requests. Thus, num_requests must be
2593 * incremented for each request that is submitted.
2594 *
2595 * The problem that callbacks may be called early also means that we need
2596 * to take care that num_requests doesn't become 0 before all requests are
2597 * submitted - multiwrite_cb() would consider the multiwrite request
2598 * completed. A dummy request that is "completed" by a manual call to
2599 * multiwrite_cb() takes care of this.
2600 */
2601 mcb->num_requests = 1;
2602
6d519a5f 2603 // Run the aio requests
40b4f539 2604 for (i = 0; i < num_reqs; i++) {
453f9a16 2605 mcb->num_requests++;
40b4f539
KW
2606 acb = bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
2607 reqs[i].nb_sectors, multiwrite_cb, mcb);
2608
2609 if (acb == NULL) {
2610 // We can only fail the whole thing if no request has been
2611 // submitted yet. Otherwise we'll wait for the submitted AIOs to
2612 // complete and report the error in the callback.
453f9a16 2613 if (i == 0) {
6d519a5f 2614 trace_bdrv_aio_multiwrite_earlyfail(mcb);
40b4f539
KW
2615 goto fail;
2616 } else {
6d519a5f 2617 trace_bdrv_aio_multiwrite_latefail(mcb, i);
7eb58a6c 2618 multiwrite_cb(mcb, -EIO);
40b4f539
KW
2619 break;
2620 }
40b4f539
KW
2621 }
2622 }
2623
453f9a16
KW
2624 /* Complete the dummy request */
2625 multiwrite_cb(mcb, 0);
2626
40b4f539
KW
2627 return 0;
2628
2629fail:
453f9a16
KW
2630 for (i = 0; i < mcb->num_callbacks; i++) {
2631 reqs[i].error = -EIO;
2632 }
7267c094 2633 g_free(mcb);
40b4f539
KW
2634 return -1;
2635}
2636
83f64091 2637void bdrv_aio_cancel(BlockDriverAIOCB *acb)
83f64091 2638{
6bbff9a0 2639 acb->pool->cancel(acb);
83f64091
FB
2640}
2641
98f90dba
ZYW
2642/* block I/O throttling */
2643static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
2644 bool is_write, double elapsed_time, uint64_t *wait)
2645{
2646 uint64_t bps_limit = 0;
2647 double bytes_limit, bytes_base, bytes_res;
2648 double slice_time, wait_time;
2649
2650 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
2651 bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
2652 } else if (bs->io_limits.bps[is_write]) {
2653 bps_limit = bs->io_limits.bps[is_write];
2654 } else {
2655 if (wait) {
2656 *wait = 0;
2657 }
2658
2659 return false;
2660 }
2661
2662 slice_time = bs->slice_end - bs->slice_start;
2663 slice_time /= (NANOSECONDS_PER_SECOND);
2664 bytes_limit = bps_limit * slice_time;
2665 bytes_base = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
2666 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
2667 bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
2668 }
2669
2670 /* bytes_base: the bytes of data which have been read/written; and
2671 * it is obtained from the history statistic info.
2672 * bytes_res: the remaining bytes of data which need to be read/written.
2673 * (bytes_base + bytes_res) / bps_limit: used to calcuate
2674 * the total time for completing reading/writting all data.
2675 */
2676 bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
2677
2678 if (bytes_base + bytes_res <= bytes_limit) {
2679 if (wait) {
2680 *wait = 0;
2681 }
2682
2683 return false;
2684 }
2685
2686 /* Calc approx time to dispatch */
2687 wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
2688
2689 /* When the I/O rate at runtime exceeds the limits,
2690 * bs->slice_end need to be extended in order that the current statistic
2691 * info can be kept until the timer fire, so it is increased and tuned
2692 * based on the result of experiment.
2693 */
2694 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
2695 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
2696 if (wait) {
2697 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
2698 }
2699
2700 return true;
2701}
2702
2703static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
2704 double elapsed_time, uint64_t *wait)
2705{
2706 uint64_t iops_limit = 0;
2707 double ios_limit, ios_base;
2708 double slice_time, wait_time;
2709
2710 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
2711 iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
2712 } else if (bs->io_limits.iops[is_write]) {
2713 iops_limit = bs->io_limits.iops[is_write];
2714 } else {
2715 if (wait) {
2716 *wait = 0;
2717 }
2718
2719 return false;
2720 }
2721
2722 slice_time = bs->slice_end - bs->slice_start;
2723 slice_time /= (NANOSECONDS_PER_SECOND);
2724 ios_limit = iops_limit * slice_time;
2725 ios_base = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
2726 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
2727 ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
2728 }
2729
2730 if (ios_base + 1 <= ios_limit) {
2731 if (wait) {
2732 *wait = 0;
2733 }
2734
2735 return false;
2736 }
2737
2738 /* Calc approx time to dispatch */
2739 wait_time = (ios_base + 1) / iops_limit;
2740 if (wait_time > elapsed_time) {
2741 wait_time = wait_time - elapsed_time;
2742 } else {
2743 wait_time = 0;
2744 }
2745
2746 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
2747 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
2748 if (wait) {
2749 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
2750 }
2751
2752 return true;
2753}
2754
2755static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
2756 bool is_write, int64_t *wait)
2757{
2758 int64_t now, max_wait;
2759 uint64_t bps_wait = 0, iops_wait = 0;
2760 double elapsed_time;
2761 int bps_ret, iops_ret;
2762
2763 now = qemu_get_clock_ns(vm_clock);
2764 if ((bs->slice_start < now)
2765 && (bs->slice_end > now)) {
2766 bs->slice_end = now + bs->slice_time;
2767 } else {
2768 bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
2769 bs->slice_start = now;
2770 bs->slice_end = now + bs->slice_time;
2771
2772 bs->io_base.bytes[is_write] = bs->nr_bytes[is_write];
2773 bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];
2774
2775 bs->io_base.ios[is_write] = bs->nr_ops[is_write];
2776 bs->io_base.ios[!is_write] = bs->nr_ops[!is_write];
2777 }
2778
2779 elapsed_time = now - bs->slice_start;
2780 elapsed_time /= (NANOSECONDS_PER_SECOND);
2781
2782 bps_ret = bdrv_exceed_bps_limits(bs, nb_sectors,
2783 is_write, elapsed_time, &bps_wait);
2784 iops_ret = bdrv_exceed_iops_limits(bs, is_write,
2785 elapsed_time, &iops_wait);
2786 if (bps_ret || iops_ret) {
2787 max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
2788 if (wait) {
2789 *wait = max_wait;
2790 }
2791
2792 now = qemu_get_clock_ns(vm_clock);
2793 if (bs->slice_end < now + max_wait) {
2794 bs->slice_end = now + max_wait;
2795 }
2796
2797 return true;
2798 }
2799
2800 if (wait) {
2801 *wait = 0;
2802 }
2803
2804 return false;
2805}
ce1a14dc 2806
83f64091
FB
2807/**************************************************************/
2808/* async block device emulation */
2809
c16b5a2c
CH
2810typedef struct BlockDriverAIOCBSync {
2811 BlockDriverAIOCB common;
2812 QEMUBH *bh;
2813 int ret;
2814 /* vector translation state */
2815 QEMUIOVector *qiov;
2816 uint8_t *bounce;
2817 int is_write;
2818} BlockDriverAIOCBSync;
2819
2820static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
2821{
b666d239
KW
2822 BlockDriverAIOCBSync *acb =
2823 container_of(blockacb, BlockDriverAIOCBSync, common);
6a7ad299 2824 qemu_bh_delete(acb->bh);
36afc451 2825 acb->bh = NULL;
c16b5a2c
CH
2826 qemu_aio_release(acb);
2827}
2828
2829static AIOPool bdrv_em_aio_pool = {
2830 .aiocb_size = sizeof(BlockDriverAIOCBSync),
2831 .cancel = bdrv_aio_cancel_em,
2832};
2833
ce1a14dc 2834static void bdrv_aio_bh_cb(void *opaque)
83f64091 2835{
ce1a14dc 2836 BlockDriverAIOCBSync *acb = opaque;
f141eafe 2837
f141eafe
AL
2838 if (!acb->is_write)
2839 qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
ceb42de8 2840 qemu_vfree(acb->bounce);
ce1a14dc 2841 acb->common.cb(acb->common.opaque, acb->ret);
6a7ad299 2842 qemu_bh_delete(acb->bh);
36afc451 2843 acb->bh = NULL;
ce1a14dc 2844 qemu_aio_release(acb);
83f64091 2845}
beac80cd 2846
f141eafe
AL
2847static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
2848 int64_t sector_num,
2849 QEMUIOVector *qiov,
2850 int nb_sectors,
2851 BlockDriverCompletionFunc *cb,
2852 void *opaque,
2853 int is_write)
2854
83f64091 2855{
ce1a14dc 2856 BlockDriverAIOCBSync *acb;
ce1a14dc 2857
c16b5a2c 2858 acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
f141eafe
AL
2859 acb->is_write = is_write;
2860 acb->qiov = qiov;
e268ca52 2861 acb->bounce = qemu_blockalign(bs, qiov->size);
f141eafe 2862
ce1a14dc
PB
2863 if (!acb->bh)
2864 acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
f141eafe
AL
2865
2866 if (is_write) {
2867 qemu_iovec_to_buffer(acb->qiov, acb->bounce);
1ed20acf 2868 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
f141eafe 2869 } else {
1ed20acf 2870 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
f141eafe
AL
2871 }
2872
ce1a14dc 2873 qemu_bh_schedule(acb->bh);
f141eafe 2874
ce1a14dc 2875 return &acb->common;
beac80cd
FB
2876}
2877
f141eafe
AL
2878static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
2879 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
ce1a14dc 2880 BlockDriverCompletionFunc *cb, void *opaque)
beac80cd 2881{
f141eafe
AL
2882 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
2883}
83f64091 2884
f141eafe
AL
2885static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
2886 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
2887 BlockDriverCompletionFunc *cb, void *opaque)
2888{
2889 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
beac80cd 2890}
beac80cd 2891
68485420
KW
2892
2893typedef struct BlockDriverAIOCBCoroutine {
2894 BlockDriverAIOCB common;
2895 BlockRequest req;
2896 bool is_write;
2897 QEMUBH* bh;
2898} BlockDriverAIOCBCoroutine;
2899
2900static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
2901{
2902 qemu_aio_flush();
2903}
2904
2905static AIOPool bdrv_em_co_aio_pool = {
2906 .aiocb_size = sizeof(BlockDriverAIOCBCoroutine),
2907 .cancel = bdrv_aio_co_cancel_em,
2908};
2909
35246a68 2910static void bdrv_co_em_bh(void *opaque)
68485420
KW
2911{
2912 BlockDriverAIOCBCoroutine *acb = opaque;
2913
2914 acb->common.cb(acb->common.opaque, acb->req.error);
2915 qemu_bh_delete(acb->bh);
2916 qemu_aio_release(acb);
2917}
2918
b2a61371
SH
2919/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
2920static void coroutine_fn bdrv_co_do_rw(void *opaque)
2921{
2922 BlockDriverAIOCBCoroutine *acb = opaque;
2923 BlockDriverState *bs = acb->common.bs;
2924
2925 if (!acb->is_write) {
2926 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
2927 acb->req.nb_sectors, acb->req.qiov);
2928 } else {
2929 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
2930 acb->req.nb_sectors, acb->req.qiov);
2931 }
2932
35246a68 2933 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
b2a61371
SH
2934 qemu_bh_schedule(acb->bh);
2935}
2936
68485420
KW
2937static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
2938 int64_t sector_num,
2939 QEMUIOVector *qiov,
2940 int nb_sectors,
2941 BlockDriverCompletionFunc *cb,
2942 void *opaque,
8c5873d6 2943 bool is_write)
68485420
KW
2944{
2945 Coroutine *co;
2946 BlockDriverAIOCBCoroutine *acb;
2947
2948 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
2949 acb->req.sector = sector_num;
2950 acb->req.nb_sectors = nb_sectors;
2951 acb->req.qiov = qiov;
2952 acb->is_write = is_write;
2953
8c5873d6 2954 co = qemu_coroutine_create(bdrv_co_do_rw);
68485420
KW
2955 qemu_coroutine_enter(co, acb);
2956
2957 return &acb->common;
2958}
2959
07f07615 2960static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
b2e12bc6 2961{
07f07615
PB
2962 BlockDriverAIOCBCoroutine *acb = opaque;
2963 BlockDriverState *bs = acb->common.bs;
b2e12bc6 2964
07f07615
PB
2965 acb->req.error = bdrv_co_flush(bs);
2966 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
b2e12bc6 2967 qemu_bh_schedule(acb->bh);
b2e12bc6
CH
2968}
2969
07f07615 2970BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
016f5cf6
AG
2971 BlockDriverCompletionFunc *cb, void *opaque)
2972{
07f07615 2973 trace_bdrv_aio_flush(bs, opaque);
016f5cf6 2974
07f07615
PB
2975 Coroutine *co;
2976 BlockDriverAIOCBCoroutine *acb;
016f5cf6 2977
07f07615
PB
2978 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
2979 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
2980 qemu_coroutine_enter(co, acb);
016f5cf6 2981
016f5cf6
AG
2982 return &acb->common;
2983}
2984
4265d620
PB
2985static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
2986{
2987 BlockDriverAIOCBCoroutine *acb = opaque;
2988 BlockDriverState *bs = acb->common.bs;
2989
2990 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
2991 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
2992 qemu_bh_schedule(acb->bh);
2993}
2994
2995BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
2996 int64_t sector_num, int nb_sectors,
2997 BlockDriverCompletionFunc *cb, void *opaque)
2998{
2999 Coroutine *co;
3000 BlockDriverAIOCBCoroutine *acb;
3001
3002 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
3003
3004 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3005 acb->req.sector = sector_num;
3006 acb->req.nb_sectors = nb_sectors;
3007 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
3008 qemu_coroutine_enter(co, acb);
3009
3010 return &acb->common;
3011}
3012
ea2384d3
FB
3013void bdrv_init(void)
3014{
5efa9d5a 3015 module_call_init(MODULE_INIT_BLOCK);
ea2384d3 3016}
ce1a14dc 3017
eb852011
MA
3018void bdrv_init_with_whitelist(void)
3019{
3020 use_bdrv_whitelist = 1;
3021 bdrv_init();
3022}
3023
c16b5a2c
CH
3024void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
3025 BlockDriverCompletionFunc *cb, void *opaque)
ce1a14dc 3026{
ce1a14dc
PB
3027 BlockDriverAIOCB *acb;
3028
6bbff9a0
AL
3029 if (pool->free_aiocb) {
3030 acb = pool->free_aiocb;
3031 pool->free_aiocb = acb->next;
ce1a14dc 3032 } else {
7267c094 3033 acb = g_malloc0(pool->aiocb_size);
6bbff9a0 3034 acb->pool = pool;
ce1a14dc
PB
3035 }
3036 acb->bs = bs;
3037 acb->cb = cb;
3038 acb->opaque = opaque;
3039 return acb;
3040}
3041
3042void qemu_aio_release(void *p)
3043{
6bbff9a0
AL
3044 BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
3045 AIOPool *pool = acb->pool;
3046 acb->next = pool->free_aiocb;
3047 pool->free_aiocb = acb;
ce1a14dc 3048}
19cb3738 3049
f9f05dc5
KW
3050/**************************************************************/
3051/* Coroutine block device emulation */
3052
3053typedef struct CoroutineIOCompletion {
3054 Coroutine *coroutine;
3055 int ret;
3056} CoroutineIOCompletion;
3057
3058static void bdrv_co_io_em_complete(void *opaque, int ret)
3059{
3060 CoroutineIOCompletion *co = opaque;
3061
3062 co->ret = ret;
3063 qemu_coroutine_enter(co->coroutine, NULL);
3064}
3065
3066static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
3067 int nb_sectors, QEMUIOVector *iov,
3068 bool is_write)
3069{
3070 CoroutineIOCompletion co = {
3071 .coroutine = qemu_coroutine_self(),
3072 };
3073 BlockDriverAIOCB *acb;
3074
3075 if (is_write) {
a652d160
SH
3076 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
3077 bdrv_co_io_em_complete, &co);
f9f05dc5 3078 } else {
a652d160
SH
3079 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
3080 bdrv_co_io_em_complete, &co);
f9f05dc5
KW
3081 }
3082
59370aaa 3083 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
f9f05dc5
KW
3084 if (!acb) {
3085 return -EIO;
3086 }
3087 qemu_coroutine_yield();
3088
3089 return co.ret;
3090}
3091
3092static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
3093 int64_t sector_num, int nb_sectors,
3094 QEMUIOVector *iov)
3095{
3096 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
3097}
3098
3099static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
3100 int64_t sector_num, int nb_sectors,
3101 QEMUIOVector *iov)
3102{
3103 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
3104}
3105
07f07615 3106static void coroutine_fn bdrv_flush_co_entry(void *opaque)
e7a8a783 3107{
07f07615
PB
3108 RwCo *rwco = opaque;
3109
3110 rwco->ret = bdrv_co_flush(rwco->bs);
3111}
3112
3113int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
3114{
eb489bb1
KW
3115 int ret;
3116
ca716364 3117 if (!bs->drv) {
07f07615 3118 return 0;
eb489bb1
KW
3119 }
3120
ca716364 3121 /* Write back cached data to the OS even with cache=unsafe */
eb489bb1
KW
3122 if (bs->drv->bdrv_co_flush_to_os) {
3123 ret = bs->drv->bdrv_co_flush_to_os(bs);
3124 if (ret < 0) {
3125 return ret;
3126 }
3127 }
3128
ca716364
KW
3129 /* But don't actually force it to the disk with cache=unsafe */
3130 if (bs->open_flags & BDRV_O_NO_FLUSH) {
3131 return 0;
3132 }
3133
eb489bb1 3134 if (bs->drv->bdrv_co_flush_to_disk) {
c68b89ac 3135 return bs->drv->bdrv_co_flush_to_disk(bs);
07f07615
PB
3136 } else if (bs->drv->bdrv_aio_flush) {
3137 BlockDriverAIOCB *acb;
3138 CoroutineIOCompletion co = {
3139 .coroutine = qemu_coroutine_self(),
3140 };
3141
3142 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
3143 if (acb == NULL) {
3144 return -EIO;
3145 } else {
3146 qemu_coroutine_yield();
3147 return co.ret;
3148 }
07f07615
PB
3149 } else {
3150 /*
3151 * Some block drivers always operate in either writethrough or unsafe
3152 * mode and don't support bdrv_flush therefore. Usually qemu doesn't
3153 * know how the server works (because the behaviour is hardcoded or
3154 * depends on server-side configuration), so we can't ensure that
3155 * everything is safe on disk. Returning an error doesn't work because
3156 * that would break guests even if the server operates in writethrough
3157 * mode.
3158 *
3159 * Let's hope the user knows what he's doing.
3160 */
3161 return 0;
3162 }
3163}
3164
0f15423c
AL
3165void bdrv_invalidate_cache(BlockDriverState *bs)
3166{
3167 if (bs->drv && bs->drv->bdrv_invalidate_cache) {
3168 bs->drv->bdrv_invalidate_cache(bs);
3169 }
3170}
3171
3172void bdrv_invalidate_cache_all(void)
3173{
3174 BlockDriverState *bs;
3175
3176 QTAILQ_FOREACH(bs, &bdrv_states, list) {
3177 bdrv_invalidate_cache(bs);
3178 }
3179}
3180
07f07615
PB
3181int bdrv_flush(BlockDriverState *bs)
3182{
3183 Coroutine *co;
3184 RwCo rwco = {
3185 .bs = bs,
3186 .ret = NOT_DONE,
e7a8a783 3187 };
e7a8a783 3188
07f07615
PB
3189 if (qemu_in_coroutine()) {
3190 /* Fast-path if already in coroutine context */
3191 bdrv_flush_co_entry(&rwco);
3192 } else {
3193 co = qemu_coroutine_create(bdrv_flush_co_entry);
3194 qemu_coroutine_enter(co, &rwco);
3195 while (rwco.ret == NOT_DONE) {
3196 qemu_aio_wait();
3197 }
e7a8a783 3198 }
07f07615
PB
3199
3200 return rwco.ret;
e7a8a783
KW
3201}
3202
4265d620
PB
3203static void coroutine_fn bdrv_discard_co_entry(void *opaque)
3204{
3205 RwCo *rwco = opaque;
3206
3207 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
3208}
3209
3210int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
3211 int nb_sectors)
3212{
3213 if (!bs->drv) {
3214 return -ENOMEDIUM;
3215 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
3216 return -EIO;
3217 } else if (bs->read_only) {
3218 return -EROFS;
3219 } else if (bs->drv->bdrv_co_discard) {
3220 return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
3221 } else if (bs->drv->bdrv_aio_discard) {
3222 BlockDriverAIOCB *acb;
3223 CoroutineIOCompletion co = {
3224 .coroutine = qemu_coroutine_self(),
3225 };
3226
3227 acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
3228 bdrv_co_io_em_complete, &co);
3229 if (acb == NULL) {
3230 return -EIO;
3231 } else {
3232 qemu_coroutine_yield();
3233 return co.ret;
3234 }
4265d620
PB
3235 } else {
3236 return 0;
3237 }
3238}
3239
3240int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
3241{
3242 Coroutine *co;
3243 RwCo rwco = {
3244 .bs = bs,
3245 .sector_num = sector_num,
3246 .nb_sectors = nb_sectors,
3247 .ret = NOT_DONE,
3248 };
3249
3250 if (qemu_in_coroutine()) {
3251 /* Fast-path if already in coroutine context */
3252 bdrv_discard_co_entry(&rwco);
3253 } else {
3254 co = qemu_coroutine_create(bdrv_discard_co_entry);
3255 qemu_coroutine_enter(co, &rwco);
3256 while (rwco.ret == NOT_DONE) {
3257 qemu_aio_wait();
3258 }
3259 }
3260
3261 return rwco.ret;
3262}
3263
19cb3738
FB
3264/**************************************************************/
3265/* removable device support */
3266
3267/**
3268 * Return TRUE if the media is present
3269 */
3270int bdrv_is_inserted(BlockDriverState *bs)
3271{
3272 BlockDriver *drv = bs->drv;
a1aff5bf 3273
19cb3738
FB
3274 if (!drv)
3275 return 0;
3276 if (!drv->bdrv_is_inserted)
a1aff5bf
MA
3277 return 1;
3278 return drv->bdrv_is_inserted(bs);
19cb3738
FB
3279}
3280
3281/**
8e49ca46
MA
3282 * Return whether the media changed since the last call to this
3283 * function, or -ENOTSUP if we don't know. Most drivers don't know.
19cb3738
FB
3284 */
3285int bdrv_media_changed(BlockDriverState *bs)
3286{
3287 BlockDriver *drv = bs->drv;
19cb3738 3288
8e49ca46
MA
3289 if (drv && drv->bdrv_media_changed) {
3290 return drv->bdrv_media_changed(bs);
3291 }
3292 return -ENOTSUP;
19cb3738
FB
3293}
3294
3295/**
3296 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
3297 */
fdec4404 3298void bdrv_eject(BlockDriverState *bs, int eject_flag)
19cb3738
FB
3299{
3300 BlockDriver *drv = bs->drv;
19cb3738 3301
822e1cd1
MA
3302 if (drv && drv->bdrv_eject) {
3303 drv->bdrv_eject(bs, eject_flag);
19cb3738
FB
3304 }
3305}
3306
19cb3738
FB
3307/**
3308 * Lock or unlock the media (if it is locked, the user won't be able
3309 * to eject it manually).
3310 */
025e849a 3311void bdrv_lock_medium(BlockDriverState *bs, bool locked)
19cb3738
FB
3312{
3313 BlockDriver *drv = bs->drv;
3314
025e849a 3315 trace_bdrv_lock_medium(bs, locked);
b8c6d095 3316
025e849a
MA
3317 if (drv && drv->bdrv_lock_medium) {
3318 drv->bdrv_lock_medium(bs, locked);
19cb3738
FB
3319 }
3320}
985a03b0
TS
3321
3322/* needed for generic scsi interface */
3323
3324int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
3325{
3326 BlockDriver *drv = bs->drv;
3327
3328 if (drv && drv->bdrv_ioctl)
3329 return drv->bdrv_ioctl(bs, req, buf);
3330 return -ENOTSUP;
3331}
7d780669 3332
221f715d
AL
3333BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
3334 unsigned long int req, void *buf,
3335 BlockDriverCompletionFunc *cb, void *opaque)
7d780669 3336{
221f715d 3337 BlockDriver *drv = bs->drv;
7d780669 3338
221f715d
AL
3339 if (drv && drv->bdrv_aio_ioctl)
3340 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
3341 return NULL;
7d780669 3342}
e268ca52 3343
7b6f9300
MA
3344void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
3345{
3346 bs->buffer_alignment = align;
3347}
7cd1e32a 3348
e268ca52
AL
3349void *qemu_blockalign(BlockDriverState *bs, size_t size)
3350{
3351 return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
3352}
7cd1e32a
LS
3353
3354void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
3355{
3356 int64_t bitmap_size;
a55eb92c 3357
aaa0eb75 3358 bs->dirty_count = 0;
a55eb92c 3359 if (enable) {
c6d22830
JK
3360 if (!bs->dirty_bitmap) {
3361 bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
3362 BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
3363 bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
a55eb92c 3364
7267c094 3365 bs->dirty_bitmap = g_malloc0(bitmap_size);
a55eb92c 3366 }
7cd1e32a 3367 } else {
c6d22830 3368 if (bs->dirty_bitmap) {
7267c094 3369 g_free(bs->dirty_bitmap);
c6d22830 3370 bs->dirty_bitmap = NULL;
a55eb92c 3371 }
7cd1e32a
LS
3372 }
3373}
3374
3375int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
3376{
6ea44308 3377 int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
a55eb92c 3378
c6d22830
JK
3379 if (bs->dirty_bitmap &&
3380 (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
6d59fec1
MT
3381 return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
3382 (1UL << (chunk % (sizeof(unsigned long) * 8))));
7cd1e32a
LS
3383 } else {
3384 return 0;
3385 }
3386}
3387
a55eb92c
JK
3388void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
3389 int nr_sectors)
7cd1e32a
LS
3390{
3391 set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
3392}
aaa0eb75
LS
3393
3394int64_t bdrv_get_dirty_count(BlockDriverState *bs)
3395{
3396 return bs->dirty_count;
3397}
f88e1a42 3398
db593f25
MT
3399void bdrv_set_in_use(BlockDriverState *bs, int in_use)
3400{
3401 assert(bs->in_use != in_use);
3402 bs->in_use = in_use;
3403}
3404
3405int bdrv_in_use(BlockDriverState *bs)
3406{
3407 return bs->in_use;
3408}
3409
28a7282a
LC
3410void bdrv_iostatus_enable(BlockDriverState *bs)
3411{
d6bf279e 3412 bs->iostatus_enabled = true;
58e21ef5 3413 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
28a7282a
LC
3414}
3415
3416/* The I/O status is only enabled if the drive explicitly
3417 * enables it _and_ the VM is configured to stop on errors */
3418bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
3419{
d6bf279e 3420 return (bs->iostatus_enabled &&
28a7282a
LC
3421 (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
3422 bs->on_write_error == BLOCK_ERR_STOP_ANY ||
3423 bs->on_read_error == BLOCK_ERR_STOP_ANY));
3424}
3425
3426void bdrv_iostatus_disable(BlockDriverState *bs)
3427{
d6bf279e 3428 bs->iostatus_enabled = false;
28a7282a
LC
3429}
3430
3431void bdrv_iostatus_reset(BlockDriverState *bs)
3432{
3433 if (bdrv_iostatus_is_enabled(bs)) {
58e21ef5 3434 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
28a7282a
LC
3435 }
3436}
3437
3438/* XXX: Today this is set by device models because it makes the implementation
3439 quite simple. However, the block layer knows about the error, so it's
3440 possible to implement this without device models being involved */
3441void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
3442{
58e21ef5
LC
3443 if (bdrv_iostatus_is_enabled(bs) &&
3444 bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
28a7282a 3445 assert(error >= 0);
58e21ef5
LC
3446 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
3447 BLOCK_DEVICE_IO_STATUS_FAILED;
28a7282a
LC
3448 }
3449}
3450
a597e79c
CH
3451void
3452bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
3453 enum BlockAcctType type)
3454{
3455 assert(type < BDRV_MAX_IOTYPE);
3456
3457 cookie->bytes = bytes;
c488c7f6 3458 cookie->start_time_ns = get_clock();
a597e79c
CH
3459 cookie->type = type;
3460}
3461
3462void
3463bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
3464{
3465 assert(cookie->type < BDRV_MAX_IOTYPE);
3466
3467 bs->nr_bytes[cookie->type] += cookie->bytes;
3468 bs->nr_ops[cookie->type]++;
c488c7f6 3469 bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
a597e79c
CH
3470}
3471
f88e1a42
JS
3472int bdrv_img_create(const char *filename, const char *fmt,
3473 const char *base_filename, const char *base_fmt,
3474 char *options, uint64_t img_size, int flags)
3475{
3476 QEMUOptionParameter *param = NULL, *create_options = NULL;
d220894e 3477 QEMUOptionParameter *backing_fmt, *backing_file, *size;
f88e1a42
JS
3478 BlockDriverState *bs = NULL;
3479 BlockDriver *drv, *proto_drv;
96df67d1 3480 BlockDriver *backing_drv = NULL;
f88e1a42
JS
3481 int ret = 0;
3482
3483 /* Find driver and parse its options */
3484 drv = bdrv_find_format(fmt);
3485 if (!drv) {
3486 error_report("Unknown file format '%s'", fmt);
4f70f249 3487 ret = -EINVAL;
f88e1a42
JS
3488 goto out;
3489 }
3490
3491 proto_drv = bdrv_find_protocol(filename);
3492 if (!proto_drv) {
3493 error_report("Unknown protocol '%s'", filename);
4f70f249 3494 ret = -EINVAL;
f88e1a42
JS
3495 goto out;
3496 }
3497
3498 create_options = append_option_parameters(create_options,
3499 drv->create_options);
3500 create_options = append_option_parameters(create_options,
3501 proto_drv->create_options);
3502
3503 /* Create parameter list with default values */
3504 param = parse_option_parameters("", create_options, param);
3505
3506 set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
3507
3508 /* Parse -o options */
3509 if (options) {
3510 param = parse_option_parameters(options, create_options, param);
3511 if (param == NULL) {
3512 error_report("Invalid options for file format '%s'.", fmt);
4f70f249 3513 ret = -EINVAL;
f88e1a42
JS
3514 goto out;
3515 }
3516 }
3517
3518 if (base_filename) {
3519 if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
3520 base_filename)) {
3521 error_report("Backing file not supported for file format '%s'",
3522 fmt);
4f70f249 3523 ret = -EINVAL;
f88e1a42
JS
3524 goto out;
3525 }
3526 }
3527
3528 if (base_fmt) {
3529 if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
3530 error_report("Backing file format not supported for file "
3531 "format '%s'", fmt);
4f70f249 3532 ret = -EINVAL;
f88e1a42
JS
3533 goto out;
3534 }
3535 }
3536
792da93a
JS
3537 backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
3538 if (backing_file && backing_file->value.s) {
3539 if (!strcmp(filename, backing_file->value.s)) {
3540 error_report("Error: Trying to create an image with the "
3541 "same filename as the backing file");
4f70f249 3542 ret = -EINVAL;
792da93a
JS
3543 goto out;
3544 }
3545 }
3546
f88e1a42
JS
3547 backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
3548 if (backing_fmt && backing_fmt->value.s) {
96df67d1
SH
3549 backing_drv = bdrv_find_format(backing_fmt->value.s);
3550 if (!backing_drv) {
f88e1a42
JS
3551 error_report("Unknown backing file format '%s'",
3552 backing_fmt->value.s);
4f70f249 3553 ret = -EINVAL;
f88e1a42
JS
3554 goto out;
3555 }
3556 }
3557
3558 // The size for the image must always be specified, with one exception:
3559 // If we are using a backing file, we can obtain the size from there
d220894e
KW
3560 size = get_option_parameter(param, BLOCK_OPT_SIZE);
3561 if (size && size->value.n == -1) {
f88e1a42
JS
3562 if (backing_file && backing_file->value.s) {
3563 uint64_t size;
f88e1a42
JS
3564 char buf[32];
3565
f88e1a42
JS
3566 bs = bdrv_new("");
3567
96df67d1 3568 ret = bdrv_open(bs, backing_file->value.s, flags, backing_drv);
f88e1a42 3569 if (ret < 0) {
96df67d1 3570 error_report("Could not open '%s'", backing_file->value.s);
f88e1a42
JS
3571 goto out;
3572 }
3573 bdrv_get_geometry(bs, &size);
3574 size *= 512;
3575
3576 snprintf(buf, sizeof(buf), "%" PRId64, size);
3577 set_option_parameter(param, BLOCK_OPT_SIZE, buf);
3578 } else {
3579 error_report("Image creation needs a size parameter");
4f70f249 3580 ret = -EINVAL;
f88e1a42
JS
3581 goto out;
3582 }
3583 }
3584
3585 printf("Formatting '%s', fmt=%s ", filename, fmt);
3586 print_option_parameters(param);
3587 puts("");
3588
3589 ret = bdrv_create(drv, filename, param);
3590
3591 if (ret < 0) {
3592 if (ret == -ENOTSUP) {
3593 error_report("Formatting or formatting option not supported for "
3594 "file format '%s'", fmt);
3595 } else if (ret == -EFBIG) {
3596 error_report("The image size is too large for file format '%s'",
3597 fmt);
3598 } else {
3599 error_report("%s: error while creating %s: %s", filename, fmt,
3600 strerror(-ret));
3601 }
3602 }
3603
3604out:
3605 free_option_parameters(create_options);
3606 free_option_parameters(param);
3607
3608 if (bs) {
3609 bdrv_delete(bs);
3610 }
4f70f249
JS
3611
3612 return ret;
f88e1a42 3613}