1 /*
2 * QEMU System Emulator block driver
3 *
4 * Copyright (c) 2003 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24 #include "config-host.h"
25 #include "qemu-common.h"
26 #include "trace.h"
27 #include "monitor.h"
28 #include "block_int.h"
29 #include "module.h"
30 #include "qjson.h"
31 #include "qemu-coroutine.h"
32 #include "qmp-commands.h"
33 #include "qemu-timer.h"
34
35 #ifdef CONFIG_BSD
36 #include <sys/types.h>
37 #include <sys/stat.h>
38 #include <sys/ioctl.h>
39 #include <sys/queue.h>
40 #ifndef __DragonFly__
41 #include <sys/disk.h>
42 #endif
43 #endif
44
45 #ifdef _WIN32
46 #include <windows.h>
47 #endif
48
49 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
50
51 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
52 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
53 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
54 BlockDriverCompletionFunc *cb, void *opaque);
55 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
56 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
57 BlockDriverCompletionFunc *cb, void *opaque);
58 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
59 int64_t sector_num, int nb_sectors,
60 QEMUIOVector *iov);
61 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
62 int64_t sector_num, int nb_sectors,
63 QEMUIOVector *iov);
64 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
65 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
66 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
67 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
68 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
69 int64_t sector_num,
70 QEMUIOVector *qiov,
71 int nb_sectors,
72 BlockDriverCompletionFunc *cb,
73 void *opaque,
74 bool is_write);
75 static void coroutine_fn bdrv_co_do_rw(void *opaque);
76
77 static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
78 bool is_write, double elapsed_time, uint64_t *wait);
79 static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
80 double elapsed_time, uint64_t *wait);
81 static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
82 bool is_write, int64_t *wait);
83
84 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
85 QTAILQ_HEAD_INITIALIZER(bdrv_states);
86
87 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
88 QLIST_HEAD_INITIALIZER(bdrv_drivers);
89
90 /* The device to use for VM snapshots */
91 static BlockDriverState *bs_snapshots;
92
93 /* If non-zero, use only whitelisted block drivers */
94 static int use_bdrv_whitelist;
95
96 #ifdef _WIN32
97 static int is_windows_drive_prefix(const char *filename)
98 {
99 return (((filename[0] >= 'a' && filename[0] <= 'z') ||
100 (filename[0] >= 'A' && filename[0] <= 'Z')) &&
101 filename[1] == ':');
102 }
103
104 int is_windows_drive(const char *filename)
105 {
106 if (is_windows_drive_prefix(filename) &&
107 filename[2] == '\0')
108 return 1;
109 if (strstart(filename, "\\\\.\\", NULL) ||
110 strstart(filename, "//./", NULL))
111 return 1;
112 return 0;
113 }
114 #endif
115
116 /* throttling disk I/O limits */
117 void bdrv_io_limits_disable(BlockDriverState *bs)
118 {
119 bs->io_limits_enabled = false;
120
121 while (qemu_co_queue_next(&bs->throttled_reqs));
122
123 if (bs->block_timer) {
124 qemu_del_timer(bs->block_timer);
125 qemu_free_timer(bs->block_timer);
126 bs->block_timer = NULL;
127 }
128
129 bs->slice_start = 0;
130 bs->slice_end = 0;
131 bs->slice_time = 0;
132 memset(&bs->io_base, 0, sizeof(bs->io_base));
133 }
134
135 static void bdrv_block_timer(void *opaque)
136 {
137 BlockDriverState *bs = opaque;
138
139 qemu_co_queue_next(&bs->throttled_reqs);
140 }
141
142 void bdrv_io_limits_enable(BlockDriverState *bs)
143 {
144 qemu_co_queue_init(&bs->throttled_reqs);
145 bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
146 bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
147 bs->slice_start = qemu_get_clock_ns(vm_clock);
148 bs->slice_end = bs->slice_start + bs->slice_time;
149 memset(&bs->io_base, 0, sizeof(bs->io_base));
150 bs->io_limits_enabled = true;
151 }
152
153 bool bdrv_io_limits_enabled(BlockDriverState *bs)
154 {
155 BlockIOLimit *io_limits = &bs->io_limits;
156 return io_limits->bps[BLOCK_IO_LIMIT_READ]
157 || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
158 || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
159 || io_limits->iops[BLOCK_IO_LIMIT_READ]
160 || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
161 || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
162 }
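/*
 * A minimal usage sketch: callers fill in a BlockIOLimit and hand it to
 * bdrv_set_io_limits() (defined further below), which derives
 * io_limits_enabled from the predicate above. The 1 MB/s total cap is an
 * arbitrary example value.
 *
 *     BlockIOLimit limits = {0};
 *     limits.bps[BLOCK_IO_LIMIT_TOTAL] = 1024 * 1024;
 *     bdrv_set_io_limits(bs, &limits);
 *     assert(bdrv_io_limits_enabled(bs));
 */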
163
164 static void bdrv_io_limits_intercept(BlockDriverState *bs,
165 bool is_write, int nb_sectors)
166 {
167 int64_t wait_time = -1;
168
169 if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
170 qemu_co_queue_wait(&bs->throttled_reqs);
171 }
172
173     /* Requests are kept in FIFO order. A throttled request is not dequeued
174      * until the request ahead of it has been allowed to proceed, so if the
175      * current request still exceeds the limits it is re-inserted at the
176      * head of the queue, and every request that arrived after it stays
177      * behind it in throttled_reqs.
178      */
179
180 while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
181 qemu_mod_timer(bs->block_timer,
182 wait_time + qemu_get_clock_ns(vm_clock));
183 qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
184 }
185
186 qemu_co_queue_next(&bs->throttled_reqs);
187 }
188
189 /* check if the path starts with "<protocol>:" */
190 static int path_has_protocol(const char *path)
191 {
192 #ifdef _WIN32
193 if (is_windows_drive(path) ||
194 is_windows_drive_prefix(path)) {
195 return 0;
196 }
197 #endif
198
199 return strchr(path, ':') != NULL;
200 }
201
202 int path_is_absolute(const char *path)
203 {
204 const char *p;
205 #ifdef _WIN32
206 /* specific case for names like: "\\.\d:" */
207 if (*path == '/' || *path == '\\')
208 return 1;
209 #endif
210 p = strchr(path, ':');
211 if (p)
212 p++;
213 else
214 p = path;
215 #ifdef _WIN32
216 return (*p == '/' || *p == '\\');
217 #else
218 return (*p == '/');
219 #endif
220 }
221
222 /* if filename is absolute, just copy it to dest. Otherwise, build a
223    path to it by treating it as relative to base_path. URLs are
224 supported. */
225 void path_combine(char *dest, int dest_size,
226 const char *base_path,
227 const char *filename)
228 {
229 const char *p, *p1;
230 int len;
231
232 if (dest_size <= 0)
233 return;
234 if (path_is_absolute(filename)) {
235 pstrcpy(dest, dest_size, filename);
236 } else {
237 p = strchr(base_path, ':');
238 if (p)
239 p++;
240 else
241 p = base_path;
242 p1 = strrchr(base_path, '/');
243 #ifdef _WIN32
244 {
245 const char *p2;
246 p2 = strrchr(base_path, '\\');
247 if (!p1 || p2 > p1)
248 p1 = p2;
249 }
250 #endif
251 if (p1)
252 p1++;
253 else
254 p1 = base_path;
255 if (p1 > p)
256 p = p1;
257 len = p - base_path;
258 if (len > dest_size - 1)
259 len = dest_size - 1;
260 memcpy(dest, base_path, len);
261 dest[len] = '\0';
262 pstrcat(dest, dest_size, filename);
263 }
264 }
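/*
 * A quick illustration of the rules above, assuming a POSIX host:
 *
 *     char dest[PATH_MAX];
 *     path_combine(dest, sizeof(dest), "/img/base.qcow2", "backing.raw");
 *     // dest is now "/img/backing.raw"
 *     path_combine(dest, sizeof(dest), "/img/base.qcow2", "/abs/other.raw");
 *     // an absolute filename is copied unchanged: "/abs/other.raw"
 */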
265
266 void bdrv_register(BlockDriver *bdrv)
267 {
268 /* Block drivers without coroutine functions need emulation */
269 if (!bdrv->bdrv_co_readv) {
270 bdrv->bdrv_co_readv = bdrv_co_readv_em;
271 bdrv->bdrv_co_writev = bdrv_co_writev_em;
272
273         /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
274 * the block driver lacks aio we need to emulate that too.
275 */
276 if (!bdrv->bdrv_aio_readv) {
277 /* add AIO emulation layer */
278 bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
279 bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
280 }
281 }
282
283 QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
284 }
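/*
 * A minimal registration sketch for a hypothetical format driver (the names
 * bdrv_mydrv, BDRVMyDrvState, mydrv_open and mydrv_co_readv are illustrative
 * only). A driver provides the callbacks it implements and the emulation
 * layers above fill in the rest:
 *
 *     static BlockDriver bdrv_mydrv = {
 *         .format_name   = "mydrv",
 *         .instance_size = sizeof(BDRVMyDrvState),
 *         .bdrv_open     = mydrv_open,
 *         .bdrv_co_readv = mydrv_co_readv,
 *     };
 *
 *     static void bdrv_mydrv_init(void)
 *     {
 *         bdrv_register(&bdrv_mydrv);
 *     }
 *     block_init(bdrv_mydrv_init);
 */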
285
286 /* create a new block device (by default it is empty) */
287 BlockDriverState *bdrv_new(const char *device_name)
288 {
289 BlockDriverState *bs;
290
291 bs = g_malloc0(sizeof(BlockDriverState));
292 pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
293 if (device_name[0] != '\0') {
294 QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
295 }
296 bdrv_iostatus_disable(bs);
297 return bs;
298 }
299
300 BlockDriver *bdrv_find_format(const char *format_name)
301 {
302 BlockDriver *drv1;
303 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
304 if (!strcmp(drv1->format_name, format_name)) {
305 return drv1;
306 }
307 }
308 return NULL;
309 }
310
311 static int bdrv_is_whitelisted(BlockDriver *drv)
312 {
313 static const char *whitelist[] = {
314 CONFIG_BDRV_WHITELIST
315 };
316 const char **p;
317
318 if (!whitelist[0])
319 return 1; /* no whitelist, anything goes */
320
321 for (p = whitelist; *p; p++) {
322 if (!strcmp(drv->format_name, *p)) {
323 return 1;
324 }
325 }
326 return 0;
327 }
328
329 BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
330 {
331 BlockDriver *drv = bdrv_find_format(format_name);
332 return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
333 }
334
335 int bdrv_create(BlockDriver *drv, const char* filename,
336 QEMUOptionParameter *options)
337 {
338 if (!drv->bdrv_create)
339 return -ENOTSUP;
340
341 return drv->bdrv_create(filename, options);
342 }
343
344 int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
345 {
346 BlockDriver *drv;
347
348 drv = bdrv_find_protocol(filename);
349 if (drv == NULL) {
350 return -ENOENT;
351 }
352
353 return bdrv_create(drv, filename, options);
354 }
355
356 #ifdef _WIN32
357 void get_tmp_filename(char *filename, int size)
358 {
359 char temp_dir[MAX_PATH];
360
361 GetTempPath(MAX_PATH, temp_dir);
362 GetTempFileName(temp_dir, "qem", 0, filename);
363 }
364 #else
365 void get_tmp_filename(char *filename, int size)
366 {
367 int fd;
368 const char *tmpdir;
369 /* XXX: race condition possible */
370 tmpdir = getenv("TMPDIR");
371 if (!tmpdir)
372 tmpdir = "/tmp";
373 snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);
374 fd = mkstemp(filename);
375 close(fd);
376 }
377 #endif
378
379 /*
380 * Detect host devices. By convention, /dev/cdrom[N] is always
381 * recognized as a host CDROM.
382 */
383 static BlockDriver *find_hdev_driver(const char *filename)
384 {
385 int score_max = 0, score;
386 BlockDriver *drv = NULL, *d;
387
388 QLIST_FOREACH(d, &bdrv_drivers, list) {
389 if (d->bdrv_probe_device) {
390 score = d->bdrv_probe_device(filename);
391 if (score > score_max) {
392 score_max = score;
393 drv = d;
394 }
395 }
396 }
397
398 return drv;
399 }
400
401 BlockDriver *bdrv_find_protocol(const char *filename)
402 {
403 BlockDriver *drv1;
404 char protocol[128];
405 int len;
406 const char *p;
407
408 /* TODO Drivers without bdrv_file_open must be specified explicitly */
409
410 /*
411 * XXX(hch): we really should not let host device detection
412 * override an explicit protocol specification, but moving this
413 * later breaks access to device names with colons in them.
414 * Thanks to the brain-dead persistent naming schemes on udev-
415 * based Linux systems those actually are quite common.
416 */
417 drv1 = find_hdev_driver(filename);
418 if (drv1) {
419 return drv1;
420 }
421
422 if (!path_has_protocol(filename)) {
423 return bdrv_find_format("file");
424 }
425 p = strchr(filename, ':');
426 assert(p != NULL);
427 len = p - filename;
428 if (len > sizeof(protocol) - 1)
429 len = sizeof(protocol) - 1;
430 memcpy(protocol, filename, len);
431 protocol[len] = '\0';
432 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
433 if (drv1->protocol_name &&
434 !strcmp(drv1->protocol_name, protocol)) {
435 return drv1;
436 }
437 }
438 return NULL;
439 }
440
441 static int find_image_format(const char *filename, BlockDriver **pdrv)
442 {
443 int ret, score, score_max;
444 BlockDriver *drv1, *drv;
445 uint8_t buf[2048];
446 BlockDriverState *bs;
447
448 ret = bdrv_file_open(&bs, filename, 0);
449 if (ret < 0) {
450 *pdrv = NULL;
451 return ret;
452 }
453
454     /* Return the raw BlockDriver for scsi-generic devices or empty drives */
455 if (bs->sg || !bdrv_is_inserted(bs)) {
456 bdrv_delete(bs);
457 drv = bdrv_find_format("raw");
458 if (!drv) {
459 ret = -ENOENT;
460 }
461 *pdrv = drv;
462 return ret;
463 }
464
465 ret = bdrv_pread(bs, 0, buf, sizeof(buf));
466 bdrv_delete(bs);
467 if (ret < 0) {
468 *pdrv = NULL;
469 return ret;
470 }
471
472 score_max = 0;
473 drv = NULL;
474 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
475 if (drv1->bdrv_probe) {
476 score = drv1->bdrv_probe(buf, ret, filename);
477 if (score > score_max) {
478 score_max = score;
479 drv = drv1;
480 }
481 }
482 }
483 if (!drv) {
484 ret = -ENOENT;
485 }
486 *pdrv = drv;
487 return ret;
488 }
489
490 /**
491 * Set the current 'total_sectors' value
492 */
493 static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
494 {
495 BlockDriver *drv = bs->drv;
496
497 /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
498 if (bs->sg)
499 return 0;
500
501 /* query actual device if possible, otherwise just trust the hint */
502 if (drv->bdrv_getlength) {
503 int64_t length = drv->bdrv_getlength(bs);
504 if (length < 0) {
505 return length;
506 }
507 hint = length >> BDRV_SECTOR_BITS;
508 }
509
510 bs->total_sectors = hint;
511 return 0;
512 }
513
514 /**
515 * Set open flags for a given cache mode
516 *
517 * Return 0 on success, -1 if the cache mode was invalid.
518 */
519 int bdrv_parse_cache_flags(const char *mode, int *flags)
520 {
521 *flags &= ~BDRV_O_CACHE_MASK;
522
523 if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
524 *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
525 } else if (!strcmp(mode, "directsync")) {
526 *flags |= BDRV_O_NOCACHE;
527 } else if (!strcmp(mode, "writeback")) {
528 *flags |= BDRV_O_CACHE_WB;
529 } else if (!strcmp(mode, "unsafe")) {
530 *flags |= BDRV_O_CACHE_WB;
531 *flags |= BDRV_O_NO_FLUSH;
532 } else if (!strcmp(mode, "writethrough")) {
533 /* this is the default */
534 } else {
535 return -1;
536 }
537
538 return 0;
539 }
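/*
 * For example, assuming flags starts at zero:
 *
 *     int flags = 0;
 *     bdrv_parse_cache_flags("none", &flags);
 *     // flags == BDRV_O_NOCACHE | BDRV_O_CACHE_WB
 *     bdrv_parse_cache_flags("writethrough", &flags);
 *     // flags == 0 again: the cache mask is cleared and the default mode
 *     // sets no extra bits
 */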
540
541 /**
542 * The copy-on-read flag is actually a reference count so multiple users may
543 * use the feature without worrying about clobbering its previous state.
544  * Copy-on-read stays enabled until all users have disabled it.
545 */
546 void bdrv_enable_copy_on_read(BlockDriverState *bs)
547 {
548 bs->copy_on_read++;
549 }
550
551 void bdrv_disable_copy_on_read(BlockDriverState *bs)
552 {
553 assert(bs->copy_on_read > 0);
554 bs->copy_on_read--;
555 }
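/*
 * For example, nested users must pair their calls, and copy-on-read only
 * switches off when the last user is done:
 *
 *     bdrv_enable_copy_on_read(bs);     // count 1, enabled
 *     bdrv_enable_copy_on_read(bs);     // count 2, still enabled
 *     bdrv_disable_copy_on_read(bs);    // count 1, still enabled
 *     bdrv_disable_copy_on_read(bs);    // count 0, disabled
 */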
556
557 /*
558 * Common part for opening disk images and files
559 */
560 static int bdrv_open_common(BlockDriverState *bs, const char *filename,
561 int flags, BlockDriver *drv)
562 {
563 int ret, open_flags;
564
565 assert(drv != NULL);
566
567 trace_bdrv_open_common(bs, filename, flags, drv->format_name);
568
569 bs->file = NULL;
570 bs->total_sectors = 0;
571 bs->encrypted = 0;
572 bs->valid_key = 0;
573 bs->sg = 0;
574 bs->open_flags = flags;
575 bs->growable = 0;
576 bs->buffer_alignment = 512;
577
578 assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
579 if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
580 bdrv_enable_copy_on_read(bs);
581 }
582
583 pstrcpy(bs->filename, sizeof(bs->filename), filename);
584 bs->backing_file[0] = '\0';
585
586 if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
587 return -ENOTSUP;
588 }
589
590 bs->drv = drv;
591 bs->opaque = g_malloc0(drv->instance_size);
592
593 bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
594
595 /*
596 * Clear flags that are internal to the block layer before opening the
597 * image.
598 */
599 open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
600
601 /*
602 * Snapshots should be writable.
603 */
604 if (bs->is_temporary) {
605 open_flags |= BDRV_O_RDWR;
606 }
607
608 bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);
609
610 /* Open the image, either directly or using a protocol */
611 if (drv->bdrv_file_open) {
612 ret = drv->bdrv_file_open(bs, filename, open_flags);
613 } else {
614 ret = bdrv_file_open(&bs->file, filename, open_flags);
615 if (ret >= 0) {
616 ret = drv->bdrv_open(bs, open_flags);
617 }
618 }
619
620 if (ret < 0) {
621 goto free_and_fail;
622 }
623
624 ret = refresh_total_sectors(bs, bs->total_sectors);
625 if (ret < 0) {
626 goto free_and_fail;
627 }
628
629 #ifndef _WIN32
630 if (bs->is_temporary) {
631 unlink(filename);
632 }
633 #endif
634 return 0;
635
636 free_and_fail:
637 if (bs->file) {
638 bdrv_delete(bs->file);
639 bs->file = NULL;
640 }
641 g_free(bs->opaque);
642 bs->opaque = NULL;
643 bs->drv = NULL;
644 return ret;
645 }
646
647 /*
648 * Opens a file using a protocol (file, host_device, nbd, ...)
649 */
650 int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
651 {
652 BlockDriverState *bs;
653 BlockDriver *drv;
654 int ret;
655
656 drv = bdrv_find_protocol(filename);
657 if (!drv) {
658 return -ENOENT;
659 }
660
661 bs = bdrv_new("");
662 ret = bdrv_open_common(bs, filename, flags, drv);
663 if (ret < 0) {
664 bdrv_delete(bs);
665 return ret;
666 }
667 bs->growable = 1;
668 *pbs = bs;
669 return 0;
670 }
671
672 /*
673 * Opens a disk image (raw, qcow2, vmdk, ...)
674 */
675 int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
676 BlockDriver *drv)
677 {
678 int ret;
679 char tmp_filename[PATH_MAX];
680
681 if (flags & BDRV_O_SNAPSHOT) {
682 BlockDriverState *bs1;
683 int64_t total_size;
684 int is_protocol = 0;
685 BlockDriver *bdrv_qcow2;
686 QEMUOptionParameter *options;
687 char backing_filename[PATH_MAX];
688
689 /* if snapshot, we create a temporary backing file and open it
690 instead of opening 'filename' directly */
691
692 /* if there is a backing file, use it */
693 bs1 = bdrv_new("");
694 ret = bdrv_open(bs1, filename, 0, drv);
695 if (ret < 0) {
696 bdrv_delete(bs1);
697 return ret;
698 }
699 total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
700
701 if (bs1->drv && bs1->drv->protocol_name)
702 is_protocol = 1;
703
704 bdrv_delete(bs1);
705
706 get_tmp_filename(tmp_filename, sizeof(tmp_filename));
707
708 /* Real path is meaningless for protocols */
709 if (is_protocol)
710 snprintf(backing_filename, sizeof(backing_filename),
711 "%s", filename);
712 else if (!realpath(filename, backing_filename))
713 return -errno;
714
715 bdrv_qcow2 = bdrv_find_format("qcow2");
716 options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);
717
718 set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
719 set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
720 if (drv) {
721 set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
722 drv->format_name);
723 }
724
725 ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
726 free_option_parameters(options);
727 if (ret < 0) {
728 return ret;
729 }
730
731 filename = tmp_filename;
732 drv = bdrv_qcow2;
733 bs->is_temporary = 1;
734 }
735
736 /* Find the right image format driver */
737 if (!drv) {
738 ret = find_image_format(filename, &drv);
739 }
740
741 if (!drv) {
742 goto unlink_and_fail;
743 }
744
745 /* Open the image */
746 ret = bdrv_open_common(bs, filename, flags, drv);
747 if (ret < 0) {
748 goto unlink_and_fail;
749 }
750
751 /* If there is a backing file, use it */
752 if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
753 char backing_filename[PATH_MAX];
754 int back_flags;
755 BlockDriver *back_drv = NULL;
756
757 bs->backing_hd = bdrv_new("");
758
759 if (path_has_protocol(bs->backing_file)) {
760 pstrcpy(backing_filename, sizeof(backing_filename),
761 bs->backing_file);
762 } else {
763 path_combine(backing_filename, sizeof(backing_filename),
764 filename, bs->backing_file);
765 }
766
767 if (bs->backing_format[0] != '\0') {
768 back_drv = bdrv_find_format(bs->backing_format);
769 }
770
771 /* backing files always opened read-only */
772 back_flags =
773 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
774
775 ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
776 if (ret < 0) {
777 bdrv_close(bs);
778 return ret;
779 }
780 if (bs->is_temporary) {
781 bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
782 } else {
783 /* base image inherits from "parent" */
784 bs->backing_hd->keep_read_only = bs->keep_read_only;
785 }
786 }
787
788 if (!bdrv_key_required(bs)) {
789 bdrv_dev_change_media_cb(bs, true);
790 }
791
792 /* throttling disk I/O limits */
793 if (bs->io_limits_enabled) {
794 bdrv_io_limits_enable(bs);
795 }
796
797 return 0;
798
799 unlink_and_fail:
800 if (bs->is_temporary) {
801 unlink(filename);
802 }
803 return ret;
804 }
805
806 void bdrv_close(BlockDriverState *bs)
807 {
808 if (bs->drv) {
809 if (bs == bs_snapshots) {
810 bs_snapshots = NULL;
811 }
812 if (bs->backing_hd) {
813 bdrv_delete(bs->backing_hd);
814 bs->backing_hd = NULL;
815 }
816 bs->drv->bdrv_close(bs);
817 g_free(bs->opaque);
818 #ifdef _WIN32
819 if (bs->is_temporary) {
820 unlink(bs->filename);
821 }
822 #endif
823 bs->opaque = NULL;
824 bs->drv = NULL;
825 bs->copy_on_read = 0;
826
827 if (bs->file != NULL) {
828 bdrv_close(bs->file);
829 }
830
831 bdrv_dev_change_media_cb(bs, false);
832 }
833
834     /* throttling disk I/O limits */
835 if (bs->io_limits_enabled) {
836 bdrv_io_limits_disable(bs);
837 }
838 }
839
840 void bdrv_close_all(void)
841 {
842 BlockDriverState *bs;
843
844 QTAILQ_FOREACH(bs, &bdrv_states, list) {
845 bdrv_close(bs);
846 }
847 }
848
849 /*
850 * Wait for pending requests to complete across all BlockDriverStates
851 *
852 * This function does not flush data to disk, use bdrv_flush_all() for that
853 * after calling this function.
854 */
855 void bdrv_drain_all(void)
856 {
857 BlockDriverState *bs;
858
859 qemu_aio_flush();
860
861 /* If requests are still pending there is a bug somewhere */
862 QTAILQ_FOREACH(bs, &bdrv_states, list) {
863 assert(QLIST_EMPTY(&bs->tracked_requests));
864 assert(qemu_co_queue_empty(&bs->throttled_reqs));
865 }
866 }
867
868 /* make a BlockDriverState anonymous by removing it from the bdrv_states
869    list. Also, clear device_name so the state cannot be removed twice */
870 void bdrv_make_anon(BlockDriverState *bs)
871 {
872 if (bs->device_name[0] != '\0') {
873 QTAILQ_REMOVE(&bdrv_states, bs, list);
874 }
875 bs->device_name[0] = '\0';
876 }
877
878 void bdrv_delete(BlockDriverState *bs)
879 {
880 assert(!bs->dev);
881
882 /* remove from list, if necessary */
883 bdrv_make_anon(bs);
884
885 bdrv_close(bs);
886 if (bs->file != NULL) {
887 bdrv_delete(bs->file);
888 }
889
890 assert(bs != bs_snapshots);
891 g_free(bs);
892 }
893
894 int bdrv_attach_dev(BlockDriverState *bs, void *dev)
895 /* TODO change to DeviceState *dev when all users are qdevified */
896 {
897 if (bs->dev) {
898 return -EBUSY;
899 }
900 bs->dev = dev;
901 bdrv_iostatus_reset(bs);
902 return 0;
903 }
904
905 /* TODO qdevified devices don't use this, remove when devices are qdevified */
906 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
907 {
908 if (bdrv_attach_dev(bs, dev) < 0) {
909 abort();
910 }
911 }
912
913 void bdrv_detach_dev(BlockDriverState *bs, void *dev)
914 /* TODO change to DeviceState *dev when all users are qdevified */
915 {
916 assert(bs->dev == dev);
917 bs->dev = NULL;
918 bs->dev_ops = NULL;
919 bs->dev_opaque = NULL;
920 bs->buffer_alignment = 512;
921 }
922
923 /* TODO change to return DeviceState * when all users are qdevified */
924 void *bdrv_get_attached_dev(BlockDriverState *bs)
925 {
926 return bs->dev;
927 }
928
929 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
930 void *opaque)
931 {
932 bs->dev_ops = ops;
933 bs->dev_opaque = opaque;
934 if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
935 bs_snapshots = NULL;
936 }
937 }
938
939 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
940 {
941 if (bs->dev_ops && bs->dev_ops->change_media_cb) {
942 bs->dev_ops->change_media_cb(bs->dev_opaque, load);
943 }
944 }
945
946 bool bdrv_dev_has_removable_media(BlockDriverState *bs)
947 {
948 return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
949 }
950
951 void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
952 {
953 if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
954 bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
955 }
956 }
957
958 bool bdrv_dev_is_tray_open(BlockDriverState *bs)
959 {
960 if (bs->dev_ops && bs->dev_ops->is_tray_open) {
961 return bs->dev_ops->is_tray_open(bs->dev_opaque);
962 }
963 return false;
964 }
965
966 static void bdrv_dev_resize_cb(BlockDriverState *bs)
967 {
968 if (bs->dev_ops && bs->dev_ops->resize_cb) {
969 bs->dev_ops->resize_cb(bs->dev_opaque);
970 }
971 }
972
973 bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
974 {
975 if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
976 return bs->dev_ops->is_medium_locked(bs->dev_opaque);
977 }
978 return false;
979 }
980
981 /*
982 * Run consistency checks on an image
983 *
984 * Returns 0 if the check could be completed (it doesn't mean that the image is
985 * free of errors) or -errno when an internal error occurred. The results of the
986 * check are stored in res.
987 */
988 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res)
989 {
990 if (bs->drv->bdrv_check == NULL) {
991 return -ENOTSUP;
992 }
993
994 memset(res, 0, sizeof(*res));
995 return bs->drv->bdrv_check(bs, res);
996 }
997
998 #define COMMIT_BUF_SECTORS 2048
999
1000 /* commit the COW image into its backing file */
1001 int bdrv_commit(BlockDriverState *bs)
1002 {
1003 BlockDriver *drv = bs->drv;
1004 BlockDriver *backing_drv;
1005 int64_t sector, total_sectors;
1006 int n, ro, open_flags;
1007 int ret = 0, rw_ret = 0;
1008 uint8_t *buf;
1009 char filename[1024];
1010 BlockDriverState *bs_rw, *bs_ro;
1011
1012 if (!drv)
1013 return -ENOMEDIUM;
1014
1015 if (!bs->backing_hd) {
1016 return -ENOTSUP;
1017 }
1018
1019 if (bs->backing_hd->keep_read_only) {
1020 return -EACCES;
1021 }
1022
1023 if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
1024 return -EBUSY;
1025 }
1026
1027 backing_drv = bs->backing_hd->drv;
1028 ro = bs->backing_hd->read_only;
1029     pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
1030 open_flags = bs->backing_hd->open_flags;
1031
1032 if (ro) {
1033 /* re-open as RW */
1034 bdrv_delete(bs->backing_hd);
1035 bs->backing_hd = NULL;
1036 bs_rw = bdrv_new("");
1037 rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
1038 backing_drv);
1039 if (rw_ret < 0) {
1040 bdrv_delete(bs_rw);
1041 /* try to re-open read-only */
1042 bs_ro = bdrv_new("");
1043 ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1044 backing_drv);
1045 if (ret < 0) {
1046 bdrv_delete(bs_ro);
1047 /* drive not functional anymore */
1048 bs->drv = NULL;
1049 return ret;
1050 }
1051 bs->backing_hd = bs_ro;
1052 return rw_ret;
1053 }
1054 bs->backing_hd = bs_rw;
1055 }
1056
1057 total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
1058 buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
1059
1060 for (sector = 0; sector < total_sectors; sector += n) {
1061 if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
1062
1063 if (bdrv_read(bs, sector, buf, n) != 0) {
1064 ret = -EIO;
1065 goto ro_cleanup;
1066 }
1067
1068 if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
1069 ret = -EIO;
1070 goto ro_cleanup;
1071 }
1072 }
1073 }
1074
1075 if (drv->bdrv_make_empty) {
1076 ret = drv->bdrv_make_empty(bs);
1077 bdrv_flush(bs);
1078 }
1079
1080 /*
1081 * Make sure all data we wrote to the backing device is actually
1082 * stable on disk.
1083 */
1084 if (bs->backing_hd)
1085 bdrv_flush(bs->backing_hd);
1086
1087 ro_cleanup:
1088 g_free(buf);
1089
1090 if (ro) {
1091 /* re-open as RO */
1092 bdrv_delete(bs->backing_hd);
1093 bs->backing_hd = NULL;
1094 bs_ro = bdrv_new("");
1095 ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1096 backing_drv);
1097 if (ret < 0) {
1098 bdrv_delete(bs_ro);
1099 /* drive not functional anymore */
1100 bs->drv = NULL;
1101 return ret;
1102 }
1103 bs->backing_hd = bs_ro;
1104 bs->backing_hd->keep_read_only = 0;
1105 }
1106
1107 return ret;
1108 }
1109
1110 void bdrv_commit_all(void)
1111 {
1112 BlockDriverState *bs;
1113
1114 QTAILQ_FOREACH(bs, &bdrv_states, list) {
1115 bdrv_commit(bs);
1116 }
1117 }
1118
1119 struct BdrvTrackedRequest {
1120 BlockDriverState *bs;
1121 int64_t sector_num;
1122 int nb_sectors;
1123 bool is_write;
1124 QLIST_ENTRY(BdrvTrackedRequest) list;
1125 Coroutine *co; /* owner, used for deadlock detection */
1126 CoQueue wait_queue; /* coroutines blocked on this request */
1127 };
1128
1129 /**
1130 * Remove an active request from the tracked requests list
1131 *
1132 * This function should be called when a tracked request is completing.
1133 */
1134 static void tracked_request_end(BdrvTrackedRequest *req)
1135 {
1136 QLIST_REMOVE(req, list);
1137 qemu_co_queue_restart_all(&req->wait_queue);
1138 }
1139
1140 /**
1141 * Add an active request to the tracked requests list
1142 */
1143 static void tracked_request_begin(BdrvTrackedRequest *req,
1144 BlockDriverState *bs,
1145 int64_t sector_num,
1146 int nb_sectors, bool is_write)
1147 {
1148 *req = (BdrvTrackedRequest){
1149 .bs = bs,
1150 .sector_num = sector_num,
1151 .nb_sectors = nb_sectors,
1152 .is_write = is_write,
1153 .co = qemu_coroutine_self(),
1154 };
1155
1156 qemu_co_queue_init(&req->wait_queue);
1157
1158 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
1159 }
1160
1161 /**
1162 * Round a region to cluster boundaries
1163 */
1164 static void round_to_clusters(BlockDriverState *bs,
1165 int64_t sector_num, int nb_sectors,
1166 int64_t *cluster_sector_num,
1167 int *cluster_nb_sectors)
1168 {
1169 BlockDriverInfo bdi;
1170
1171 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
1172 *cluster_sector_num = sector_num;
1173 *cluster_nb_sectors = nb_sectors;
1174 } else {
1175 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
1176 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
1177 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
1178 nb_sectors, c);
1179 }
1180 }
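/*
 * Worked example: with a 64 KB cluster size, c = 65536 / 512 = 128 sectors,
 * so a request for sectors [130, 134) is widened to the full cluster span
 * [128, 256):
 *
 *     QEMU_ALIGN_DOWN(130, 128)          == 128   // cluster_sector_num
 *     QEMU_ALIGN_UP(130 - 128 + 4, 128)  == 128   // cluster_nb_sectors
 */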
1181
1182 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
1183 int64_t sector_num, int nb_sectors) {
1184 /* aaaa bbbb */
1185 if (sector_num >= req->sector_num + req->nb_sectors) {
1186 return false;
1187 }
1188 /* bbbb aaaa */
1189 if (req->sector_num >= sector_num + nb_sectors) {
1190 return false;
1191 }
1192 return true;
1193 }
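/*
 * For example, a tracked request covering sectors [8, 16) overlaps a
 * candidate [12, 20) but not [16, 24): the ranges are half-open, so requests
 * that merely touch end-to-start do not count as overlapping.
 */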
1194
1195 static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
1196 int64_t sector_num, int nb_sectors)
1197 {
1198 BdrvTrackedRequest *req;
1199 int64_t cluster_sector_num;
1200 int cluster_nb_sectors;
1201 bool retry;
1202
1203 /* If we touch the same cluster it counts as an overlap. This guarantees
1204 * that allocating writes will be serialized and not race with each other
1205 * for the same cluster. For example, in copy-on-read it ensures that the
1206 * CoR read and write operations are atomic and guest writes cannot
1207 * interleave between them.
1208 */
1209 round_to_clusters(bs, sector_num, nb_sectors,
1210 &cluster_sector_num, &cluster_nb_sectors);
1211
1212 do {
1213 retry = false;
1214 QLIST_FOREACH(req, &bs->tracked_requests, list) {
1215 if (tracked_request_overlaps(req, cluster_sector_num,
1216 cluster_nb_sectors)) {
1217 /* Hitting this means there was a reentrant request, for
1218 * example, a block driver issuing nested requests. This must
1219 * never happen since it means deadlock.
1220 */
1221 assert(qemu_coroutine_self() != req->co);
1222
1223 qemu_co_queue_wait(&req->wait_queue);
1224 retry = true;
1225 break;
1226 }
1227 }
1228 } while (retry);
1229 }
1230
1231 /*
1232 * Return values:
1233 * 0 - success
1234 * -EINVAL - backing format specified, but no file
1235 * -ENOSPC - can't update the backing file because no space is left in the
1236 * image file header
1237 * -ENOTSUP - format driver doesn't support changing the backing file
1238 */
1239 int bdrv_change_backing_file(BlockDriverState *bs,
1240 const char *backing_file, const char *backing_fmt)
1241 {
1242 BlockDriver *drv = bs->drv;
1243
1244 if (drv->bdrv_change_backing_file != NULL) {
1245 return drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
1246 } else {
1247 return -ENOTSUP;
1248 }
1249 }
1250
1251 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
1252 size_t size)
1253 {
1254 int64_t len;
1255
1256 if (!bdrv_is_inserted(bs))
1257 return -ENOMEDIUM;
1258
1259 if (bs->growable)
1260 return 0;
1261
1262 len = bdrv_getlength(bs);
1263
1264 if (offset < 0)
1265 return -EIO;
1266
1267 if ((offset > len) || (len - offset < size))
1268 return -EIO;
1269
1270 return 0;
1271 }
1272
1273 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
1274 int nb_sectors)
1275 {
1276 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
1277 nb_sectors * BDRV_SECTOR_SIZE);
1278 }
1279
1280 typedef struct RwCo {
1281 BlockDriverState *bs;
1282 int64_t sector_num;
1283 int nb_sectors;
1284 QEMUIOVector *qiov;
1285 bool is_write;
1286 int ret;
1287 } RwCo;
1288
1289 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
1290 {
1291 RwCo *rwco = opaque;
1292
1293 if (!rwco->is_write) {
1294 rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
1295 rwco->nb_sectors, rwco->qiov);
1296 } else {
1297 rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
1298 rwco->nb_sectors, rwco->qiov);
1299 }
1300 }
1301
1302 /*
1303 * Process a synchronous request using coroutines
1304 */
1305 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
1306 int nb_sectors, bool is_write)
1307 {
1308 QEMUIOVector qiov;
1309 struct iovec iov = {
1310 .iov_base = (void *)buf,
1311 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
1312 };
1313 Coroutine *co;
1314 RwCo rwco = {
1315 .bs = bs,
1316 .sector_num = sector_num,
1317 .nb_sectors = nb_sectors,
1318 .qiov = &qiov,
1319 .is_write = is_write,
1320 .ret = NOT_DONE,
1321 };
1322
1323 qemu_iovec_init_external(&qiov, &iov, 1);
1324
1325 if (qemu_in_coroutine()) {
1326 /* Fast-path if already in coroutine context */
1327 bdrv_rw_co_entry(&rwco);
1328 } else {
1329 co = qemu_coroutine_create(bdrv_rw_co_entry);
1330 qemu_coroutine_enter(co, &rwco);
1331 while (rwco.ret == NOT_DONE) {
1332 qemu_aio_wait();
1333 }
1334 }
1335 return rwco.ret;
1336 }
1337
1338 /* return < 0 if error. See bdrv_write() for the return codes */
1339 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
1340 uint8_t *buf, int nb_sectors)
1341 {
1342 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
1343 }
1344
1345 static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
1346 int nb_sectors, int dirty)
1347 {
1348 int64_t start, end;
1349 unsigned long val, idx, bit;
1350
1351 start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
1352 end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
1353
1354 for (; start <= end; start++) {
1355 idx = start / (sizeof(unsigned long) * 8);
1356 bit = start % (sizeof(unsigned long) * 8);
1357 val = bs->dirty_bitmap[idx];
1358 if (dirty) {
1359 if (!(val & (1UL << bit))) {
1360 bs->dirty_count++;
1361 val |= 1UL << bit;
1362 }
1363 } else {
1364 if (val & (1UL << bit)) {
1365 bs->dirty_count--;
1366 val &= ~(1UL << bit);
1367 }
1368 }
1369 bs->dirty_bitmap[idx] = val;
1370 }
1371 }
1372
1373 /* Return < 0 if error. Important errors are:
1374 -EIO generic I/O error (may happen for all errors)
1375 -ENOMEDIUM No media inserted.
1376 -EINVAL Invalid sector number or nb_sectors
1377 -EACCES Trying to write a read-only device
1378 */
1379 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
1380 const uint8_t *buf, int nb_sectors)
1381 {
1382 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
1383 }
1384
1385 int bdrv_pread(BlockDriverState *bs, int64_t offset,
1386 void *buf, int count1)
1387 {
1388 uint8_t tmp_buf[BDRV_SECTOR_SIZE];
1389 int len, nb_sectors, count;
1390 int64_t sector_num;
1391 int ret;
1392
1393 count = count1;
1394 /* first read to align to sector start */
1395 len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
1396 if (len > count)
1397 len = count;
1398 sector_num = offset >> BDRV_SECTOR_BITS;
1399 if (len > 0) {
1400 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1401 return ret;
1402 memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
1403 count -= len;
1404 if (count == 0)
1405 return count1;
1406 sector_num++;
1407 buf += len;
1408 }
1409
1410 /* read the sectors "in place" */
1411 nb_sectors = count >> BDRV_SECTOR_BITS;
1412 if (nb_sectors > 0) {
1413 if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
1414 return ret;
1415 sector_num += nb_sectors;
1416 len = nb_sectors << BDRV_SECTOR_BITS;
1417 buf += len;
1418 count -= len;
1419 }
1420
1421 /* add data from the last sector */
1422 if (count > 0) {
1423 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1424 return ret;
1425 memcpy(buf, tmp_buf, count);
1426 }
1427 return count1;
1428 }
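/*
 * Worked example: bdrv_pread(bs, 700, buf, 1000) splits into three phases
 * with 512-byte sectors:
 *
 *     head: read sector 1, copy bytes 188..511 of it  (324 bytes)
 *     body: read sector 2 straight into buf           (512 bytes)
 *     tail: read sector 3, copy its first 164 bytes   (164 bytes)
 *
 * 324 + 512 + 164 == 1000. The same split drives bdrv_pwrite() below, which
 * additionally read-modify-writes the partial head and tail sectors.
 */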
1429
1430 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
1431 const void *buf, int count1)
1432 {
1433 uint8_t tmp_buf[BDRV_SECTOR_SIZE];
1434 int len, nb_sectors, count;
1435 int64_t sector_num;
1436 int ret;
1437
1438 count = count1;
1439 /* first write to align to sector start */
1440 len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
1441 if (len > count)
1442 len = count;
1443 sector_num = offset >> BDRV_SECTOR_BITS;
1444 if (len > 0) {
1445 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1446 return ret;
1447 memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
1448 if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1449 return ret;
1450 count -= len;
1451 if (count == 0)
1452 return count1;
1453 sector_num++;
1454 buf += len;
1455 }
1456
1457 /* write the sectors "in place" */
1458 nb_sectors = count >> BDRV_SECTOR_BITS;
1459 if (nb_sectors > 0) {
1460 if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
1461 return ret;
1462 sector_num += nb_sectors;
1463 len = nb_sectors << BDRV_SECTOR_BITS;
1464 buf += len;
1465 count -= len;
1466 }
1467
1468 /* add data from the last sector */
1469 if (count > 0) {
1470 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1471 return ret;
1472 memcpy(tmp_buf, buf, count);
1473 if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1474 return ret;
1475 }
1476 return count1;
1477 }
1478
1479 /*
1480 * Writes to the file and ensures that no writes are reordered across this
1481 * request (acts as a barrier)
1482 *
1483 * Returns 0 on success, -errno in error cases.
1484 */
1485 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
1486 const void *buf, int count)
1487 {
1488 int ret;
1489
1490 ret = bdrv_pwrite(bs, offset, buf, count);
1491 if (ret < 0) {
1492 return ret;
1493 }
1494
1495 /* No flush needed for cache modes that use O_DSYNC */
1496 if ((bs->open_flags & BDRV_O_CACHE_WB) != 0) {
1497 bdrv_flush(bs);
1498 }
1499
1500 return 0;
1501 }
1502
1503 static int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
1504 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1505 {
1506 /* Perform I/O through a temporary buffer so that users who scribble over
1507 * their read buffer while the operation is in progress do not end up
1508 * modifying the image file. This is critical for zero-copy guest I/O
1509 * where anything might happen inside guest memory.
1510 */
1511 void *bounce_buffer;
1512
1513 struct iovec iov;
1514 QEMUIOVector bounce_qiov;
1515 int64_t cluster_sector_num;
1516 int cluster_nb_sectors;
1517 size_t skip_bytes;
1518 int ret;
1519
1520 /* Cover entire cluster so no additional backing file I/O is required when
1521      * allocating a cluster in the image file.
1522 */
1523 round_to_clusters(bs, sector_num, nb_sectors,
1524 &cluster_sector_num, &cluster_nb_sectors);
1525
1526 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors,
1527 cluster_sector_num, cluster_nb_sectors);
1528
1529 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
1530 iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
1531 qemu_iovec_init_external(&bounce_qiov, &iov, 1);
1532
1533 ret = bs->drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
1534 &bounce_qiov);
1535 if (ret < 0) {
1536 goto err;
1537 }
1538
1539 ret = bs->drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
1540 &bounce_qiov);
1541 if (ret < 0) {
1542 /* It might be okay to ignore write errors for guest requests. If this
1543 * is a deliberate copy-on-read then we don't want to ignore the error.
1544 * Simply report it in all cases.
1545 */
1546 goto err;
1547 }
1548
1549 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
1550 qemu_iovec_from_buffer(qiov, bounce_buffer + skip_bytes,
1551 nb_sectors * BDRV_SECTOR_SIZE);
1552
1553 err:
1554 qemu_vfree(bounce_buffer);
1555 return ret;
1556 }
1557
1558 /*
1559 * Handle a read request in coroutine context
1560 */
1561 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
1562 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1563 {
1564 BlockDriver *drv = bs->drv;
1565 BdrvTrackedRequest req;
1566 int ret;
1567
1568 if (!drv) {
1569 return -ENOMEDIUM;
1570 }
1571 if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1572 return -EIO;
1573 }
1574
1575 /* throttling disk read I/O */
1576 if (bs->io_limits_enabled) {
1577 bdrv_io_limits_intercept(bs, false, nb_sectors);
1578 }
1579
1580 if (bs->copy_on_read) {
1581 wait_for_overlapping_requests(bs, sector_num, nb_sectors);
1582 }
1583
1584 tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
1585
1586 if (bs->copy_on_read) {
1587 int pnum;
1588
1589 ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
1590 if (ret < 0) {
1591 goto out;
1592 }
1593
1594 if (!ret || pnum != nb_sectors) {
1595 ret = bdrv_co_copy_on_readv(bs, sector_num, nb_sectors, qiov);
1596 goto out;
1597 }
1598 }
1599
1600 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
1601
1602 out:
1603 tracked_request_end(&req);
1604 return ret;
1605 }
1606
1607 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
1608 int nb_sectors, QEMUIOVector *qiov)
1609 {
1610 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
1611
1612 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov);
1613 }
1614
1615 /*
1616 * Handle a write request in coroutine context
1617 */
1618 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
1619 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1620 {
1621 BlockDriver *drv = bs->drv;
1622 BdrvTrackedRequest req;
1623 int ret;
1624
1625 if (!bs->drv) {
1626 return -ENOMEDIUM;
1627 }
1628 if (bs->read_only) {
1629 return -EACCES;
1630 }
1631 if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1632 return -EIO;
1633 }
1634
1635 /* throttling disk write I/O */
1636 if (bs->io_limits_enabled) {
1637 bdrv_io_limits_intercept(bs, true, nb_sectors);
1638 }
1639
1640 if (bs->copy_on_read) {
1641 wait_for_overlapping_requests(bs, sector_num, nb_sectors);
1642 }
1643
1644 tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
1645
1646 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
1647
1648 if (bs->dirty_bitmap) {
1649 set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
1650 }
1651
1652 if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
1653 bs->wr_highest_sector = sector_num + nb_sectors - 1;
1654 }
1655
1656 tracked_request_end(&req);
1657
1658 return ret;
1659 }
1660
1661 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
1662 int nb_sectors, QEMUIOVector *qiov)
1663 {
1664 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
1665
1666 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov);
1667 }
1668
1669 /**
1670 * Truncate file to 'offset' bytes (needed only for file protocols)
1671 */
1672 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
1673 {
1674 BlockDriver *drv = bs->drv;
1675 int ret;
1676 if (!drv)
1677 return -ENOMEDIUM;
1678 if (!drv->bdrv_truncate)
1679 return -ENOTSUP;
1680 if (bs->read_only)
1681 return -EACCES;
1682 if (bdrv_in_use(bs))
1683 return -EBUSY;
1684 ret = drv->bdrv_truncate(bs, offset);
1685 if (ret == 0) {
1686 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
1687 bdrv_dev_resize_cb(bs);
1688 }
1689 return ret;
1690 }
1691
1692 /**
1693  * Length of an allocated file in bytes. Sparse files are counted by actual
1694 * allocated space. Return < 0 if error or unknown.
1695 */
1696 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
1697 {
1698 BlockDriver *drv = bs->drv;
1699 if (!drv) {
1700 return -ENOMEDIUM;
1701 }
1702 if (drv->bdrv_get_allocated_file_size) {
1703 return drv->bdrv_get_allocated_file_size(bs);
1704 }
1705 if (bs->file) {
1706 return bdrv_get_allocated_file_size(bs->file);
1707 }
1708 return -ENOTSUP;
1709 }
1710
1711 /**
1712 * Length of a file in bytes. Return < 0 if error or unknown.
1713 */
1714 int64_t bdrv_getlength(BlockDriverState *bs)
1715 {
1716 BlockDriver *drv = bs->drv;
1717 if (!drv)
1718 return -ENOMEDIUM;
1719
1720 if (bs->growable || bdrv_dev_has_removable_media(bs)) {
1721 if (drv->bdrv_getlength) {
1722 return drv->bdrv_getlength(bs);
1723 }
1724 }
1725 return bs->total_sectors * BDRV_SECTOR_SIZE;
1726 }
1727
1728 /* return 0 as number of sectors if no device present or error */
1729 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
1730 {
1731 int64_t length;
1732 length = bdrv_getlength(bs);
1733 if (length < 0)
1734 length = 0;
1735 else
1736 length = length >> BDRV_SECTOR_BITS;
1737 *nb_sectors_ptr = length;
1738 }
1739
1740 struct partition {
1741 uint8_t boot_ind; /* 0x80 - active */
1742 uint8_t head; /* starting head */
1743 uint8_t sector; /* starting sector */
1744 uint8_t cyl; /* starting cylinder */
1745 uint8_t sys_ind; /* What partition type */
1746 uint8_t end_head; /* end head */
1747 uint8_t end_sector; /* end sector */
1748 uint8_t end_cyl; /* end cylinder */
1749 uint32_t start_sect; /* starting sector counting from 0 */
1750 uint32_t nr_sects; /* nr of sectors in partition */
1751 } QEMU_PACKED;
1752
1753 /* Try to guess the disk logical geometry from the MS-DOS partition table.
     Return 0 if OK, -1 if it could not be guessed. */
1754 static int guess_disk_lchs(BlockDriverState *bs,
1755 int *pcylinders, int *pheads, int *psectors)
1756 {
1757 uint8_t buf[BDRV_SECTOR_SIZE];
1758 int ret, i, heads, sectors, cylinders;
1759 struct partition *p;
1760 uint32_t nr_sects;
1761 uint64_t nb_sectors;
1762
1763 bdrv_get_geometry(bs, &nb_sectors);
1764
1765 ret = bdrv_read(bs, 0, buf, 1);
1766 if (ret < 0)
1767 return -1;
1768 /* test msdos magic */
1769 if (buf[510] != 0x55 || buf[511] != 0xaa)
1770 return -1;
1771 for(i = 0; i < 4; i++) {
1772 p = ((struct partition *)(buf + 0x1be)) + i;
1773 nr_sects = le32_to_cpu(p->nr_sects);
1774 if (nr_sects && p->end_head) {
1775 /* We make the assumption that the partition terminates on
1776 a cylinder boundary */
1777 heads = p->end_head + 1;
1778 sectors = p->end_sector & 63;
1779 if (sectors == 0)
1780 continue;
1781 cylinders = nb_sectors / (heads * sectors);
1782 if (cylinders < 1 || cylinders > 16383)
1783 continue;
1784 *pheads = heads;
1785 *psectors = sectors;
1786 *pcylinders = cylinders;
1787 #if 0
1788 printf("guessed geometry: LCHS=%d %d %d\n",
1789 cylinders, heads, sectors);
1790 #endif
1791 return 0;
1792 }
1793 }
1794 return -1;
1795 }
1796
1797 void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
1798 {
1799 int translation, lba_detected = 0;
1800 int cylinders, heads, secs;
1801 uint64_t nb_sectors;
1802
1803 /* if a geometry hint is available, use it */
1804 bdrv_get_geometry(bs, &nb_sectors);
1805 bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
1806 translation = bdrv_get_translation_hint(bs);
1807 if (cylinders != 0) {
1808 *pcyls = cylinders;
1809 *pheads = heads;
1810 *psecs = secs;
1811 } else {
1812 if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
1813 if (heads > 16) {
1814 /* if heads > 16, it means that a BIOS LBA
1815 translation was active, so the default
1816 hardware geometry is OK */
1817 lba_detected = 1;
1818 goto default_geometry;
1819 } else {
1820 *pcyls = cylinders;
1821 *pheads = heads;
1822 *psecs = secs;
1823 /* disable any translation to be in sync with
1824 the logical geometry */
1825 if (translation == BIOS_ATA_TRANSLATION_AUTO) {
1826 bdrv_set_translation_hint(bs,
1827 BIOS_ATA_TRANSLATION_NONE);
1828 }
1829 }
1830 } else {
1831 default_geometry:
1832 /* if no geometry, use a standard physical disk geometry */
1833 cylinders = nb_sectors / (16 * 63);
1834
1835 if (cylinders > 16383)
1836 cylinders = 16383;
1837 else if (cylinders < 2)
1838 cylinders = 2;
1839 *pcyls = cylinders;
1840 *pheads = 16;
1841 *psecs = 63;
1842 if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
1843 if ((*pcyls * *pheads) <= 131072) {
1844 bdrv_set_translation_hint(bs,
1845 BIOS_ATA_TRANSLATION_LARGE);
1846 } else {
1847 bdrv_set_translation_hint(bs,
1848 BIOS_ATA_TRANSLATION_LBA);
1849 }
1850 }
1851 }
1852 bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
1853 }
1854 }
1855
1856 void bdrv_set_geometry_hint(BlockDriverState *bs,
1857 int cyls, int heads, int secs)
1858 {
1859 bs->cyls = cyls;
1860 bs->heads = heads;
1861 bs->secs = secs;
1862 }
1863
1864 void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
1865 {
1866 bs->translation = translation;
1867 }
1868
1869 void bdrv_get_geometry_hint(BlockDriverState *bs,
1870 int *pcyls, int *pheads, int *psecs)
1871 {
1872 *pcyls = bs->cyls;
1873 *pheads = bs->heads;
1874 *psecs = bs->secs;
1875 }
1876
1877 /* throttling disk io limits */
1878 void bdrv_set_io_limits(BlockDriverState *bs,
1879 BlockIOLimit *io_limits)
1880 {
1881 bs->io_limits = *io_limits;
1882 bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
1883 }
1884
1885 /* Recognize floppy formats */
1886 typedef struct FDFormat {
1887 FDriveType drive;
1888 uint8_t last_sect;
1889 uint8_t max_track;
1890 uint8_t max_head;
1891 } FDFormat;
1892
1893 static const FDFormat fd_formats[] = {
1894 /* First entry is default format */
1895 /* 1.44 MB 3"1/2 floppy disks */
1896 { FDRIVE_DRV_144, 18, 80, 1, },
1897 { FDRIVE_DRV_144, 20, 80, 1, },
1898 { FDRIVE_DRV_144, 21, 80, 1, },
1899 { FDRIVE_DRV_144, 21, 82, 1, },
1900 { FDRIVE_DRV_144, 21, 83, 1, },
1901 { FDRIVE_DRV_144, 22, 80, 1, },
1902 { FDRIVE_DRV_144, 23, 80, 1, },
1903 { FDRIVE_DRV_144, 24, 80, 1, },
1904 /* 2.88 MB 3"1/2 floppy disks */
1905 { FDRIVE_DRV_288, 36, 80, 1, },
1906 { FDRIVE_DRV_288, 39, 80, 1, },
1907 { FDRIVE_DRV_288, 40, 80, 1, },
1908 { FDRIVE_DRV_288, 44, 80, 1, },
1909 { FDRIVE_DRV_288, 48, 80, 1, },
1910 /* 720 kB 3"1/2 floppy disks */
1911 { FDRIVE_DRV_144, 9, 80, 1, },
1912 { FDRIVE_DRV_144, 10, 80, 1, },
1913 { FDRIVE_DRV_144, 10, 82, 1, },
1914 { FDRIVE_DRV_144, 10, 83, 1, },
1915 { FDRIVE_DRV_144, 13, 80, 1, },
1916 { FDRIVE_DRV_144, 14, 80, 1, },
1917 /* 1.2 MB 5"1/4 floppy disks */
1918 { FDRIVE_DRV_120, 15, 80, 1, },
1919 { FDRIVE_DRV_120, 18, 80, 1, },
1920 { FDRIVE_DRV_120, 18, 82, 1, },
1921 { FDRIVE_DRV_120, 18, 83, 1, },
1922 { FDRIVE_DRV_120, 20, 80, 1, },
1923 /* 720 kB 5"1/4 floppy disks */
1924 { FDRIVE_DRV_120, 9, 80, 1, },
1925 { FDRIVE_DRV_120, 11, 80, 1, },
1926 /* 360 kB 5"1/4 floppy disks */
1927 { FDRIVE_DRV_120, 9, 40, 1, },
1928 { FDRIVE_DRV_120, 9, 40, 0, },
1929 { FDRIVE_DRV_120, 10, 41, 1, },
1930 { FDRIVE_DRV_120, 10, 42, 1, },
1931 /* 320 kB 5"1/4 floppy disks */
1932 { FDRIVE_DRV_120, 8, 40, 1, },
1933 { FDRIVE_DRV_120, 8, 40, 0, },
1934 /* 360 kB must match 5"1/4 better than 3"1/2... */
1935 { FDRIVE_DRV_144, 9, 80, 0, },
1936 /* end */
1937 { FDRIVE_DRV_NONE, -1, -1, 0, },
1938 };
1939
1940 void bdrv_get_floppy_geometry_hint(BlockDriverState *bs, int *nb_heads,
1941 int *max_track, int *last_sect,
1942 FDriveType drive_in, FDriveType *drive)
1943 {
1944 const FDFormat *parse;
1945 uint64_t nb_sectors, size;
1946 int i, first_match, match;
1947
1948 bdrv_get_geometry_hint(bs, nb_heads, max_track, last_sect);
1949 if (*nb_heads != 0 && *max_track != 0 && *last_sect != 0) {
1950 /* User defined disk */
1951 } else {
1952 bdrv_get_geometry(bs, &nb_sectors);
1953 match = -1;
1954 first_match = -1;
1955 for (i = 0; ; i++) {
1956 parse = &fd_formats[i];
1957 if (parse->drive == FDRIVE_DRV_NONE) {
1958 break;
1959 }
1960 if (drive_in == parse->drive ||
1961 drive_in == FDRIVE_DRV_NONE) {
1962 size = (parse->max_head + 1) * parse->max_track *
1963 parse->last_sect;
1964 if (nb_sectors == size) {
1965 match = i;
1966 break;
1967 }
1968 if (first_match == -1) {
1969 first_match = i;
1970 }
1971 }
1972 }
1973 if (match == -1) {
1974 if (first_match == -1) {
1975 match = 1;
1976 } else {
1977 match = first_match;
1978 }
1979 parse = &fd_formats[match];
1980 }
1981 *nb_heads = parse->max_head + 1;
1982 *max_track = parse->max_track;
1983 *last_sect = parse->last_sect;
1984 *drive = parse->drive;
1985 }
1986 }
1987
1988 int bdrv_get_translation_hint(BlockDriverState *bs)
1989 {
1990 return bs->translation;
1991 }
1992
1993 void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
1994 BlockErrorAction on_write_error)
1995 {
1996 bs->on_read_error = on_read_error;
1997 bs->on_write_error = on_write_error;
1998 }
1999
2000 BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
2001 {
2002 return is_read ? bs->on_read_error : bs->on_write_error;
2003 }
2004
2005 int bdrv_is_read_only(BlockDriverState *bs)
2006 {
2007 return bs->read_only;
2008 }
2009
2010 int bdrv_is_sg(BlockDriverState *bs)
2011 {
2012 return bs->sg;
2013 }
2014
2015 int bdrv_enable_write_cache(BlockDriverState *bs)
2016 {
2017 return bs->enable_write_cache;
2018 }
2019
2020 int bdrv_is_encrypted(BlockDriverState *bs)
2021 {
2022 if (bs->backing_hd && bs->backing_hd->encrypted)
2023 return 1;
2024 return bs->encrypted;
2025 }
2026
2027 int bdrv_key_required(BlockDriverState *bs)
2028 {
2029 BlockDriverState *backing_hd = bs->backing_hd;
2030
2031 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
2032 return 1;
2033 return (bs->encrypted && !bs->valid_key);
2034 }
2035
2036 int bdrv_set_key(BlockDriverState *bs, const char *key)
2037 {
2038 int ret;
2039 if (bs->backing_hd && bs->backing_hd->encrypted) {
2040 ret = bdrv_set_key(bs->backing_hd, key);
2041 if (ret < 0)
2042 return ret;
2043 if (!bs->encrypted)
2044 return 0;
2045 }
2046 if (!bs->encrypted) {
2047 return -EINVAL;
2048 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
2049 return -ENOMEDIUM;
2050 }
2051 ret = bs->drv->bdrv_set_key(bs, key);
2052 if (ret < 0) {
2053 bs->valid_key = 0;
2054 } else if (!bs->valid_key) {
2055 bs->valid_key = 1;
2056 /* call the change callback now, we skipped it on open */
2057 bdrv_dev_change_media_cb(bs, true);
2058 }
2059 return ret;
2060 }
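/*
 * A sketch of the intended caller flow, with 'passphrase' standing in for
 * user-supplied input: bdrv_open() skips the media-change callback for
 * encrypted images, and the first successful bdrv_set_key() fires it
 * instead.
 *
 *     if (bdrv_key_required(bs) && bdrv_set_key(bs, passphrase) < 0) {
 *         // wrong or missing key: valid_key stays 0 and the image stays
 *         // unusable until a valid key is set
 *     }
 */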
2061
2062 void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
2063 {
2064 if (!bs->drv) {
2065 buf[0] = '\0';
2066 } else {
2067 pstrcpy(buf, buf_size, bs->drv->format_name);
2068 }
2069 }
2070
2071 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
2072 void *opaque)
2073 {
2074 BlockDriver *drv;
2075
2076 QLIST_FOREACH(drv, &bdrv_drivers, list) {
2077 it(opaque, drv->format_name);
2078 }
2079 }
2080
2081 BlockDriverState *bdrv_find(const char *name)
2082 {
2083 BlockDriverState *bs;
2084
2085 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2086 if (!strcmp(name, bs->device_name)) {
2087 return bs;
2088 }
2089 }
2090 return NULL;
2091 }
2092
2093 BlockDriverState *bdrv_next(BlockDriverState *bs)
2094 {
2095 if (!bs) {
2096 return QTAILQ_FIRST(&bdrv_states);
2097 }
2098 return QTAILQ_NEXT(bs, list);
2099 }
2100
2101 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
2102 {
2103 BlockDriverState *bs;
2104
2105 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2106 it(opaque, bs);
2107 }
2108 }
2109
2110 const char *bdrv_get_device_name(BlockDriverState *bs)
2111 {
2112 return bs->device_name;
2113 }
2114
2115 void bdrv_flush_all(void)
2116 {
2117 BlockDriverState *bs;
2118
2119 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2120 if (!bdrv_is_read_only(bs) && bdrv_is_inserted(bs)) {
2121 bdrv_flush(bs);
2122 }
2123 }
2124 }
2125
2126 int bdrv_has_zero_init(BlockDriverState *bs)
2127 {
2128 assert(bs->drv);
2129
2130 if (bs->drv->bdrv_has_zero_init) {
2131 return bs->drv->bdrv_has_zero_init(bs);
2132 }
2133
2134 return 1;
2135 }
2136
2137 typedef struct BdrvCoIsAllocatedData {
2138 BlockDriverState *bs;
2139 int64_t sector_num;
2140 int nb_sectors;
2141 int *pnum;
2142 int ret;
2143 bool done;
2144 } BdrvCoIsAllocatedData;
2145
2146 /*
2147 * Returns true iff the specified sector is present in the disk image. Drivers
2148 * not implementing the functionality are assumed to not support backing files,
2149 * hence all their sectors are reported as allocated.
2150 *
2151 * If 'sector_num' is beyond the end of the disk image the return value is 0
2152 * and 'pnum' is set to 0.
2153 *
2154 * 'pnum' is set to the number of sectors (including and immediately following
2155 * the specified sector) that are known to be in the same
2156 * allocated/unallocated state.
2157 *
2158 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
2159 * beyond the end of the disk image it will be clamped.
2160 */
2161 int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
2162 int nb_sectors, int *pnum)
2163 {
2164 int64_t n;
2165
2166 if (sector_num >= bs->total_sectors) {
2167 *pnum = 0;
2168 return 0;
2169 }
2170
2171 n = bs->total_sectors - sector_num;
2172 if (n < nb_sectors) {
2173 nb_sectors = n;
2174 }
2175
2176 if (!bs->drv->bdrv_co_is_allocated) {
2177 *pnum = nb_sectors;
2178 return 1;
2179 }
2180
2181 return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
2182 }
2183
2184 /* Coroutine wrapper for bdrv_is_allocated() */
2185 static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
2186 {
2187 BdrvCoIsAllocatedData *data = opaque;
2188 BlockDriverState *bs = data->bs;
2189
2190 data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
2191 data->pnum);
2192 data->done = true;
2193 }
2194
2195 /*
2196 * Synchronous wrapper around bdrv_co_is_allocated().
2197 *
2198 * See bdrv_co_is_allocated() for details.
2199 */
2200 int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
2201 int *pnum)
2202 {
2203 Coroutine *co;
2204 BdrvCoIsAllocatedData data = {
2205 .bs = bs,
2206 .sector_num = sector_num,
2207 .nb_sectors = nb_sectors,
2208 .pnum = pnum,
2209 .done = false,
2210 };
2211
2212 co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
2213 qemu_coroutine_enter(co, &data);
2214 while (!data.done) {
2215 qemu_aio_wait();
2216 }
2217 return data.ret;
2218 }
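
/*
 * Illustrative sketch (editor's addition, not part of the original
 * source): walking the allocation map of an open image.  Each call
 * advances by *pnum, which covers a run of sectors in the same state.
 *
 *   static void dump_allocation_map(BlockDriverState *bs)
 *   {
 *       int64_t sector = 0;
 *       int n;
 *
 *       while (sector < bs->total_sectors) {
 *           int64_t left = bs->total_sectors - sector;
 *           int chunk = left < 65536 ? (int)left : 65536;
 *           int allocated = bdrv_is_allocated(bs, sector, chunk, &n);
 *
 *           printf("%" PRId64 "+%d: %s\n", sector, n,
 *                  allocated ? "allocated" : "unallocated");
 *           sector += n;
 *       }
 *   }
 */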
2219
2220 void bdrv_mon_event(const BlockDriverState *bdrv,
2221 BlockMonEventAction action, int is_read)
2222 {
2223 QObject *data;
2224 const char *action_str;
2225
2226 switch (action) {
2227 case BDRV_ACTION_REPORT:
2228 action_str = "report";
2229 break;
2230 case BDRV_ACTION_IGNORE:
2231 action_str = "ignore";
2232 break;
2233 case BDRV_ACTION_STOP:
2234 action_str = "stop";
2235 break;
2236 default:
2237 abort();
2238 }
2239
2240 data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
2241 bdrv->device_name,
2242 action_str,
2243 is_read ? "read" : "write");
2244 monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);
2245
2246 qobject_decref(data);
2247 }
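
/*
 * For reference (editor's note): for a failed write on drive "ide0-hd0"
 * with action == BDRV_ACTION_STOP, the code above emits a monitor event
 * of roughly this shape:
 *
 *   { "event": "BLOCK_IO_ERROR",
 *     "data": { "device": "ide0-hd0", "action": "stop",
 *               "operation": "write" } }
 */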
2248
2249 BlockInfoList *qmp_query_block(Error **errp)
2250 {
2251 BlockInfoList *head = NULL, *cur_item = NULL;
2252 BlockDriverState *bs;
2253
2254 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2255 BlockInfoList *info = g_malloc0(sizeof(*info));
2256
2257 info->value = g_malloc0(sizeof(*info->value));
2258 info->value->device = g_strdup(bs->device_name);
2259 info->value->type = g_strdup("unknown");
2260 info->value->locked = bdrv_dev_is_medium_locked(bs);
2261 info->value->removable = bdrv_dev_has_removable_media(bs);
2262
2263 if (bdrv_dev_has_removable_media(bs)) {
2264 info->value->has_tray_open = true;
2265 info->value->tray_open = bdrv_dev_is_tray_open(bs);
2266 }
2267
2268 if (bdrv_iostatus_is_enabled(bs)) {
2269 info->value->has_io_status = true;
2270 info->value->io_status = bs->iostatus;
2271 }
2272
2273 if (bs->drv) {
2274 info->value->has_inserted = true;
2275 info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
2276 info->value->inserted->file = g_strdup(bs->filename);
2277 info->value->inserted->ro = bs->read_only;
2278 info->value->inserted->drv = g_strdup(bs->drv->format_name);
2279 info->value->inserted->encrypted = bs->encrypted;
2280 if (bs->backing_file[0]) {
2281 info->value->inserted->has_backing_file = true;
2282 info->value->inserted->backing_file = g_strdup(bs->backing_file);
2283 }
2284
2285 if (bs->io_limits_enabled) {
2286 info->value->inserted->bps =
2287 bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
2288 info->value->inserted->bps_rd =
2289 bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
2290 info->value->inserted->bps_wr =
2291 bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
2292 info->value->inserted->iops =
2293 bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
2294 info->value->inserted->iops_rd =
2295 bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
2296 info->value->inserted->iops_wr =
2297 bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
2298 }
2299 }
2300
2301 /* XXX: waiting for the qapi to support GSList */
2302 if (!cur_item) {
2303 head = cur_item = info;
2304 } else {
2305 cur_item->next = info;
2306 cur_item = info;
2307 }
2308 }
2309
2310 return head;
2311 }
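
/*
 * For reference (editor's note): one element of the list built above
 * serializes over QMP roughly as
 *
 *   { "device": "ide0-hd0", "type": "unknown", "removable": false,
 *     "locked": false,
 *     "inserted": { "file": "disk.qcow2", "ro": false, "drv": "qcow2",
 *                   "encrypted": false } }
 *
 * with the bps/iops members added when I/O limits are enabled, plus
 * tray and I/O status information when applicable.
 */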
2312
2313 /* Consider exposing this as a full-fledged QMP command */
2314 static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
2315 {
2316 BlockStats *s;
2317
2318 s = g_malloc0(sizeof(*s));
2319
2320 if (bs->device_name[0]) {
2321 s->has_device = true;
2322 s->device = g_strdup(bs->device_name);
2323 }
2324
2325 s->stats = g_malloc0(sizeof(*s->stats));
2326 s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
2327 s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
2328 s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
2329 s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
2330 s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
2331 s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
2332 s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
2333 s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
2334 s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];
2335
2336 if (bs->file) {
2337 s->has_parent = true;
2338 s->parent = qmp_query_blockstat(bs->file, NULL);
2339 }
2340
2341 return s;
2342 }
2343
2344 BlockStatsList *qmp_query_blockstats(Error **errp)
2345 {
2346 BlockStatsList *head = NULL, *cur_item = NULL;
2347 BlockDriverState *bs;
2348
2349 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2350 BlockStatsList *info = g_malloc0(sizeof(*info));
2351 info->value = qmp_query_blockstat(bs, NULL);
2352
2353 /* XXX: waiting for the qapi to support GSList */
2354 if (!cur_item) {
2355 head = cur_item = info;
2356 } else {
2357 cur_item->next = info;
2358 cur_item = info;
2359 }
2360 }
2361
2362 return head;
2363 }
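
/*
 * For reference (editor's note): each element produced above carries
 * the accounting counters maintained by bdrv_acct_done(), e.g.
 *
 *   { "device": "ide0-hd0",
 *     "stats": { "rd_bytes": 512, "wr_bytes": 0, "rd_operations": 1,
 *                "wr_operations": 0, "flush_operations": 0, ... },
 *     "parent": { "stats": { ... } } }
 */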
2364
2365 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
2366 {
2367 if (bs->backing_hd && bs->backing_hd->encrypted)
2368 return bs->backing_file;
2369 else if (bs->encrypted)
2370 return bs->filename;
2371 else
2372 return NULL;
2373 }
2374
2375 void bdrv_get_backing_filename(BlockDriverState *bs,
2376 char *filename, int filename_size)
2377 {
2378 pstrcpy(filename, filename_size, bs->backing_file);
2379 }
2380
2381 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
2382 const uint8_t *buf, int nb_sectors)
2383 {
2384 BlockDriver *drv = bs->drv;
2385 if (!drv)
2386 return -ENOMEDIUM;
2387 if (!drv->bdrv_write_compressed)
2388 return -ENOTSUP;
2389 if (bdrv_check_request(bs, sector_num, nb_sectors))
2390 return -EIO;
2391
2392 if (bs->dirty_bitmap) {
2393 set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2394 }
2395
2396 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
2397 }
2398
2399 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2400 {
2401 BlockDriver *drv = bs->drv;
2402 if (!drv)
2403 return -ENOMEDIUM;
2404 if (!drv->bdrv_get_info)
2405 return -ENOTSUP;
2406 memset(bdi, 0, sizeof(*bdi));
2407 return drv->bdrv_get_info(bs, bdi);
2408 }
2409
2410 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2411 int64_t pos, int size)
2412 {
2413 BlockDriver *drv = bs->drv;
2414 if (!drv)
2415 return -ENOMEDIUM;
2416 if (drv->bdrv_save_vmstate)
2417 return drv->bdrv_save_vmstate(bs, buf, pos, size);
2418 if (bs->file)
2419 return bdrv_save_vmstate(bs->file, buf, pos, size);
2420 return -ENOTSUP;
2421 }
2422
2423 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2424 int64_t pos, int size)
2425 {
2426 BlockDriver *drv = bs->drv;
2427 if (!drv)
2428 return -ENOMEDIUM;
2429 if (drv->bdrv_load_vmstate)
2430 return drv->bdrv_load_vmstate(bs, buf, pos, size);
2431 if (bs->file)
2432 return bdrv_load_vmstate(bs->file, buf, pos, size);
2433 return -ENOTSUP;
2434 }
2435
2436 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
2437 {
2438 BlockDriver *drv = bs->drv;
2439
2440 if (!drv || !drv->bdrv_debug_event) {
2441 return;
2442 }
2443
2444 drv->bdrv_debug_event(bs, event);
2445
2446 }
2447
2448 /**************************************************************/
2449 /* handling of snapshots */
2450
2451 int bdrv_can_snapshot(BlockDriverState *bs)
2452 {
2453 BlockDriver *drv = bs->drv;
2454 if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
2455 return 0;
2456 }
2457
2458 if (!drv->bdrv_snapshot_create) {
2459 if (bs->file != NULL) {
2460 return bdrv_can_snapshot(bs->file);
2461 }
2462 return 0;
2463 }
2464
2465 return 1;
2466 }
2467
2468 int bdrv_is_snapshot(BlockDriverState *bs)
2469 {
2470 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
2471 }
2472
2473 BlockDriverState *bdrv_snapshots(void)
2474 {
2475 BlockDriverState *bs;
2476
2477 if (bs_snapshots) {
2478 return bs_snapshots;
2479 }
2480
2481 bs = NULL;
2482 while ((bs = bdrv_next(bs))) {
2483 if (bdrv_can_snapshot(bs)) {
2484 bs_snapshots = bs;
2485 return bs;
2486 }
2487 }
2488 return NULL;
2489 }
2490
2491 int bdrv_snapshot_create(BlockDriverState *bs,
2492 QEMUSnapshotInfo *sn_info)
2493 {
2494 BlockDriver *drv = bs->drv;
2495 if (!drv)
2496 return -ENOMEDIUM;
2497 if (drv->bdrv_snapshot_create)
2498 return drv->bdrv_snapshot_create(bs, sn_info);
2499 if (bs->file)
2500 return bdrv_snapshot_create(bs->file, sn_info);
2501 return -ENOTSUP;
2502 }
2503
2504 int bdrv_snapshot_goto(BlockDriverState *bs,
2505 const char *snapshot_id)
2506 {
2507 BlockDriver *drv = bs->drv;
2508 int ret, open_ret;
2509
2510 if (!drv)
2511 return -ENOMEDIUM;
2512 if (drv->bdrv_snapshot_goto)
2513 return drv->bdrv_snapshot_goto(bs, snapshot_id);
2514
2515 if (bs->file) {
2516 drv->bdrv_close(bs);
2517 ret = bdrv_snapshot_goto(bs->file, snapshot_id);
2518 open_ret = drv->bdrv_open(bs, bs->open_flags);
2519 if (open_ret < 0) {
2520 bdrv_delete(bs->file);
2521 bs->drv = NULL;
2522 return open_ret;
2523 }
2524 return ret;
2525 }
2526
2527 return -ENOTSUP;
2528 }
2529
2530 int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
2531 {
2532 BlockDriver *drv = bs->drv;
2533 if (!drv)
2534 return -ENOMEDIUM;
2535 if (drv->bdrv_snapshot_delete)
2536 return drv->bdrv_snapshot_delete(bs, snapshot_id);
2537 if (bs->file)
2538 return bdrv_snapshot_delete(bs->file, snapshot_id);
2539 return -ENOTSUP;
2540 }
2541
2542 int bdrv_snapshot_list(BlockDriverState *bs,
2543 QEMUSnapshotInfo **psn_info)
2544 {
2545 BlockDriver *drv = bs->drv;
2546 if (!drv)
2547 return -ENOMEDIUM;
2548 if (drv->bdrv_snapshot_list)
2549 return drv->bdrv_snapshot_list(bs, psn_info);
2550 if (bs->file)
2551 return bdrv_snapshot_list(bs->file, psn_info);
2552 return -ENOTSUP;
2553 }
2554
2555 int bdrv_snapshot_load_tmp(BlockDriverState *bs,
2556 const char *snapshot_name)
2557 {
2558 BlockDriver *drv = bs->drv;
2559 if (!drv) {
2560 return -ENOMEDIUM;
2561 }
2562 if (!bs->read_only) {
2563 return -EINVAL;
2564 }
2565 if (drv->bdrv_snapshot_load_tmp) {
2566 return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
2567 }
2568 return -ENOTSUP;
2569 }
2570
2571 #define NB_SUFFIXES 4
2572
2573 char *get_human_readable_size(char *buf, int buf_size, int64_t size)
2574 {
2575 static const char suffixes[NB_SUFFIXES] = "KMGT";
2576 int64_t base;
2577 int i;
2578
2579 if (size <= 999) {
2580 snprintf(buf, buf_size, "%" PRId64, size);
2581 } else {
2582 base = 1024;
2583 for (i = 0; i < NB_SUFFIXES; i++) {
2584 if (size < (10 * base)) {
2585 snprintf(buf, buf_size, "%0.1f%c",
2586 (double)size / base,
2587 suffixes[i]);
2588 break;
2589 } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) {
2590 snprintf(buf, buf_size, "%" PRId64 "%c",
2591 ((size + (base >> 1)) / base),
2592 suffixes[i]);
2593 break;
2594 }
2595 base = base * 1024;
2596 }
2597 }
2598 return buf;
2599 }
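
/*
 * For reference (editor's note): sample results of the function above:
 * 512 -> "512", 1536 -> "1.5K", 1048576 -> "1.0M", and
 * 10485760 -> "10M" (values >= 10x the unit are rounded to an integer).
 */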
2600
2601 char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
2602 {
2603 char buf1[128], date_buf[128], clock_buf[128];
2604 #ifdef _WIN32
2605 struct tm *ptm;
2606 #else
2607 struct tm tm;
2608 #endif
2609 time_t ti;
2610 int64_t secs;
2611
2612 if (!sn) {
2613 snprintf(buf, buf_size,
2614 "%-10s%-20s%7s%20s%15s",
2615 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
2616 } else {
2617 ti = sn->date_sec;
2618 #ifdef _WIN32
2619 ptm = localtime(&ti);
2620 strftime(date_buf, sizeof(date_buf),
2621 "%Y-%m-%d %H:%M:%S", ptm);
2622 #else
2623 localtime_r(&ti, &tm);
2624 strftime(date_buf, sizeof(date_buf),
2625 "%Y-%m-%d %H:%M:%S", &tm);
2626 #endif
2627 secs = sn->vm_clock_nsec / 1000000000;
2628 snprintf(clock_buf, sizeof(clock_buf),
2629 "%02d:%02d:%02d.%03d",
2630 (int)(secs / 3600),
2631 (int)((secs / 60) % 60),
2632 (int)(secs % 60),
2633 (int)((sn->vm_clock_nsec / 1000000) % 1000));
2634 snprintf(buf, buf_size,
2635 "%-10s%-20s%7s%20s%15s",
2636 sn->id_str, sn->name,
2637 get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
2638 date_buf,
2639 clock_buf);
2640 }
2641 return buf;
2642 }
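
/*
 * For reference (editor's note): bdrv_snapshot_dump() renders one line
 * of the "info snapshots" table.  With sn == NULL it produces the
 * header "ID  TAG  VM SIZE  DATE  VM CLOCK"; otherwise one row such as
 * "1  mysnap  41.2M  2012-01-10 14:33:08  00:04:15.125" (columns padded
 * by the format strings above).
 */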
2643
2644 /**************************************************************/
2645 /* async I/Os */
2646
2647 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
2648 QEMUIOVector *qiov, int nb_sectors,
2649 BlockDriverCompletionFunc *cb, void *opaque)
2650 {
2651 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
2652
2653 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
2654 cb, opaque, false);
2655 }
2656
2657 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
2658 QEMUIOVector *qiov, int nb_sectors,
2659 BlockDriverCompletionFunc *cb, void *opaque)
2660 {
2661 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
2662
2663 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
2664 cb, opaque, true);
2665 }
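
/*
 * Illustrative sketch (editor's addition): a typical caller of the AIO
 * interface above.  The callback name and buffer handling are
 * hypothetical.
 *
 *   static void my_read_cb(void *opaque, int ret)
 *   {
 *       if (ret < 0) {
 *           error_report("read failed: %s", strerror(-ret));
 *       }
 *   }
 *
 *   QEMUIOVector qiov;
 *   void *buf = qemu_blockalign(bs, 512);
 *
 *   qemu_iovec_init(&qiov, 1);
 *   qemu_iovec_add(&qiov, buf, 512);
 *   bdrv_aio_readv(bs, 0, &qiov, 1, my_read_cb, NULL);
 */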
2666
2667
2668 typedef struct MultiwriteCB {
2669 int error;
2670 int num_requests;
2671 int num_callbacks;
2672 struct {
2673 BlockDriverCompletionFunc *cb;
2674 void *opaque;
2675 QEMUIOVector *free_qiov;
2676 void *free_buf;
2677 } callbacks[];
2678 } MultiwriteCB;
2679
2680 static void multiwrite_user_cb(MultiwriteCB *mcb)
2681 {
2682 int i;
2683
2684 for (i = 0; i < mcb->num_callbacks; i++) {
2685 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
2686 if (mcb->callbacks[i].free_qiov) {
2687 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
2688 }
2689 g_free(mcb->callbacks[i].free_qiov);
2690 qemu_vfree(mcb->callbacks[i].free_buf);
2691 }
2692 }
2693
2694 static void multiwrite_cb(void *opaque, int ret)
2695 {
2696 MultiwriteCB *mcb = opaque;
2697
2698 trace_multiwrite_cb(mcb, ret);
2699
2700 if (ret < 0 && !mcb->error) {
2701 mcb->error = ret;
2702 }
2703
2704 mcb->num_requests--;
2705 if (mcb->num_requests == 0) {
2706 multiwrite_user_cb(mcb);
2707 g_free(mcb);
2708 }
2709 }
2710
2711 static int multiwrite_req_compare(const void *a, const void *b)
2712 {
2713 const BlockRequest *req1 = a, *req2 = b;
2714
2715 /*
2716 * Note that we can't simply subtract req2->sector from req1->sector
2717 * here as that could overflow the return value.
2718 */
2719 if (req1->sector > req2->sector) {
2720 return 1;
2721 } else if (req1->sector < req2->sector) {
2722 return -1;
2723 } else {
2724 return 0;
2725 }
2726 }
2727
2728 /*
2729 * Takes a bunch of requests and tries to merge them. Returns the number of
2730 * requests that remain after merging.
2731 */
2732 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
2733 int num_reqs, MultiwriteCB *mcb)
2734 {
2735 int i, outidx;
2736
2737 // Sort requests by start sector
2738 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
2739
2740 // Check if adjacent requests touch the same clusters. If so, combine them,
2741 // filling up gaps with zero sectors.
2742 outidx = 0;
2743 for (i = 1; i < num_reqs; i++) {
2744 int merge = 0;
2745 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
2746
2747 // This handles the cases that are valid for all block drivers, namely
2748 // exactly sequential writes and overlapping writes.
2749 if (reqs[i].sector <= oldreq_last) {
2750 merge = 1;
2751 }
2752
2753 // The block driver may decide that it makes sense to combine requests
2754 // even if there is a gap of some sectors between them. In this case,
2755 // the gap is filled with zeros (therefore this only applies to not yet
2756 // allocated space in formats like qcow2).
2757 if (!merge && bs->drv->bdrv_merge_requests) {
2758 merge = bs->drv->bdrv_merge_requests(bs, &reqs[outidx], &reqs[i]);
2759 }
2760
2761 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
2762 merge = 0;
2763 }
2764
2765 if (merge) {
2766 size_t size;
2767 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
2768 qemu_iovec_init(qiov,
2769 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
2770
2771 // Add the first request to the merged one. If the requests are
2772 // overlapping, drop the last sectors of the first request.
2773 size = (reqs[i].sector - reqs[outidx].sector) << 9;
2774 qemu_iovec_concat(qiov, reqs[outidx].qiov, size);
2775
2776 // We might need to add some zeros between the two requests
2777 if (reqs[i].sector > oldreq_last) {
2778 size_t zero_bytes = (reqs[i].sector - oldreq_last) << 9;
2779 uint8_t *buf = qemu_blockalign(bs, zero_bytes);
2780 memset(buf, 0, zero_bytes);
2781 qemu_iovec_add(qiov, buf, zero_bytes);
2782 mcb->callbacks[i].free_buf = buf;
2783 }
2784
2785 // Add the second request
2786 qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);
2787
2788 reqs[outidx].nb_sectors = qiov->size >> 9;
2789 reqs[outidx].qiov = qiov;
2790
2791 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
2792 } else {
2793 outidx++;
2794 reqs[outidx].sector = reqs[i].sector;
2795 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
2796 reqs[outidx].qiov = reqs[i].qiov;
2797 }
2798 }
2799
2800 return outidx + 1;
2801 }
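
/*
 * Worked example (editor's note): for requests covering sectors [0,7]
 * and [8,15], oldreq_last == 8 equals the second request's start, so
 * both are merged into one request for [0,15].  For [0,7] and [10,15]
 * the merge only happens if the driver's bdrv_merge_requests hook
 * accepts it, in which case sectors 8-9 are filled with zeros.
 */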
2802
2803 /*
2804 * Submit multiple AIO write requests at once.
2805 *
2806 * On success, the function returns 0 and all requests in the reqs array have
2807 * been submitted. On error, this function returns -1 and any of the requests
2808 * may or may not have been submitted. In particular, the callback will be
2809 * invoked for some of the requests but not for others. The caller must check
2810 * the error field of each BlockRequest to know which callbacks to wait for
2811 * (if error != 0, no callback will be called for that request).
2812 *
2813 * The implementation may modify the contents of the reqs array, e.g. to merge
2814 * requests. However, the fields opaque and error are left unmodified as they
2815 * are used to signal failure for a single request to the caller.
2816 */
2817 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
2818 {
2819 MultiwriteCB *mcb;
2820 int i;
2821
2822 /* don't submit writes if we don't have a medium */
2823 if (bs->drv == NULL) {
2824 for (i = 0; i < num_reqs; i++) {
2825 reqs[i].error = -ENOMEDIUM;
2826 }
2827 return -1;
2828 }
2829
2830 if (num_reqs == 0) {
2831 return 0;
2832 }
2833
2834 // Create MultiwriteCB structure
2835 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
2836 mcb->num_requests = 0;
2837 mcb->num_callbacks = num_reqs;
2838
2839 for (i = 0; i < num_reqs; i++) {
2840 mcb->callbacks[i].cb = reqs[i].cb;
2841 mcb->callbacks[i].opaque = reqs[i].opaque;
2842 }
2843
2844 // Check for mergeable requests
2845 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
2846
2847 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
2848
2849 /* Run the aio requests. */
2850 mcb->num_requests = num_reqs;
2851 for (i = 0; i < num_reqs; i++) {
2852 bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
2853 reqs[i].nb_sectors, multiwrite_cb, mcb);
2854 }
2855
2856 return 0;
2857 }
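
/*
 * Illustrative sketch (editor's addition): batching two writes.  The
 * callback, opaque values and iovecs are hypothetical.
 *
 *   BlockRequest reqs[2] = {
 *       { .sector = 0,  .nb_sectors = 8, .qiov = &qiov0,
 *         .cb = my_write_cb, .opaque = ctx0 },
 *       { .sector = 16, .nb_sectors = 8, .qiov = &qiov1,
 *         .cb = my_write_cb, .opaque = ctx1 },
 *   };
 *
 *   if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *       for (i = 0; i < 2; i++) {
 *           if (reqs[i].error) {
 *               // submission failed, no callback for this request
 *           }
 *       }
 *   }
 */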
2858
2859 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
2860 {
2861 acb->pool->cancel(acb);
2862 }
2863
2864 /* block I/O throttling */
2865 static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
2866 bool is_write, double elapsed_time, uint64_t *wait)
2867 {
2868 uint64_t bps_limit = 0;
2869 double bytes_limit, bytes_base, bytes_res;
2870 double slice_time, wait_time;
2871
2872 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
2873 bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
2874 } else if (bs->io_limits.bps[is_write]) {
2875 bps_limit = bs->io_limits.bps[is_write];
2876 } else {
2877 if (wait) {
2878 *wait = 0;
2879 }
2880
2881 return false;
2882 }
2883
2884 slice_time = bs->slice_end - bs->slice_start;
2885 slice_time /= (NANOSECONDS_PER_SECOND);
2886 bytes_limit = bps_limit * slice_time;
2887 bytes_base = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
2888 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
2889 bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
2890 }
2891
2892 /* bytes_base: the bytes of data that have already been read/written,
2893 * obtained from the accounting history.
2894 * bytes_res: the remaining bytes of data that still need to be read/written.
2895 * (bytes_base + bytes_res) / bps_limit: the total time needed to
2896 * complete reading/writing all of the data.
2897 */
2898 bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
2899
2900 if (bytes_base + bytes_res <= bytes_limit) {
2901 if (wait) {
2902 *wait = 0;
2903 }
2904
2905 return false;
2906 }
2907
2908 /* Calc approx time to dispatch */
2909 wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
2910
2911 /* When the I/O rate at runtime exceeds the limit,
2912 * bs->slice_end needs to be extended so that the current statistics
2913 * can be kept until the timer fires; the extension factor below was
2914 * tuned experimentally.
2915 */
2916 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
2917 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
2918 if (wait) {
2919 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
2920 }
2921
2922 return true;
2923 }
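
/*
 * Worked example (editor's note, numbers illustrative): with
 * bps_limit = 1 MB/s and a 100 ms slice, bytes_limit is about 100 KB.
 * If 90 KB have already been transferred in this slice (bytes_base) and
 * a 32 KB request arrives (bytes_res), 122 KB exceeds the limit, so the
 * request must wait roughly (122 KB / 1 MB/s) - elapsed_time before it
 * may be dispatched.
 */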
2924
2925 static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
2926 double elapsed_time, uint64_t *wait)
2927 {
2928 uint64_t iops_limit = 0;
2929 double ios_limit, ios_base;
2930 double slice_time, wait_time;
2931
2932 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
2933 iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
2934 } else if (bs->io_limits.iops[is_write]) {
2935 iops_limit = bs->io_limits.iops[is_write];
2936 } else {
2937 if (wait) {
2938 *wait = 0;
2939 }
2940
2941 return false;
2942 }
2943
2944 slice_time = bs->slice_end - bs->slice_start;
2945 slice_time /= (NANOSECONDS_PER_SECOND);
2946 ios_limit = iops_limit * slice_time;
2947 ios_base = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
2948 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
2949 ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
2950 }
2951
2952 if (ios_base + 1 <= ios_limit) {
2953 if (wait) {
2954 *wait = 0;
2955 }
2956
2957 return false;
2958 }
2959
2960 /* Calc approx time to dispatch */
2961 wait_time = (ios_base + 1) / iops_limit;
2962 if (wait_time > elapsed_time) {
2963 wait_time = wait_time - elapsed_time;
2964 } else {
2965 wait_time = 0;
2966 }
2967
2968 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
2969 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
2970 if (wait) {
2971 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
2972 }
2973
2974 return true;
2975 }
2976
2977 static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
2978 bool is_write, int64_t *wait)
2979 {
2980 int64_t now, max_wait;
2981 uint64_t bps_wait = 0, iops_wait = 0;
2982 double elapsed_time;
2983 int bps_ret, iops_ret;
2984
2985 now = qemu_get_clock_ns(vm_clock);
2986 if ((bs->slice_start < now)
2987 && (bs->slice_end > now)) {
2988 bs->slice_end = now + bs->slice_time;
2989 } else {
2990 bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
2991 bs->slice_start = now;
2992 bs->slice_end = now + bs->slice_time;
2993
2994 bs->io_base.bytes[is_write] = bs->nr_bytes[is_write];
2995 bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];
2996
2997 bs->io_base.ios[is_write] = bs->nr_ops[is_write];
2998 bs->io_base.ios[!is_write] = bs->nr_ops[!is_write];
2999 }
3000
3001 elapsed_time = now - bs->slice_start;
3002 elapsed_time /= (NANOSECONDS_PER_SECOND);
3003
3004 bps_ret = bdrv_exceed_bps_limits(bs, nb_sectors,
3005 is_write, elapsed_time, &bps_wait);
3006 iops_ret = bdrv_exceed_iops_limits(bs, is_write,
3007 elapsed_time, &iops_wait);
3008 if (bps_ret || iops_ret) {
3009 max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
3010 if (wait) {
3011 *wait = max_wait;
3012 }
3013
3014 now = qemu_get_clock_ns(vm_clock);
3015 if (bs->slice_end < now + max_wait) {
3016 bs->slice_end = now + max_wait;
3017 }
3018
3019 return true;
3020 }
3021
3022 if (wait) {
3023 *wait = 0;
3024 }
3025
3026 return false;
3027 }
3028
3029 /**************************************************************/
3030 /* async block device emulation */
3031
3032 typedef struct BlockDriverAIOCBSync {
3033 BlockDriverAIOCB common;
3034 QEMUBH *bh;
3035 int ret;
3036 /* vector translation state */
3037 QEMUIOVector *qiov;
3038 uint8_t *bounce;
3039 int is_write;
3040 } BlockDriverAIOCBSync;
3041
3042 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
3043 {
3044 BlockDriverAIOCBSync *acb =
3045 container_of(blockacb, BlockDriverAIOCBSync, common);
3046 qemu_bh_delete(acb->bh);
3047 acb->bh = NULL;
3048 qemu_aio_release(acb);
3049 }
3050
3051 static AIOPool bdrv_em_aio_pool = {
3052 .aiocb_size = sizeof(BlockDriverAIOCBSync),
3053 .cancel = bdrv_aio_cancel_em,
3054 };
3055
3056 static void bdrv_aio_bh_cb(void *opaque)
3057 {
3058 BlockDriverAIOCBSync *acb = opaque;
3059
3060 if (!acb->is_write)
3061 qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
3062 qemu_vfree(acb->bounce);
3063 acb->common.cb(acb->common.opaque, acb->ret);
3064 qemu_bh_delete(acb->bh);
3065 acb->bh = NULL;
3066 qemu_aio_release(acb);
3067 }
3068
3069 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
3070 int64_t sector_num,
3071 QEMUIOVector *qiov,
3072 int nb_sectors,
3073 BlockDriverCompletionFunc *cb,
3074 void *opaque,
3075 int is_write)
3076
3077 {
3078 BlockDriverAIOCBSync *acb;
3079
3080 acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
3081 acb->is_write = is_write;
3082 acb->qiov = qiov;
3083 acb->bounce = qemu_blockalign(bs, qiov->size);
3084 acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
3085
3086 if (is_write) {
3087 qemu_iovec_to_buffer(acb->qiov, acb->bounce);
3088 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
3089 } else {
3090 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
3091 }
3092
3093 qemu_bh_schedule(acb->bh);
3094
3095 return &acb->common;
3096 }
3097
3098 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
3099 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3100 BlockDriverCompletionFunc *cb, void *opaque)
3101 {
3102 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
3103 }
3104
3105 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
3106 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3107 BlockDriverCompletionFunc *cb, void *opaque)
3108 {
3109 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
3110 }
3111
3112
3113 typedef struct BlockDriverAIOCBCoroutine {
3114 BlockDriverAIOCB common;
3115 BlockRequest req;
3116 bool is_write;
3117 QEMUBH* bh;
3118 } BlockDriverAIOCBCoroutine;
3119
3120 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
3121 {
3122 qemu_aio_flush();
3123 }
3124
3125 static AIOPool bdrv_em_co_aio_pool = {
3126 .aiocb_size = sizeof(BlockDriverAIOCBCoroutine),
3127 .cancel = bdrv_aio_co_cancel_em,
3128 };
3129
3130 static void bdrv_co_em_bh(void *opaque)
3131 {
3132 BlockDriverAIOCBCoroutine *acb = opaque;
3133
3134 acb->common.cb(acb->common.opaque, acb->req.error);
3135 qemu_bh_delete(acb->bh);
3136 qemu_aio_release(acb);
3137 }
3138
3139 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
3140 static void coroutine_fn bdrv_co_do_rw(void *opaque)
3141 {
3142 BlockDriverAIOCBCoroutine *acb = opaque;
3143 BlockDriverState *bs = acb->common.bs;
3144
3145 if (!acb->is_write) {
3146 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
3147 acb->req.nb_sectors, acb->req.qiov);
3148 } else {
3149 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
3150 acb->req.nb_sectors, acb->req.qiov);
3151 }
3152
3153 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3154 qemu_bh_schedule(acb->bh);
3155 }
3156
3157 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
3158 int64_t sector_num,
3159 QEMUIOVector *qiov,
3160 int nb_sectors,
3161 BlockDriverCompletionFunc *cb,
3162 void *opaque,
3163 bool is_write)
3164 {
3165 Coroutine *co;
3166 BlockDriverAIOCBCoroutine *acb;
3167
3168 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3169 acb->req.sector = sector_num;
3170 acb->req.nb_sectors = nb_sectors;
3171 acb->req.qiov = qiov;
3172 acb->is_write = is_write;
3173
3174 co = qemu_coroutine_create(bdrv_co_do_rw);
3175 qemu_coroutine_enter(co, acb);
3176
3177 return &acb->common;
3178 }
3179
3180 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
3181 {
3182 BlockDriverAIOCBCoroutine *acb = opaque;
3183 BlockDriverState *bs = acb->common.bs;
3184
3185 acb->req.error = bdrv_co_flush(bs);
3186 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3187 qemu_bh_schedule(acb->bh);
3188 }
3189
3190 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
3191 BlockDriverCompletionFunc *cb, void *opaque)
3192 {
3193 trace_bdrv_aio_flush(bs, opaque);
3194
3195 Coroutine *co;
3196 BlockDriverAIOCBCoroutine *acb;
3197
3198 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3199 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
3200 qemu_coroutine_enter(co, acb);
3201
3202 return &acb->common;
3203 }
3204
3205 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
3206 {
3207 BlockDriverAIOCBCoroutine *acb = opaque;
3208 BlockDriverState *bs = acb->common.bs;
3209
3210 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
3211 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3212 qemu_bh_schedule(acb->bh);
3213 }
3214
3215 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
3216 int64_t sector_num, int nb_sectors,
3217 BlockDriverCompletionFunc *cb, void *opaque)
3218 {
3219 Coroutine *co;
3220 BlockDriverAIOCBCoroutine *acb;
3221
3222 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
3223
3224 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3225 acb->req.sector = sector_num;
3226 acb->req.nb_sectors = nb_sectors;
3227 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
3228 qemu_coroutine_enter(co, acb);
3229
3230 return &acb->common;
3231 }
3232
3233 void bdrv_init(void)
3234 {
3235 module_call_init(MODULE_INIT_BLOCK);
3236 }
3237
3238 void bdrv_init_with_whitelist(void)
3239 {
3240 use_bdrv_whitelist = 1;
3241 bdrv_init();
3242 }
3243
3244 void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
3245 BlockDriverCompletionFunc *cb, void *opaque)
3246 {
3247 BlockDriverAIOCB *acb;
3248
3249 if (pool->free_aiocb) {
3250 acb = pool->free_aiocb;
3251 pool->free_aiocb = acb->next;
3252 } else {
3253 acb = g_malloc0(pool->aiocb_size);
3254 acb->pool = pool;
3255 }
3256 acb->bs = bs;
3257 acb->cb = cb;
3258 acb->opaque = opaque;
3259 return acb;
3260 }
3261
3262 void qemu_aio_release(void *p)
3263 {
3264 BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
3265 AIOPool *pool = acb->pool;
3266 acb->next = pool->free_aiocb;
3267 pool->free_aiocb = acb;
3268 }
3269
3270 /**************************************************************/
3271 /* Coroutine block device emulation */
3272
3273 typedef struct CoroutineIOCompletion {
3274 Coroutine *coroutine;
3275 int ret;
3276 } CoroutineIOCompletion;
3277
3278 static void bdrv_co_io_em_complete(void *opaque, int ret)
3279 {
3280 CoroutineIOCompletion *co = opaque;
3281
3282 co->ret = ret;
3283 qemu_coroutine_enter(co->coroutine, NULL);
3284 }
3285
3286 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
3287 int nb_sectors, QEMUIOVector *iov,
3288 bool is_write)
3289 {
3290 CoroutineIOCompletion co = {
3291 .coroutine = qemu_coroutine_self(),
3292 };
3293 BlockDriverAIOCB *acb;
3294
3295 if (is_write) {
3296 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
3297 bdrv_co_io_em_complete, &co);
3298 } else {
3299 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
3300 bdrv_co_io_em_complete, &co);
3301 }
3302
3303 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
3304 if (!acb) {
3305 return -EIO;
3306 }
3307 qemu_coroutine_yield();
3308
3309 return co.ret;
3310 }
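
/*
 * Editor's note: bdrv_co_io_em() is the generic AIO-to-coroutine
 * bridge.  The coroutine submits the request with
 * bdrv_co_io_em_complete() as the completion callback and yields; the
 * callback later re-enters the coroutine with the result in co.ret.
 * bdrv_co_flush() and bdrv_co_discard() below reuse the same
 * CoroutineIOCompletion pattern for their bdrv_aio_* fallbacks.
 */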
3311
3312 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
3313 int64_t sector_num, int nb_sectors,
3314 QEMUIOVector *iov)
3315 {
3316 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
3317 }
3318
3319 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
3320 int64_t sector_num, int nb_sectors,
3321 QEMUIOVector *iov)
3322 {
3323 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
3324 }
3325
3326 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
3327 {
3328 RwCo *rwco = opaque;
3329
3330 rwco->ret = bdrv_co_flush(rwco->bs);
3331 }
3332
3333 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
3334 {
3335 int ret;
3336
3337 if (!bs->drv) {
3338 return 0;
3339 }
3340
3341 /* Write back cached data to the OS even with cache=unsafe */
3342 if (bs->drv->bdrv_co_flush_to_os) {
3343 ret = bs->drv->bdrv_co_flush_to_os(bs);
3344 if (ret < 0) {
3345 return ret;
3346 }
3347 }
3348
3349 /* But don't actually force it to the disk with cache=unsafe */
3350 if (bs->open_flags & BDRV_O_NO_FLUSH) {
3351 return 0;
3352 }
3353
3354 if (bs->drv->bdrv_co_flush_to_disk) {
3355 return bs->drv->bdrv_co_flush_to_disk(bs);
3356 } else if (bs->drv->bdrv_aio_flush) {
3357 BlockDriverAIOCB *acb;
3358 CoroutineIOCompletion co = {
3359 .coroutine = qemu_coroutine_self(),
3360 };
3361
3362 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
3363 if (acb == NULL) {
3364 return -EIO;
3365 } else {
3366 qemu_coroutine_yield();
3367 return co.ret;
3368 }
3369 } else {
3370 /*
3371 * Some block drivers always operate in either writethrough or unsafe
3372 * mode and therefore don't support bdrv_flush. Usually qemu doesn't
3373 * know how the server works (because the behaviour is hardcoded or
3374 * depends on server-side configuration), so we can't ensure that
3375 * everything is safe on disk. Returning an error doesn't work because
3376 * that would break guests even if the server operates in writethrough
3377 * mode.
3378 *
3379 * Let's hope the user knows what he's doing.
3380 */
3381 return 0;
3382 }
3383 }
3384
3385 void bdrv_invalidate_cache(BlockDriverState *bs)
3386 {
3387 if (bs->drv && bs->drv->bdrv_invalidate_cache) {
3388 bs->drv->bdrv_invalidate_cache(bs);
3389 }
3390 }
3391
3392 void bdrv_invalidate_cache_all(void)
3393 {
3394 BlockDriverState *bs;
3395
3396 QTAILQ_FOREACH(bs, &bdrv_states, list) {
3397 bdrv_invalidate_cache(bs);
3398 }
3399 }
3400
3401 int bdrv_flush(BlockDriverState *bs)
3402 {
3403 Coroutine *co;
3404 RwCo rwco = {
3405 .bs = bs,
3406 .ret = NOT_DONE,
3407 };
3408
3409 if (qemu_in_coroutine()) {
3410 /* Fast-path if already in coroutine context */
3411 bdrv_flush_co_entry(&rwco);
3412 } else {
3413 co = qemu_coroutine_create(bdrv_flush_co_entry);
3414 qemu_coroutine_enter(co, &rwco);
3415 while (rwco.ret == NOT_DONE) {
3416 qemu_aio_wait();
3417 }
3418 }
3419
3420 return rwco.ret;
3421 }
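
/*
 * Editor's note: bdrv_flush(), bdrv_discard() and bdrv_is_allocated()
 * all follow the same synchronous-wrapper pattern: call the coroutine
 * entry directly when already running in coroutine context, otherwise
 * spawn a coroutine and pump qemu_aio_wait() until it reports
 * completion.
 */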
3422
3423 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
3424 {
3425 RwCo *rwco = opaque;
3426
3427 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
3428 }
3429
3430 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
3431 int nb_sectors)
3432 {
3433 if (!bs->drv) {
3434 return -ENOMEDIUM;
3435 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
3436 return -EIO;
3437 } else if (bs->read_only) {
3438 return -EROFS;
3439 } else if (bs->drv->bdrv_co_discard) {
3440 return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
3441 } else if (bs->drv->bdrv_aio_discard) {
3442 BlockDriverAIOCB *acb;
3443 CoroutineIOCompletion co = {
3444 .coroutine = qemu_coroutine_self(),
3445 };
3446
3447 acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
3448 bdrv_co_io_em_complete, &co);
3449 if (acb == NULL) {
3450 return -EIO;
3451 } else {
3452 qemu_coroutine_yield();
3453 return co.ret;
3454 }
3455 } else {
3456 return 0;
3457 }
3458 }
3459
3460 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
3461 {
3462 Coroutine *co;
3463 RwCo rwco = {
3464 .bs = bs,
3465 .sector_num = sector_num,
3466 .nb_sectors = nb_sectors,
3467 .ret = NOT_DONE,
3468 };
3469
3470 if (qemu_in_coroutine()) {
3471 /* Fast-path if already in coroutine context */
3472 bdrv_discard_co_entry(&rwco);
3473 } else {
3474 co = qemu_coroutine_create(bdrv_discard_co_entry);
3475 qemu_coroutine_enter(co, &rwco);
3476 while (rwco.ret == NOT_DONE) {
3477 qemu_aio_wait();
3478 }
3479 }
3480
3481 return rwco.ret;
3482 }
3483
3484 /**************************************************************/
3485 /* removable device support */
3486
3487 /**
3488 * Return TRUE if the media is present
3489 */
3490 int bdrv_is_inserted(BlockDriverState *bs)
3491 {
3492 BlockDriver *drv = bs->drv;
3493
3494 if (!drv)
3495 return 0;
3496 if (!drv->bdrv_is_inserted)
3497 return 1;
3498 return drv->bdrv_is_inserted(bs);
3499 }
3500
3501 /**
3502 * Return whether the media changed since the last call to this
3503 * function, or -ENOTSUP if we don't know. Most drivers don't know.
3504 */
3505 int bdrv_media_changed(BlockDriverState *bs)
3506 {
3507 BlockDriver *drv = bs->drv;
3508
3509 if (drv && drv->bdrv_media_changed) {
3510 return drv->bdrv_media_changed(bs);
3511 }
3512 return -ENOTSUP;
3513 }
3514
3515 /**
3516 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
3517 */
3518 void bdrv_eject(BlockDriverState *bs, int eject_flag)
3519 {
3520 BlockDriver *drv = bs->drv;
3521
3522 if (drv && drv->bdrv_eject) {
3523 drv->bdrv_eject(bs, eject_flag);
3524 }
3525 }
3526
3527 /**
3528 * Lock or unlock the media (if it is locked, the user won't be able
3529 * to eject it manually).
3530 */
3531 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
3532 {
3533 BlockDriver *drv = bs->drv;
3534
3535 trace_bdrv_lock_medium(bs, locked);
3536
3537 if (drv && drv->bdrv_lock_medium) {
3538 drv->bdrv_lock_medium(bs, locked);
3539 }
3540 }
3541
3542 /* needed for generic scsi interface */
3543
3544 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
3545 {
3546 BlockDriver *drv = bs->drv;
3547
3548 if (drv && drv->bdrv_ioctl)
3549 return drv->bdrv_ioctl(bs, req, buf);
3550 return -ENOTSUP;
3551 }
3552
3553 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
3554 unsigned long int req, void *buf,
3555 BlockDriverCompletionFunc *cb, void *opaque)
3556 {
3557 BlockDriver *drv = bs->drv;
3558
3559 if (drv && drv->bdrv_aio_ioctl)
3560 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
3561 return NULL;
3562 }
3563
3564 void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
3565 {
3566 bs->buffer_alignment = align;
3567 }
3568
3569 void *qemu_blockalign(BlockDriverState *bs, size_t size)
3570 {
3571 return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
3572 }
3573
3574 void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
3575 {
3576 int64_t bitmap_size;
3577
3578 bs->dirty_count = 0;
3579 if (enable) {
3580 if (!bs->dirty_bitmap) {
3581 bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
3582 BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
3583 bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
3584
3585 bs->dirty_bitmap = g_malloc0(bitmap_size);
3586 }
3587 } else {
3588 if (bs->dirty_bitmap) {
3589 g_free(bs->dirty_bitmap);
3590 bs->dirty_bitmap = NULL;
3591 }
3592 }
3593 }
3594
3595 int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
3596 {
3597 int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
3598
3599 if (bs->dirty_bitmap &&
3600 (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
3601 return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
3602 (1UL << (chunk % (sizeof(unsigned long) * 8))));
3603 } else {
3604 return 0;
3605 }
3606 }
3607
3608 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
3609 int nr_sectors)
3610 {
3611 set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
3612 }
3613
3614 int64_t bdrv_get_dirty_count(BlockDriverState *bs)
3615 {
3616 return bs->dirty_count;
3617 }
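
/*
 * Illustrative sketch (editor's addition): scanning the dirty bitmap
 * chunk by chunk, as a block-migration style client might.
 *
 *   int64_t sector;
 *
 *   for (sector = 0; sector < bs->total_sectors;
 *        sector += BDRV_SECTORS_PER_DIRTY_CHUNK) {
 *       if (bdrv_get_dirty(bs, sector)) {
 *           // copy BDRV_SECTORS_PER_DIRTY_CHUNK sectors from 'sector',
 *           // then clear the chunk:
 *           bdrv_reset_dirty(bs, sector, BDRV_SECTORS_PER_DIRTY_CHUNK);
 *       }
 *   }
 */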
3618
3619 void bdrv_set_in_use(BlockDriverState *bs, int in_use)
3620 {
3621 assert(bs->in_use != in_use);
3622 bs->in_use = in_use;
3623 }
3624
3625 int bdrv_in_use(BlockDriverState *bs)
3626 {
3627 return bs->in_use;
3628 }
3629
3630 void bdrv_iostatus_enable(BlockDriverState *bs)
3631 {
3632 bs->iostatus_enabled = true;
3633 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
3634 }
3635
3636 /* The I/O status is only enabled if the drive explicitly
3637 * enables it _and_ the VM is configured to stop on errors */
3638 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
3639 {
3640 return (bs->iostatus_enabled &&
3641 (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
3642 bs->on_write_error == BLOCK_ERR_STOP_ANY ||
3643 bs->on_read_error == BLOCK_ERR_STOP_ANY));
3644 }
3645
3646 void bdrv_iostatus_disable(BlockDriverState *bs)
3647 {
3648 bs->iostatus_enabled = false;
3649 }
3650
3651 void bdrv_iostatus_reset(BlockDriverState *bs)
3652 {
3653 if (bdrv_iostatus_is_enabled(bs)) {
3654 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
3655 }
3656 }
3657
3658 /* XXX: Today this is set by device models because it makes the implementation
3659 quite simple. However, the block layer knows about the error, so it's
3660 possible to implement this without involving the device models. */
3661 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
3662 {
3663 if (bdrv_iostatus_is_enabled(bs) &&
3664 bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
3665 assert(error >= 0);
3666 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
3667 BLOCK_DEVICE_IO_STATUS_FAILED;
3668 }
3669 }
3670
3671 void
3672 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
3673 enum BlockAcctType type)
3674 {
3675 assert(type < BDRV_MAX_IOTYPE);
3676
3677 cookie->bytes = bytes;
3678 cookie->start_time_ns = get_clock();
3679 cookie->type = type;
3680 }
3681
3682 void
3683 bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
3684 {
3685 assert(cookie->type < BDRV_MAX_IOTYPE);
3686
3687 bs->nr_bytes[cookie->type] += cookie->bytes;
3688 bs->nr_ops[cookie->type]++;
3689 bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
3690 }
3691
3692 int bdrv_img_create(const char *filename, const char *fmt,
3693 const char *base_filename, const char *base_fmt,
3694 char *options, uint64_t img_size, int flags)
3695 {
3696 QEMUOptionParameter *param = NULL, *create_options = NULL;
3697 QEMUOptionParameter *backing_fmt, *backing_file, *size;
3698 BlockDriverState *bs = NULL;
3699 BlockDriver *drv, *proto_drv;
3700 BlockDriver *backing_drv = NULL;
3701 int ret = 0;
3702
3703 /* Find driver and parse its options */
3704 drv = bdrv_find_format(fmt);
3705 if (!drv) {
3706 error_report("Unknown file format '%s'", fmt);
3707 ret = -EINVAL;
3708 goto out;
3709 }
3710
3711 proto_drv = bdrv_find_protocol(filename);
3712 if (!proto_drv) {
3713 error_report("Unknown protocol '%s'", filename);
3714 ret = -EINVAL;
3715 goto out;
3716 }
3717
3718 create_options = append_option_parameters(create_options,
3719 drv->create_options);
3720 create_options = append_option_parameters(create_options,
3721 proto_drv->create_options);
3722
3723 /* Create parameter list with default values */
3724 param = parse_option_parameters("", create_options, param);
3725
3726 set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
3727
3728 /* Parse -o options */
3729 if (options) {
3730 param = parse_option_parameters(options, create_options, param);
3731 if (param == NULL) {
3732 error_report("Invalid options for file format '%s'.", fmt);
3733 ret = -EINVAL;
3734 goto out;
3735 }
3736 }
3737
3738 if (base_filename) {
3739 if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
3740 base_filename)) {
3741 error_report("Backing file not supported for file format '%s'",
3742 fmt);
3743 ret = -EINVAL;
3744 goto out;
3745 }
3746 }
3747
3748 if (base_fmt) {
3749 if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
3750 error_report("Backing file format not supported for file "
3751 "format '%s'", fmt);
3752 ret = -EINVAL;
3753 goto out;
3754 }
3755 }
3756
3757 backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
3758 if (backing_file && backing_file->value.s) {
3759 if (!strcmp(filename, backing_file->value.s)) {
3760 error_report("Error: Trying to create an image with the "
3761 "same filename as the backing file");
3762 ret = -EINVAL;
3763 goto out;
3764 }
3765 }
3766
3767 backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
3768 if (backing_fmt && backing_fmt->value.s) {
3769 backing_drv = bdrv_find_format(backing_fmt->value.s);
3770 if (!backing_drv) {
3771 error_report("Unknown backing file format '%s'",
3772 backing_fmt->value.s);
3773 ret = -EINVAL;
3774 goto out;
3775 }
3776 }
3777
3778 // The size for the image must always be specified, with one exception:
3779 // If we are using a backing file, we can obtain the size from there
3780 size = get_option_parameter(param, BLOCK_OPT_SIZE);
3781 if (size && size->value.n == -1) {
3782 if (backing_file && backing_file->value.s) {
3783 uint64_t size;
3784 char buf[32];
3785
3786 bs = bdrv_new("");
3787
3788 ret = bdrv_open(bs, backing_file->value.s, flags, backing_drv);
3789 if (ret < 0) {
3790 error_report("Could not open '%s'", backing_file->value.s);
3791 goto out;
3792 }
3793 bdrv_get_geometry(bs, &size);
3794 size *= 512;
3795
3796 snprintf(buf, sizeof(buf), "%" PRId64, size);
3797 set_option_parameter(param, BLOCK_OPT_SIZE, buf);
3798 } else {
3799 error_report("Image creation needs a size parameter");
3800 ret = -EINVAL;
3801 goto out;
3802 }
3803 }
3804
3805 printf("Formatting '%s', fmt=%s ", filename, fmt);
3806 print_option_parameters(param);
3807 puts("");
3808
3809 ret = bdrv_create(drv, filename, param);
3810
3811 if (ret < 0) {
3812 if (ret == -ENOTSUP) {
3813 error_report("Formatting or formatting option not supported for "
3814 "file format '%s'", fmt);
3815 } else if (ret == -EFBIG) {
3816 error_report("The image size is too large for file format '%s'",
3817 fmt);
3818 } else {
3819 error_report("%s: error while creating %s: %s", filename, fmt,
3820 strerror(-ret));
3821 }
3822 }
3823
3824 out:
3825 free_option_parameters(create_options);
3826 free_option_parameters(param);
3827
3828 if (bs) {
3829 bdrv_delete(bs);
3830 }
3831
3832 return ret;
3833 }
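
/*
 * Illustrative sketch (editor's addition): creating a 1 GiB qcow2
 * overlay on top of a raw base image, roughly what "qemu-img create"
 * does through this function.  File names are hypothetical.
 *
 *   ret = bdrv_img_create("overlay.qcow2", "qcow2", "base.raw", "raw",
 *                         NULL, (uint64_t)1024 * 1024 * 1024, 0);
 *   if (ret < 0) {
 *       // error already reported by bdrv_img_create()
 *   }
 */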