/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor.h"
#include "block_int.h"
#include "module.h"
#include "qjson.h"
#include "qemu-coroutine.h"
#include "qmp-commands.h"
#include "qemu-timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

typedef enum {
    BDRV_REQ_COPY_ON_READ = 0x1,
    BDRV_REQ_ZERO_WRITE   = 0x2,
} BdrvRequestFlags;

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors);

static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
        double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, int64_t *wait);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* The device to use for VM snapshots */
static BlockDriverState *bs_snapshots;

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

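/* Illustrative behavior of the helpers above (hypothetical inputs):
 *
 *   is_windows_drive("c:")                    -> 1  (bare drive name)
 *   is_windows_drive("\\\\.\\PhysicalDrive0") -> 1  (matches the "\\.\" form)
 *   is_windows_drive("c:\\disk.img")          -> 0  (only the drive *prefix* test matches)
 */
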
/* throttling disk I/O limits */
void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    while (qemu_co_queue_next(&bs->throttled_reqs));

    if (bs->block_timer) {
        qemu_del_timer(bs->block_timer);
        qemu_free_timer(bs->block_timer);
        bs->block_timer = NULL;
    }

    bs->slice_start = 0;
    bs->slice_end = 0;
    bs->slice_time = 0;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
}

static void bdrv_block_timer(void *opaque)
{
    BlockDriverState *bs = opaque;

    qemu_co_queue_next(&bs->throttled_reqs);
}

void bdrv_io_limits_enable(BlockDriverState *bs)
{
    qemu_co_queue_init(&bs->throttled_reqs);
    bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
    bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
    bs->slice_start = qemu_get_clock_ns(vm_clock);
    bs->slice_end = bs->slice_start + bs->slice_time;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
    bs->io_limits_enabled = true;
}

bool bdrv_io_limits_enabled(BlockDriverState *bs)
{
    BlockIOLimit *io_limits = &bs->io_limits;
    return io_limits->bps[BLOCK_IO_LIMIT_READ]
        || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
        || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
        || io_limits->iops[BLOCK_IO_LIMIT_READ]
        || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
        || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
}

static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     bool is_write, int nb_sectors)
{
    int64_t wait_time = -1;

    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
        qemu_co_queue_wait(&bs->throttled_reqs);
    }

    /* Requests are serviced in FIFO order.  The next throttled request is
     * not dequeued until the current one has been allowed to proceed.  If
     * the current request still exceeds the limits, it is re-inserted at
     * the head of the queue, so all requests behind it remain queued in
     * throttled_reqs.
     */

    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
        qemu_mod_timer(bs->block_timer,
                       wait_time + qemu_get_clock_ns(vm_clock));
        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
    }

    qemu_co_queue_next(&bs->throttled_reqs);
}

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}

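/* Illustrative examples (hypothetical inputs):
 *
 *   path_has_protocol("nbd:localhost:10809") -> 1  (':' before any '/')
 *   path_has_protocol("/images/disk.qcow2")  -> 0  ('/' comes first)
 *   path_has_protocol("disk.qcow2")          -> 0  (no ':' at all)
 */
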
int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* If filename is absolute, just copy it to dest.  Otherwise, build a
   path to it by considering it relative to base_path.  URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}

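/* Illustrative examples (hypothetical inputs):
 *
 *   path_combine(dest, size, "/vm/base.qcow2", "backing.qcow2")
 *       -> "/vm/backing.qcow2"
 *   path_combine(dest, size, "http://host/dir/base.qcow2", "b.qcow2")
 *       -> "http://host/dir/b.qcow2"
 *   path_combine(dest, size, "/vm/base.qcow2", "/abs/b.qcow2")
 *       -> "/abs/b.qcow2"    (absolute names are copied unchanged)
 */
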
void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name)
{
    BlockDriverState *bs;

    bs = g_malloc0(sizeof(BlockDriverState));
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
    }
    bdrv_iostatus_disable(bs);
    return bs;
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv)
{
    static const char *whitelist[] = {
        CONFIG_BDRV_WHITELIST
    };
    const char **p;

    if (!whitelist[0])
        return 1;               /* no whitelist, anything goes */

    for (p = whitelist; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QEMUOptionParameter *options;
    int ret;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    CreateCo *cco = opaque;
    assert(cco->drv);

    cco->ret = cco->drv->bdrv_create(cco->filename, cco->options);
}

int bdrv_create(BlockDriver *drv, const char* filename,
                QEMUOptionParameter *options)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .options = options,
        .ret = NOT_DONE,
    };

    if (!drv->bdrv_create) {
        /* don't leak the duplicated filename on this error path */
        g_free(cco.filename);
        return -ENOTSUP;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    ret = cco.ret;
    g_free(cco.filename);

    return ret;
}

int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
{
    BlockDriver *drv;

    drv = bdrv_find_protocol(filename);
    if (drv == NULL) {
        return -ENOENT;
    }

    return bdrv_create(drv, filename, options);
}

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0 || close(fd)) {
        return -errno;
    }
    return 0;
#endif
}

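/* Minimal usage sketch (illustrative):
 *
 *   char tmp_name[PATH_MAX];
 *   if (get_tmp_filename(tmp_name, sizeof(tmp_name)) == 0) {
 *       ... create a temporary image at tmp_name, e.g. for -snapshot ...
 *   }
 */
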
/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename)) {
        return bdrv_find_format("file");
    }
    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}

static int find_image_format(const char *filename, BlockDriver **pdrv)
{
    int ret, score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    BlockDriverState *bs;

    ret = bdrv_file_open(&bs, filename, 0);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs)) {
        bdrv_delete(bs);
        drv = bdrv_find_format("raw");
        if (!drv) {
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    bdrv_delete(bs);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = length >> BDRV_SECTOR_BITS;
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}

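/* Summary of the mode -> flags mapping implemented above:
 *
 *   mode           BDRV_O_NOCACHE   BDRV_O_CACHE_WB   BDRV_O_NO_FLUSH
 *   off/none             x                 x
 *   directsync           x
 *   writeback                              x
 *   unsafe                                 x                 x
 *   writethrough    (none set; this is the default)
 */
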
/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

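/* Reference-count semantics of the pair above, sketched (illustrative):
 *
 *   bdrv_enable_copy_on_read(bs);    // copy_on_read == 1, feature active
 *   bdrv_enable_copy_on_read(bs);    // copy_on_read == 2, still active
 *   bdrv_disable_copy_on_read(bs);   // copy_on_read == 1, still active
 *   bdrv_disable_copy_on_read(bs);   // copy_on_read == 0, feature off
 */
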
/*
 * Common part for opening disk images and files
 */
static int bdrv_open_common(BlockDriverState *bs, const char *filename,
                            int flags, BlockDriver *drv)
{
    int ret, open_flags;

    assert(drv != NULL);
    assert(bs->file == NULL);

    trace_bdrv_open_common(bs, filename, flags, drv->format_name);

    bs->open_flags = flags;
    bs->buffer_alignment = 512;

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
        bdrv_enable_copy_on_read(bs);
    }

    pstrcpy(bs->filename, sizeof(bs->filename), filename);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
        return -ENOTSUP;
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        ret = drv->bdrv_file_open(bs, filename, open_flags);
    } else {
        ret = bdrv_file_open(&bs->file, filename, open_flags);
        if (ret >= 0) {
            ret = drv->bdrv_open(bs, open_flags);
        }
    }

    if (ret < 0) {
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        goto free_and_fail;
    }

#ifndef _WIN32
    if (bs->is_temporary) {
        unlink(filename);
    }
#endif
    return 0;

free_and_fail:
    if (bs->file) {
        bdrv_delete(bs->file);
        bs->file = NULL;
    }
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 */
int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
{
    BlockDriverState *bs;
    BlockDriver *drv;
    int ret;

    drv = bdrv_find_protocol(filename);
    if (!drv) {
        return -ENOENT;
    }

    bs = bdrv_new("");
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        bdrv_delete(bs);
        return ret;
    }
    bs->growable = 1;
    *pbs = bs;
    return 0;
}

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 */
int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
              BlockDriver *drv)
{
    int ret;
    char tmp_filename[PATH_MAX];

    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        int is_protocol = 0;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *options;
        char backing_filename[PATH_MAX];

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* if there is a backing file, use it */
        bs1 = bdrv_new("");
        ret = bdrv_open(bs1, filename, 0, drv);
        if (ret < 0) {
            bdrv_delete(bs1);
            return ret;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        if (bs1->drv && bs1->drv->protocol_name)
            is_protocol = 1;

        bdrv_delete(bs1);

        ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
        if (ret < 0) {
            return ret;
        }

        /* Real path is meaningless for protocols */
        if (is_protocol)
            snprintf(backing_filename, sizeof(backing_filename),
                     "%s", filename);
        else if (!realpath(filename, backing_filename))
            return -errno;

        bdrv_qcow2 = bdrv_find_format("qcow2");
        options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);

        set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
        set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
        if (drv) {
            set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
                                 drv->format_name);
        }

        ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
        free_option_parameters(options);
        if (ret < 0) {
            return ret;
        }

        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Find the right image format driver */
    if (!drv) {
        ret = find_image_format(filename, &drv);
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
        char backing_filename[PATH_MAX];
        int back_flags;
        BlockDriver *back_drv = NULL;

        bs->backing_hd = bdrv_new("");
        bdrv_get_full_backing_filename(bs, backing_filename,
                                       sizeof(backing_filename));

        if (bs->backing_format[0] != '\0') {
            back_drv = bdrv_find_format(bs->backing_format);
        }

        /* backing files always opened read-only */
        back_flags =
            flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

        ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
        if (ret < 0) {
            bdrv_close(bs);
            return ret;
        }
        if (bs->is_temporary) {
            bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
        } else {
            /* base image inherits from "parent" */
            bs->backing_hd->keep_read_only = bs->keep_read_only;
        }
    }

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_enable(bs);
    }

    return 0;

unlink_and_fail:
    if (bs->is_temporary) {
        unlink(filename);
    }
    return ret;
}

void bdrv_close(BlockDriverState *bs)
{
    bdrv_flush(bs);
    if (bs->drv) {
        if (bs->job) {
            block_job_cancel_sync(bs->job);
        }
        bdrv_drain_all();

        if (bs == bs_snapshots) {
            bs_snapshots = NULL;
        }
        if (bs->backing_hd) {
            bdrv_delete(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;

        if (bs->file != NULL) {
            bdrv_delete(bs->file);
            bs->file = NULL;
        }

        bdrv_dev_change_media_cb(bs, false);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}

void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_close(bs);
    }
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices---for example a coroutine
 * can be arbitrarily complex and a constant flow of I/O can come until the
 * coroutine is complete.  Because of this, it is not possible to have a
 * function to drain a single device's I/O queue.
 */
void bdrv_drain_all(void)
{
    BlockDriverState *bs;
    bool busy;

    do {
        busy = qemu_aio_wait();

        /* FIXME: We do not have timer support here, so this is effectively
         * a busy wait.
         */
        QTAILQ_FOREACH(bs, &bdrv_states, list) {
            if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
                qemu_co_queue_restart_all(&bs->throttled_reqs);
                busy = true;
            }
        }
    } while (busy);

    /* If requests are still pending there is a bug somewhere */
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        assert(QLIST_EMPTY(&bs->tracked_requests));
        assert(qemu_co_queue_empty(&bs->throttled_reqs));
    }
}

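/* Typical usage implied by the comment above (illustrative):
 *
 *   bdrv_drain_all();   // wait for in-flight requests on every device
 *   bdrv_flush_all();   // then push completed writes to stable storage
 */
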
/* make a BlockDriverState anonymous by removing it from the bdrv_states list.
   Also, NUL-terminate the device_name to prevent a double removal */
void bdrv_make_anon(BlockDriverState *bs)
{
    if (bs->device_name[0] != '\0') {
        QTAILQ_REMOVE(&bdrv_states, bs, list);
    }
    bs->device_name[0] = '\0';
}

static void bdrv_rebind(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_rebind) {
        bs->drv->bdrv_rebind(bs);
    }
}

/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    BlockDriverState tmp;

    /* bs_new must be anonymous */
    assert(bs_new->device_name[0] == '\0');

    tmp = *bs_new;

    /* there are some fields that need to stay on the top layer: */
    tmp.open_flags = bs_top->open_flags;

    /* dev info */
    tmp.dev_ops = bs_top->dev_ops;
    tmp.dev_opaque = bs_top->dev_opaque;
    tmp.dev = bs_top->dev;
    tmp.buffer_alignment = bs_top->buffer_alignment;
    tmp.copy_on_read = bs_top->copy_on_read;

    /* i/o timing parameters */
    tmp.slice_time = bs_top->slice_time;
    tmp.slice_start = bs_top->slice_start;
    tmp.slice_end = bs_top->slice_end;
    tmp.io_limits = bs_top->io_limits;
    tmp.io_base = bs_top->io_base;
    tmp.throttled_reqs = bs_top->throttled_reqs;
    tmp.block_timer = bs_top->block_timer;
    tmp.io_limits_enabled = bs_top->io_limits_enabled;

    /* geometry */
    tmp.cyls = bs_top->cyls;
    tmp.heads = bs_top->heads;
    tmp.secs = bs_top->secs;
    tmp.translation = bs_top->translation;

    /* r/w error */
    tmp.on_read_error = bs_top->on_read_error;
    tmp.on_write_error = bs_top->on_write_error;

    /* i/o status */
    tmp.iostatus_enabled = bs_top->iostatus_enabled;
    tmp.iostatus = bs_top->iostatus;

    /* keep the same entry in bdrv_states */
    pstrcpy(tmp.device_name, sizeof(tmp.device_name), bs_top->device_name);
    tmp.list = bs_top->list;

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    tmp.backing_hd = bs_new;
    pstrcpy(tmp.backing_file, sizeof(tmp.backing_file), bs_top->filename);
    bdrv_get_format(bs_top, tmp.backing_format, sizeof(tmp.backing_format));

    /* swap contents of the fixed new bs and the current top */
    *bs_new = *bs_top;
    *bs_top = tmp;

    /* device_name[] was carried over from the old bs_top.  bs_new
     * shouldn't be in bdrv_states, so we need to make device_name[]
     * reflect the anonymity of bs_new
     */
    bs_new->device_name[0] = '\0';

    /* clear the copied fields in the new backing file */
    bdrv_detach_dev(bs_new, bs_new->dev);

    qemu_co_queue_init(&bs_new->throttled_reqs);
    memset(&bs_new->io_base, 0, sizeof(bs_new->io_base));
    memset(&bs_new->io_limits, 0, sizeof(bs_new->io_limits));
    bdrv_iostatus_disable(bs_new);

    /* we don't use bdrv_io_limits_disable() for this, because we don't want
     * to affect or delete the block_timer, as it has been moved to bs_top */
    bs_new->io_limits_enabled = false;
    bs_new->block_timer = NULL;
    bs_new->slice_time = 0;
    bs_new->slice_start = 0;
    bs_new->slice_end = 0;

    bdrv_rebind(bs_new);
    bdrv_rebind(bs_top);
}

void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->dev);
    assert(!bs->job);
    assert(!bs->in_use);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    bdrv_close(bs);

    assert(bs != bs_snapshots);
    g_free(bs);
}

int bdrv_attach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    if (bs->dev) {
        return -EBUSY;
    }
    bs->dev = dev;
    bdrv_iostatus_reset(bs);
    return 0;
}

/* TODO qdevified devices don't use this, remove when devices are qdevified */
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
{
    if (bdrv_attach_dev(bs, dev) < 0) {
        abort();
    }
}

void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(bs->dev == dev);
    bs->dev = NULL;
    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;
    bs->buffer_alignment = 512;
}

/* TODO change to return DeviceState * when all users are qdevified */
void *bdrv_get_attached_dev(BlockDriverState *bs)
{
    return bs->dev;
}

void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_ops = ops;
    bs->dev_opaque = opaque;
    if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
        bs_snapshots = NULL;
    }
}

void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
                               BlockQMPEventAction action, int is_read)
{
    QObject *data;
    const char *action_str;

    switch (action) {
    case BDRV_ACTION_REPORT:
        action_str = "report";
        break;
    case BDRV_ACTION_IGNORE:
        action_str = "ignore";
        break;
    case BDRV_ACTION_STOP:
        action_str = "stop";
        break;
    default:
        abort();
    }

    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
                              bdrv->device_name,
                              action_str,
                              is_read ? "read" : "write");
    monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);

    qobject_decref(data);
}

static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
{
    QObject *data;

    data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
                              bdrv_get_device_name(bs), ejected);
    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);

    qobject_decref(data);
}

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
{
    if (bs->dev_ops && bs->dev_ops->change_media_cb) {
        bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
        bs->dev_ops->change_media_cb(bs->dev_opaque, load);
        if (tray_was_closed) {
            /* tray open */
            bdrv_emit_qmp_eject_event(bs, true);
        }
        if (load) {
            /* tray close */
            bdrv_emit_qmp_eject_event(bs, false);
        }
    }
}

bool bdrv_dev_has_removable_media(BlockDriverState *bs)
{
    return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
}

void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
{
    if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
        bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
    }
}

bool bdrv_dev_is_tray_open(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_tray_open) {
        return bs->dev_ops->is_tray_open(bs->dev_opaque);
    }
    return false;
}

static void bdrv_dev_resize_cb(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->resize_cb) {
        bs->dev_ops->resize_cb(bs->dev_opaque);
    }
}

bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
        return bs->dev_ops->is_medium_locked(bs->dev_opaque);
    }
    return false;
}

/*
 * Run consistency checks on an image
 *
 * Returns 0 if the check could be completed (it doesn't mean that the image is
 * free of errors) or -errno when an internal error occurred. The results of
 * the check are stored in res.
 */
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
{
    if (bs->drv->bdrv_check == NULL) {
        return -ENOTSUP;
    }

    memset(res, 0, sizeof(*res));
    return bs->drv->bdrv_check(bs, res, fix);
}

#define COMMIT_BUF_SECTORS 2048

/* commit COW file into the raw image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    BlockDriver *backing_drv;
    int64_t sector, total_sectors;
    int n, ro, open_flags;
    int ret = 0, rw_ret = 0;
    uint8_t *buf;
    char filename[1024];
    BlockDriverState *bs_rw, *bs_ro;

    if (!drv)
        return -ENOMEDIUM;

    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    if (bs->backing_hd->keep_read_only) {
        return -EACCES;
    }

    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
        return -EBUSY;
    }

    backing_drv = bs->backing_hd->drv;
    ro = bs->backing_hd->read_only;
    /* pstrcpy rather than strncpy, so the copy is always NUL-terminated */
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
    open_flags = bs->backing_hd->open_flags;

    if (ro) {
        /* re-open as RW */
        bdrv_delete(bs->backing_hd);
        bs->backing_hd = NULL;
        bs_rw = bdrv_new("");
        rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
                           backing_drv);
        if (rw_ret < 0) {
            bdrv_delete(bs_rw);
            /* try to re-open read-only */
            bs_ro = bdrv_new("");
            ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
                            backing_drv);
            if (ret < 0) {
                bdrv_delete(bs_ro);
                /* drive not functional anymore */
                bs->drv = NULL;
                return ret;
            }
            bs->backing_hd = bs_ro;
            return rw_ret;
        }
        bs->backing_hd = bs_rw;
    }

    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);

    for (sector = 0; sector < total_sectors; sector += n) {
        if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {

            if (bdrv_read(bs, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }

            if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd)
        bdrv_flush(bs->backing_hd);

ro_cleanup:
    g_free(buf);

    if (ro) {
        /* re-open as RO */
        bdrv_delete(bs->backing_hd);
        bs->backing_hd = NULL;
        bs_ro = bdrv_new("");
        ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
                        backing_drv);
        if (ret < 0) {
            bdrv_delete(bs_ro);
            /* drive not functional anymore */
            bs->drv = NULL;
            return ret;
        }
        bs->backing_hd = bs_ro;
        bs->backing_hd->keep_read_only = 0;
    }

    return ret;
}

int bdrv_commit_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        int ret = bdrv_commit(bs);
        if (ret < 0) {
            return ret;
        }
    }
    return 0;
}

struct BdrvTrackedRequest {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    bool is_write;
    QLIST_ENTRY(BdrvTrackedRequest) list;
    Coroutine *co; /* owner, used for deadlock detection */
    CoQueue wait_queue; /* coroutines blocked on this request */
};

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t sector_num,
                                  int nb_sectors, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .is_write = is_write,
        .co = qemu_coroutine_self(),
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}

/**
 * Round a region to cluster boundaries
 */
static void round_to_clusters(BlockDriverState *bs,
                              int64_t sector_num, int nb_sectors,
                              int64_t *cluster_sector_num,
                              int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}

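/* Worked example (hypothetical image with 64 KiB clusters and 512-byte
 * sectors, so c = 128 sectors per cluster):
 *
 *   round_to_clusters(bs, 130, 4, &start, &count);
 *       -> start = 128   (130 rounded down to the cluster boundary)
 *       -> count = 128   (2 + 4 = 6 sectors rounded up to one full cluster)
 */
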
static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t sector_num, int nb_sectors) {
    /*        aaaa   bbbb */
    if (sector_num >= req->sector_num + req->nb_sectors) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->sector_num >= sector_num + nb_sectors) {
        return false;
    }
    return true;
}

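/* For example, a request covering sectors [8, 16) overlaps one covering
 * [12, 20), but not one covering [16, 24): the two rejection tests above
 * only pass when one request ends at or before the start of the other. */
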
static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
{
    BdrvTrackedRequest *req;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    bool retry;

    /* If we touch the same cluster it counts as an overlap.  This guarantees
     * that allocating writes will be serialized and not race with each other
     * for the same cluster.  For example, in copy-on-read it ensures that the
     * CoR read and write operations are atomic and guest writes cannot
     * interleave between them.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (tracked_request_overlaps(req, cluster_sector_num,
                                         cluster_nb_sectors)) {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                qemu_co_queue_wait(&req->wait_queue);
                retry = true;
                break;
            }
        }
    } while (retry);
}

/*
 * Return values:
 * 0        - success
 * -EINVAL  - backing format specified, but no file
 * -ENOSPC  - can't update the backing file because no space is left in the
 *            image file header
 * -ENOTSUP - format driver doesn't support changing the backing file
 */
int bdrv_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    BlockDriver *drv = bs->drv;
    int ret;

    /* Backing file format doesn't make sense without a backing file */
    if (backing_fmt && !backing_file) {
        return -EINVAL;
    }

    if (drv->bdrv_change_backing_file != NULL) {
        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
    } else {
        ret = -ENOTSUP;
    }

    if (ret == 0) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
    }
    return ret;
}

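/* Illustrative call (hypothetical names):
 *
 *   ret = bdrv_change_backing_file(bs, "new-base.qcow2", "qcow2");
 *
 * Passing backing_file = NULL and backing_fmt = NULL clears the in-memory
 * fields, since the ?: fallbacks above store empty strings on success.
 */
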
static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t len;

    if (!bdrv_is_inserted(bs))
        return -ENOMEDIUM;

    if (bs->growable)
        return 0;

    len = bdrv_getlength(bs);

    if (offset < 0)
        return -EIO;

    if ((offset > len) || (len - offset < size))
        return -EIO;

    return 0;
}

static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}

typedef struct RwCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
                                     rwco->nb_sectors, rwco->qiov, 0);
    } else {
        rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
                                      rwco->nb_sectors, rwco->qiov, 0);
    }
}

/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .qiov = &qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);

    /**
     * In sync call context, when the vcpu is blocked, this throttling timer
     * will not fire; so the I/O throttling function has to be disabled here
     * if it has been enabled.
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }
    return rwco.ret;
}

/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
}

#define BITS_PER_LONG  (sizeof(unsigned long) * 8)

static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
                             int nb_sectors, int dirty)
{
    int64_t start, end;
    unsigned long val, idx, bit;

    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    for (; start <= end; start++) {
        idx = start / BITS_PER_LONG;
        bit = start % BITS_PER_LONG;
        val = bs->dirty_bitmap[idx];
        if (dirty) {
            if (!(val & (1UL << bit))) {
                bs->dirty_count++;
                val |= 1UL << bit;
            }
        } else {
            if (val & (1UL << bit)) {
                bs->dirty_count--;
                val &= ~(1UL << bit);
            }
        }
        bs->dirty_bitmap[idx] = val;
    }
}

/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
}

int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first read to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* read the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf, count);
    }
    return count1;
}

int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first write to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* write the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(tmp_buf, buf, count);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
    }
    return count1;
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
                     const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(bs, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    /* No flush needed for cache modes that already do it */
    if (bs->enable_write_cache) {
        bdrv_flush(bs);
    }

    return 0;
}

static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                      cluster_nb_sectors);
    } else {
        /* This does not change the data on the disk, it is not necessary
         * to flush even in cache=writethrough mode.
         */
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buffer(qiov, bounce_buffer + skip_bytes,
                           nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}

/*
 * Handle a read request in coroutine context
 */
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk read I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, false, nb_sectors);
    }

    if (bs->copy_on_read) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }
    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight++;
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int pnum;

        ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
            goto out;
        }
    }

    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);

out:
    tracked_request_end(&req);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight--;
    }

    return ret;
}

int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
                               int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                            BDRV_REQ_COPY_ON_READ);
}

static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov;
    int ret;

    /* TODO Emulate only part of misaligned requests instead of letting block
     * drivers return -ENOTSUP and emulate everything */

    /* First try the efficient write zeroes operation */
    if (drv->bdrv_co_write_zeroes) {
        ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
        if (ret != -ENOTSUP) {
            return ret;
        }
    }

    /* Fall back to bounce buffer if write zeroes is unsupported */
    iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = qemu_blockalign(bs, iov.iov_len);
    memset(iov.iov_base, 0, iov.iov_len);
    qemu_iovec_init_external(&qiov, &iov, 1);

    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);

    qemu_vfree(iov.iov_base);
    return ret;
}

/*
 * Handle a write request in coroutine context
 */
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EACCES;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk write I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, true, nb_sectors);
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);

    if (flags & BDRV_REQ_ZERO_WRITE) {
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
    } else {
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }

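    /* In writethrough mode (write cache disabled) flush after every
     * successful write, so the data is stable on disk before the request
     * completes; writeback modes rely on explicit flush requests instead. */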
    if (ret == 0 && !bs->enable_write_cache) {
        ret = bdrv_co_flush(bs);
    }

    if (bs->dirty_bitmap) {
        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
    }

    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
    }

    tracked_request_end(&req);

    return ret;
}

int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
                                int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors)
{
    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                             BDRV_REQ_ZERO_WRITE);
}

/**
 * Truncate file to 'offset' bytes (needed only for file protocols)
 */
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
{
    BlockDriver *drv = bs->drv;
    int ret;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_truncate)
        return -ENOTSUP;
    if (bs->read_only)
        return -EACCES;
    if (bdrv_in_use(bs))
        return -EBUSY;
    ret = drv->bdrv_truncate(bs, offset);
    if (ret == 0) {
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
        bdrv_dev_resize_cb(bs);
    }
    return ret;
}

/**
 * Length of an allocated file in bytes. Sparse files are counted by actual
 * allocated space. Return < 0 if error or unknown.
 */
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_get_allocated_file_size) {
        return drv->bdrv_get_allocated_file_size(bs);
    }
    if (bs->file) {
        return bdrv_get_allocated_file_size(bs->file);
    }
    return -ENOTSUP;
}

/**
 * Length of a file in bytes. Return < 0 if error or unknown.
 */
int64_t bdrv_getlength(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;

    if (bs->growable || bdrv_dev_has_removable_media(bs)) {
        if (drv->bdrv_getlength) {
            return drv->bdrv_getlength(bs);
        }
    }
    return bs->total_sectors * BDRV_SECTOR_SIZE;
}

/* return 0 as number of sectors if no device present or error */
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
{
    int64_t length;
    length = bdrv_getlength(bs);
    if (length < 0)
        length = 0;
    else
        length = length >> BDRV_SECTOR_BITS;
    *nb_sectors_ptr = length;
}

struct partition {
    uint8_t boot_ind;     /* 0x80 - active */
    uint8_t head;         /* starting head */
    uint8_t sector;       /* starting sector */
    uint8_t cyl;          /* starting cylinder */
    uint8_t sys_ind;      /* What partition type */
    uint8_t end_head;     /* end head */
    uint8_t end_sector;   /* end sector */
    uint8_t end_cyl;      /* end cylinder */
    uint32_t start_sect;  /* starting sector counting from 0 */
    uint32_t nr_sects;    /* nr of sectors in partition */
} QEMU_PACKED;

2101 /* Try to guess the disk logical geometry from the MS-DOS partition table. Returns 0 if OK, -1 if it could not be guessed. */
2102 static int guess_disk_lchs(BlockDriverState *bs,
2103 int *pcylinders, int *pheads, int *psectors)
2104 {
2105 uint8_t buf[BDRV_SECTOR_SIZE];
2106 int ret, i, heads, sectors, cylinders;
2107 struct partition *p;
2108 uint32_t nr_sects;
2109 uint64_t nb_sectors;
2110 bool enabled;
2111
2112 bdrv_get_geometry(bs, &nb_sectors);
2113
2114 /**
2115 * This function will be invoked during startup not only in sync I/O mode,
2116 * but also in async I/O mode, so the I/O throttling has to be disabled
2117 * temporarily here rather than permanently.
2118 */
2119 enabled = bs->io_limits_enabled;
2120 bs->io_limits_enabled = false;
2121 ret = bdrv_read(bs, 0, buf, 1);
2122 bs->io_limits_enabled = enabled;
2123 if (ret < 0)
2124 return -1;
2125 /* test msdos magic */
2126 if (buf[510] != 0x55 || buf[511] != 0xaa)
2127 return -1;
2128 for(i = 0; i < 4; i++) {
2129 p = ((struct partition *)(buf + 0x1be)) + i;
2130 nr_sects = le32_to_cpu(p->nr_sects);
2131 if (nr_sects && p->end_head) {
2132 /* We make the assumption that the partition terminates on
2133 a cylinder boundary */
2134 heads = p->end_head + 1;
2135 sectors = p->end_sector & 63;
2136 if (sectors == 0)
2137 continue;
2138 cylinders = nb_sectors / (heads * sectors);
2139 if (cylinders < 1 || cylinders > 16383)
2140 continue;
2141 *pheads = heads;
2142 *psectors = sectors;
2143 *pcylinders = cylinders;
2144 #if 0
2145 printf("guessed geometry: LCHS=%d %d %d\n",
2146 cylinders, heads, sectors);
2147 #endif
2148 return 0;
2149 }
2150 }
2151 return -1;
2152 }
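/* Worked example of the guess above: on a 1006992-sector disk whose first
 * partition entry has end_head == 15 and (end_sector & 63) == 63, we get
 * heads = 16, sectors = 63 and cylinders = 1006992 / (16 * 63) = 999,
 * which falls within the accepted 1..16383 range.
 */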
2153
2154 void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
2155 {
2156 int translation, lba_detected = 0;
2157 int cylinders, heads, secs;
2158 uint64_t nb_sectors;
2159
2160 /* if a geometry hint is available, use it */
2161 bdrv_get_geometry(bs, &nb_sectors);
2162 bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
2163 translation = bdrv_get_translation_hint(bs);
2164 if (cylinders != 0) {
2165 *pcyls = cylinders;
2166 *pheads = heads;
2167 *psecs = secs;
2168 } else {
2169 if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
2170 if (heads > 16) {
2171 /* if heads > 16, it means that a BIOS LBA
2172 translation was active, so the default
2173 hardware geometry is OK */
2174 lba_detected = 1;
2175 goto default_geometry;
2176 } else {
2177 *pcyls = cylinders;
2178 *pheads = heads;
2179 *psecs = secs;
2180 /* disable any translation to be in sync with
2181 the logical geometry */
2182 if (translation == BIOS_ATA_TRANSLATION_AUTO) {
2183 bdrv_set_translation_hint(bs,
2184 BIOS_ATA_TRANSLATION_NONE);
2185 }
2186 }
2187 } else {
2188 default_geometry:
2189 /* if no geometry, use a standard physical disk geometry */
2190 cylinders = nb_sectors / (16 * 63);
2191
2192 if (cylinders > 16383)
2193 cylinders = 16383;
2194 else if (cylinders < 2)
2195 cylinders = 2;
2196 *pcyls = cylinders;
2197 *pheads = 16;
2198 *psecs = 63;
2199 if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
2200 if ((*pcyls * *pheads) <= 131072) {
2201 bdrv_set_translation_hint(bs,
2202 BIOS_ATA_TRANSLATION_LARGE);
2203 } else {
2204 bdrv_set_translation_hint(bs,
2205 BIOS_ATA_TRANSLATION_LBA);
2206 }
2207 }
2208 }
2209 bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
2210 }
2211 }
2212
2213 void bdrv_set_geometry_hint(BlockDriverState *bs,
2214 int cyls, int heads, int secs)
2215 {
2216 bs->cyls = cyls;
2217 bs->heads = heads;
2218 bs->secs = secs;
2219 }
2220
2221 void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
2222 {
2223 bs->translation = translation;
2224 }
2225
2226 void bdrv_get_geometry_hint(BlockDriverState *bs,
2227 int *pcyls, int *pheads, int *psecs)
2228 {
2229 *pcyls = bs->cyls;
2230 *pheads = bs->heads;
2231 *psecs = bs->secs;
2232 }
2233
2234 /* throttling disk io limits */
2235 void bdrv_set_io_limits(BlockDriverState *bs,
2236 BlockIOLimit *io_limits)
2237 {
2238 bs->io_limits = *io_limits;
2239 bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
2240 }
2241
2242 /* Recognize floppy formats */
2243 typedef struct FDFormat {
2244 FDriveType drive;
2245 uint8_t last_sect;
2246 uint8_t max_track;
2247 uint8_t max_head;
2248 FDriveRate rate;
2249 } FDFormat;
2250
2251 static const FDFormat fd_formats[] = {
2252 /* First entry is default format */
2253 /* 1.44 MB 3"1/2 floppy disks */
2254 { FDRIVE_DRV_144, 18, 80, 1, FDRIVE_RATE_500K, },
2255 { FDRIVE_DRV_144, 20, 80, 1, FDRIVE_RATE_500K, },
2256 { FDRIVE_DRV_144, 21, 80, 1, FDRIVE_RATE_500K, },
2257 { FDRIVE_DRV_144, 21, 82, 1, FDRIVE_RATE_500K, },
2258 { FDRIVE_DRV_144, 21, 83, 1, FDRIVE_RATE_500K, },
2259 { FDRIVE_DRV_144, 22, 80, 1, FDRIVE_RATE_500K, },
2260 { FDRIVE_DRV_144, 23, 80, 1, FDRIVE_RATE_500K, },
2261 { FDRIVE_DRV_144, 24, 80, 1, FDRIVE_RATE_500K, },
2262 /* 2.88 MB 3"1/2 floppy disks */
2263 { FDRIVE_DRV_288, 36, 80, 1, FDRIVE_RATE_1M, },
2264 { FDRIVE_DRV_288, 39, 80, 1, FDRIVE_RATE_1M, },
2265 { FDRIVE_DRV_288, 40, 80, 1, FDRIVE_RATE_1M, },
2266 { FDRIVE_DRV_288, 44, 80, 1, FDRIVE_RATE_1M, },
2267 { FDRIVE_DRV_288, 48, 80, 1, FDRIVE_RATE_1M, },
2268 /* 720 kB 3"1/2 floppy disks */
2269 { FDRIVE_DRV_144, 9, 80, 1, FDRIVE_RATE_250K, },
2270 { FDRIVE_DRV_144, 10, 80, 1, FDRIVE_RATE_250K, },
2271 { FDRIVE_DRV_144, 10, 82, 1, FDRIVE_RATE_250K, },
2272 { FDRIVE_DRV_144, 10, 83, 1, FDRIVE_RATE_250K, },
2273 { FDRIVE_DRV_144, 13, 80, 1, FDRIVE_RATE_250K, },
2274 { FDRIVE_DRV_144, 14, 80, 1, FDRIVE_RATE_250K, },
2275 /* 1.2 MB 5"1/4 floppy disks */
2276 { FDRIVE_DRV_120, 15, 80, 1, FDRIVE_RATE_500K, },
2277 { FDRIVE_DRV_120, 18, 80, 1, FDRIVE_RATE_500K, },
2278 { FDRIVE_DRV_120, 18, 82, 1, FDRIVE_RATE_500K, },
2279 { FDRIVE_DRV_120, 18, 83, 1, FDRIVE_RATE_500K, },
2280 { FDRIVE_DRV_120, 20, 80, 1, FDRIVE_RATE_500K, },
2281 /* 720 kB 5"1/4 floppy disks */
2282 { FDRIVE_DRV_120, 9, 80, 1, FDRIVE_RATE_250K, },
2283 { FDRIVE_DRV_120, 11, 80, 1, FDRIVE_RATE_250K, },
2284 /* 360 kB 5"1/4 floppy disks */
2285 { FDRIVE_DRV_120, 9, 40, 1, FDRIVE_RATE_300K, },
2286 { FDRIVE_DRV_120, 9, 40, 0, FDRIVE_RATE_300K, },
2287 { FDRIVE_DRV_120, 10, 41, 1, FDRIVE_RATE_300K, },
2288 { FDRIVE_DRV_120, 10, 42, 1, FDRIVE_RATE_300K, },
2289 /* 320 kB 5"1/4 floppy disks */
2290 { FDRIVE_DRV_120, 8, 40, 1, FDRIVE_RATE_250K, },
2291 { FDRIVE_DRV_120, 8, 40, 0, FDRIVE_RATE_250K, },
2292 /* 360 kB must match 5"1/4 better than 3"1/2... */
2293 { FDRIVE_DRV_144, 9, 80, 0, FDRIVE_RATE_250K, },
2294 /* end */
2295 { FDRIVE_DRV_NONE, -1, -1, 0, 0, },
2296 };
2297
2298 void bdrv_get_floppy_geometry_hint(BlockDriverState *bs, int *nb_heads,
2299 int *max_track, int *last_sect,
2300 FDriveType drive_in, FDriveType *drive,
2301 FDriveRate *rate)
2302 {
2303 const FDFormat *parse;
2304 uint64_t nb_sectors, size;
2305 int i, first_match, match;
2306
2307 bdrv_get_geometry_hint(bs, nb_heads, max_track, last_sect);
2308 if (*nb_heads != 0 && *max_track != 0 && *last_sect != 0) {
2309 /* User defined disk */
2310 *rate = FDRIVE_RATE_500K;
2311 } else {
2312 bdrv_get_geometry(bs, &nb_sectors);
2313 match = -1;
2314 first_match = -1;
2315 for (i = 0; ; i++) {
2316 parse = &fd_formats[i];
2317 if (parse->drive == FDRIVE_DRV_NONE) {
2318 break;
2319 }
2320 if (drive_in == parse->drive ||
2321 drive_in == FDRIVE_DRV_NONE) {
2322 size = (parse->max_head + 1) * parse->max_track *
2323 parse->last_sect;
2324 if (nb_sectors == size) {
2325 match = i;
2326 break;
2327 }
2328 if (first_match == -1) {
2329 first_match = i;
2330 }
2331 }
2332 }
2333 if (match == -1) {
2334 if (first_match == -1) {
2335 match = 1;
2336 } else {
2337 match = first_match;
2338 }
2339 parse = &fd_formats[match];
2340 }
2341 *nb_heads = parse->max_head + 1;
2342 *max_track = parse->max_track;
2343 *last_sect = parse->last_sect;
2344 *drive = parse->drive;
2345 *rate = parse->rate;
2346 }
2347 }
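/* Example of the matching above: a raw 1.44 MB image is 2880 sectors, and
 * the first table entry with (max_head + 1) * max_track * last_sect ==
 * 2 * 80 * 18 == 2880 matches exactly, yielding FDRIVE_DRV_144 at
 * FDRIVE_RATE_500K.
 */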
2348
2349 int bdrv_get_translation_hint(BlockDriverState *bs)
2350 {
2351 return bs->translation;
2352 }
2353
2354 void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
2355 BlockErrorAction on_write_error)
2356 {
2357 bs->on_read_error = on_read_error;
2358 bs->on_write_error = on_write_error;
2359 }
2360
2361 BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
2362 {
2363 return is_read ? bs->on_read_error : bs->on_write_error;
2364 }
2365
2366 int bdrv_is_read_only(BlockDriverState *bs)
2367 {
2368 return bs->read_only;
2369 }
2370
2371 int bdrv_is_sg(BlockDriverState *bs)
2372 {
2373 return bs->sg;
2374 }
2375
2376 int bdrv_enable_write_cache(BlockDriverState *bs)
2377 {
2378 return bs->enable_write_cache;
2379 }
2380
2381 int bdrv_is_encrypted(BlockDriverState *bs)
2382 {
2383 if (bs->backing_hd && bs->backing_hd->encrypted)
2384 return 1;
2385 return bs->encrypted;
2386 }
2387
2388 int bdrv_key_required(BlockDriverState *bs)
2389 {
2390 BlockDriverState *backing_hd = bs->backing_hd;
2391
2392 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
2393 return 1;
2394 return (bs->encrypted && !bs->valid_key);
2395 }
2396
2397 int bdrv_set_key(BlockDriverState *bs, const char *key)
2398 {
2399 int ret;
2400 if (bs->backing_hd && bs->backing_hd->encrypted) {
2401 ret = bdrv_set_key(bs->backing_hd, key);
2402 if (ret < 0)
2403 return ret;
2404 if (!bs->encrypted)
2405 return 0;
2406 }
2407 if (!bs->encrypted) {
2408 return -EINVAL;
2409 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
2410 return -ENOMEDIUM;
2411 }
2412 ret = bs->drv->bdrv_set_key(bs, key);
2413 if (ret < 0) {
2414 bs->valid_key = 0;
2415 } else if (!bs->valid_key) {
2416 bs->valid_key = 1;
2417 /* call the change callback now, we skipped it on open */
2418 bdrv_dev_change_media_cb(bs, true);
2419 }
2420 return ret;
2421 }
2422
2423 void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
2424 {
2425 if (!bs->drv) {
2426 buf[0] = '\0';
2427 } else {
2428 pstrcpy(buf, buf_size, bs->drv->format_name);
2429 }
2430 }
2431
2432 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
2433 void *opaque)
2434 {
2435 BlockDriver *drv;
2436
2437 QLIST_FOREACH(drv, &bdrv_drivers, list) {
2438 it(opaque, drv->format_name);
2439 }
2440 }
2441
2442 BlockDriverState *bdrv_find(const char *name)
2443 {
2444 BlockDriverState *bs;
2445
2446 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2447 if (!strcmp(name, bs->device_name)) {
2448 return bs;
2449 }
2450 }
2451 return NULL;
2452 }
2453
2454 BlockDriverState *bdrv_next(BlockDriverState *bs)
2455 {
2456 if (!bs) {
2457 return QTAILQ_FIRST(&bdrv_states);
2458 }
2459 return QTAILQ_NEXT(bs, list);
2460 }
2461
2462 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
2463 {
2464 BlockDriverState *bs;
2465
2466 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2467 it(opaque, bs);
2468 }
2469 }
2470
2471 const char *bdrv_get_device_name(BlockDriverState *bs)
2472 {
2473 return bs->device_name;
2474 }
2475
2476 int bdrv_get_flags(BlockDriverState *bs)
2477 {
2478 return bs->open_flags;
2479 }
2480
2481 void bdrv_flush_all(void)
2482 {
2483 BlockDriverState *bs;
2484
2485 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2486 bdrv_flush(bs);
2487 }
2488 }
2489
2490 int bdrv_has_zero_init(BlockDriverState *bs)
2491 {
2492 assert(bs->drv);
2493
2494 if (bs->drv->bdrv_has_zero_init) {
2495 return bs->drv->bdrv_has_zero_init(bs);
2496 }
2497
2498 return 1;
2499 }
2500
2501 typedef struct BdrvCoIsAllocatedData {
2502 BlockDriverState *bs;
2503 int64_t sector_num;
2504 int nb_sectors;
2505 int *pnum;
2506 int ret;
2507 bool done;
2508 } BdrvCoIsAllocatedData;
2509
2510 /*
2511 * Returns true iff the specified sector is present in the disk image. Drivers
2512 * not implementing the functionality are assumed to not support backing files,
2513 * hence all their sectors are reported as allocated.
2514 *
2515 * If 'sector_num' is beyond the end of the disk image the return value is 0
2516 * and 'pnum' is set to 0.
2517 *
2518 * 'pnum' is set to the number of sectors (including and immediately following
2519 * the specified sector) that are known to be in the same
2520 * allocated/unallocated state.
2521 *
2522 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
2523 * beyond the end of the disk image it will be clamped.
2524 */
2525 int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
2526 int nb_sectors, int *pnum)
2527 {
2528 int64_t n;
2529
2530 if (sector_num >= bs->total_sectors) {
2531 *pnum = 0;
2532 return 0;
2533 }
2534
2535 n = bs->total_sectors - sector_num;
2536 if (n < nb_sectors) {
2537 nb_sectors = n;
2538 }
2539
2540 if (!bs->drv->bdrv_co_is_allocated) {
2541 *pnum = nb_sectors;
2542 return 1;
2543 }
2544
2545 return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
2546 }
2547
2548 /* Coroutine wrapper for bdrv_is_allocated() */
2549 static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
2550 {
2551 BdrvCoIsAllocatedData *data = opaque;
2552 BlockDriverState *bs = data->bs;
2553
2554 data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
2555 data->pnum);
2556 data->done = true;
2557 }
2558
2559 /*
2560 * Synchronous wrapper around bdrv_co_is_allocated().
2561 *
2562 * See bdrv_co_is_allocated() for details.
2563 */
2564 int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
2565 int *pnum)
2566 {
2567 Coroutine *co;
2568 BdrvCoIsAllocatedData data = {
2569 .bs = bs,
2570 .sector_num = sector_num,
2571 .nb_sectors = nb_sectors,
2572 .pnum = pnum,
2573 .done = false,
2574 };
2575
2576 co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
2577 qemu_coroutine_enter(co, &data);
2578 while (!data.done) {
2579 qemu_aio_wait();
2580 }
2581 return data.ret;
2582 }
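/* A sketch of walking an image's allocation map with the synchronous
 * wrapper (chunk size and names are illustrative, errors not handled):
 *
 *     int64_t sector = 0;
 *     while (sector < bs->total_sectors) {
 *         int pnum;
 *         int allocated = bdrv_is_allocated(bs, sector, 1024, &pnum);
 *         printf("%" PRId64 "+%d: %s\n", sector, pnum,
 *                allocated ? "allocated" : "unallocated");
 *         sector += pnum;
 *     }
 */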
2583
2584 /*
2585 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
2586 *
2587 * Return true if the given sector is allocated in any image between
2588 * BASE and TOP (inclusive). BASE can be NULL to check if the given
2589 * sector is allocated in any image of the chain. Return false otherwise.
2590 *
2591 * 'pnum' is set to the number of sectors (including and immediately following
2592 * the specified sector) that are known to be in the same
2593 * allocated/unallocated state.
2594 *
2595 */
2596 int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top,
2597 BlockDriverState *base,
2598 int64_t sector_num,
2599 int nb_sectors, int *pnum)
2600 {
2601 BlockDriverState *intermediate;
2602 int ret, n = nb_sectors;
2603
2604 intermediate = top;
2605 while (intermediate && intermediate != base) {
2606 int pnum_inter;
2607 ret = bdrv_co_is_allocated(intermediate, sector_num, nb_sectors,
2608 &pnum_inter);
2609 if (ret < 0) {
2610 return ret;
2611 } else if (ret) {
2612 *pnum = pnum_inter;
2613 return 1;
2614 }
2615
2616 /*
2617 * [sector_num, nb_sectors] is unallocated on top but intermediate
2618 * might have
2619 *
2620 * [sector_num+x, nb_sectors] allocated.
2621 */
2622 if (n > pnum_inter) {
2623 n = pnum_inter;
2624 }
2625
2626 intermediate = intermediate->backing_hd;
2627 }
2628
2629 *pnum = n;
2630 return 0;
2631 }
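/* Example: in a chain base.img <- snap.img <- top.img, a sector written
 * only in snap.img is reported as allocated by the walk above when called
 * with top == top.img and base == base.img, since the loop stops at the
 * first image below top that has the sector allocated.
 */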
2632
2633 BlockInfoList *qmp_query_block(Error **errp)
2634 {
2635 BlockInfoList *head = NULL, *cur_item = NULL;
2636 BlockDriverState *bs;
2637
2638 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2639 BlockInfoList *info = g_malloc0(sizeof(*info));
2640
2641 info->value = g_malloc0(sizeof(*info->value));
2642 info->value->device = g_strdup(bs->device_name);
2643 info->value->type = g_strdup("unknown");
2644 info->value->locked = bdrv_dev_is_medium_locked(bs);
2645 info->value->removable = bdrv_dev_has_removable_media(bs);
2646
2647 if (bdrv_dev_has_removable_media(bs)) {
2648 info->value->has_tray_open = true;
2649 info->value->tray_open = bdrv_dev_is_tray_open(bs);
2650 }
2651
2652 if (bdrv_iostatus_is_enabled(bs)) {
2653 info->value->has_io_status = true;
2654 info->value->io_status = bs->iostatus;
2655 }
2656
2657 if (bs->drv) {
2658 info->value->has_inserted = true;
2659 info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
2660 info->value->inserted->file = g_strdup(bs->filename);
2661 info->value->inserted->ro = bs->read_only;
2662 info->value->inserted->drv = g_strdup(bs->drv->format_name);
2663 info->value->inserted->encrypted = bs->encrypted;
2664 if (bs->backing_file[0]) {
2665 info->value->inserted->has_backing_file = true;
2666 info->value->inserted->backing_file = g_strdup(bs->backing_file);
2667 }
2668
2669 if (bs->io_limits_enabled) {
2670 info->value->inserted->bps =
2671 bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
2672 info->value->inserted->bps_rd =
2673 bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
2674 info->value->inserted->bps_wr =
2675 bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
2676 info->value->inserted->iops =
2677 bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
2678 info->value->inserted->iops_rd =
2679 bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
2680 info->value->inserted->iops_wr =
2681 bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
2682 }
2683 }
2684
2685 /* XXX: waiting for the qapi to support GSList */
2686 if (!cur_item) {
2687 head = cur_item = info;
2688 } else {
2689 cur_item->next = info;
2690 cur_item = info;
2691 }
2692 }
2693
2694 return head;
2695 }
2696
2697 /* Consider exposing this as a full-fledged QMP command */
2698 static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
2699 {
2700 BlockStats *s;
2701
2702 s = g_malloc0(sizeof(*s));
2703
2704 if (bs->device_name[0]) {
2705 s->has_device = true;
2706 s->device = g_strdup(bs->device_name);
2707 }
2708
2709 s->stats = g_malloc0(sizeof(*s->stats));
2710 s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
2711 s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
2712 s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
2713 s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
2714 s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
2715 s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
2716 s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
2717 s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
2718 s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];
2719
2720 if (bs->file) {
2721 s->has_parent = true;
2722 s->parent = qmp_query_blockstat(bs->file, NULL);
2723 }
2724
2725 return s;
2726 }
2727
2728 BlockStatsList *qmp_query_blockstats(Error **errp)
2729 {
2730 BlockStatsList *head = NULL, *cur_item = NULL;
2731 BlockDriverState *bs;
2732
2733 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2734 BlockStatsList *info = g_malloc0(sizeof(*info));
2735 info->value = qmp_query_blockstat(bs, NULL);
2736
2737 /* XXX: waiting for the qapi to support GSList */
2738 if (!cur_item) {
2739 head = cur_item = info;
2740 } else {
2741 cur_item->next = info;
2742 cur_item = info;
2743 }
2744 }
2745
2746 return head;
2747 }
2748
2749 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
2750 {
2751 if (bs->backing_hd && bs->backing_hd->encrypted)
2752 return bs->backing_file;
2753 else if (bs->encrypted)
2754 return bs->filename;
2755 else
2756 return NULL;
2757 }
2758
2759 void bdrv_get_backing_filename(BlockDriverState *bs,
2760 char *filename, int filename_size)
2761 {
2762 pstrcpy(filename, filename_size, bs->backing_file);
2763 }
2764
2765 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
2766 const uint8_t *buf, int nb_sectors)
2767 {
2768 BlockDriver *drv = bs->drv;
2769 if (!drv)
2770 return -ENOMEDIUM;
2771 if (!drv->bdrv_write_compressed)
2772 return -ENOTSUP;
2773 if (bdrv_check_request(bs, sector_num, nb_sectors))
2774 return -EIO;
2775
2776 if (bs->dirty_bitmap) {
2777 set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2778 }
2779
2780 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
2781 }
2782
2783 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2784 {
2785 BlockDriver *drv = bs->drv;
2786 if (!drv)
2787 return -ENOMEDIUM;
2788 if (!drv->bdrv_get_info)
2789 return -ENOTSUP;
2790 memset(bdi, 0, sizeof(*bdi));
2791 return drv->bdrv_get_info(bs, bdi);
2792 }
2793
2794 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2795 int64_t pos, int size)
2796 {
2797 BlockDriver *drv = bs->drv;
2798 if (!drv)
2799 return -ENOMEDIUM;
2800 if (drv->bdrv_save_vmstate)
2801 return drv->bdrv_save_vmstate(bs, buf, pos, size);
2802 if (bs->file)
2803 return bdrv_save_vmstate(bs->file, buf, pos, size);
2804 return -ENOTSUP;
2805 }
2806
2807 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2808 int64_t pos, int size)
2809 {
2810 BlockDriver *drv = bs->drv;
2811 if (!drv)
2812 return -ENOMEDIUM;
2813 if (drv->bdrv_load_vmstate)
2814 return drv->bdrv_load_vmstate(bs, buf, pos, size);
2815 if (bs->file)
2816 return bdrv_load_vmstate(bs->file, buf, pos, size);
2817 return -ENOTSUP;
2818 }
2819
2820 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
2821 {
2822 BlockDriver *drv = bs->drv;
2823
2824 if (!drv || !drv->bdrv_debug_event) {
2825 return;
2826 }
2827
2828 drv->bdrv_debug_event(bs, event);
2830 }
2831
2832 /**************************************************************/
2833 /* handling of snapshots */
2834
2835 int bdrv_can_snapshot(BlockDriverState *bs)
2836 {
2837 BlockDriver *drv = bs->drv;
2838 if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
2839 return 0;
2840 }
2841
2842 if (!drv->bdrv_snapshot_create) {
2843 if (bs->file != NULL) {
2844 return bdrv_can_snapshot(bs->file);
2845 }
2846 return 0;
2847 }
2848
2849 return 1;
2850 }
2851
2852 int bdrv_is_snapshot(BlockDriverState *bs)
2853 {
2854 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
2855 }
2856
2857 BlockDriverState *bdrv_snapshots(void)
2858 {
2859 BlockDriverState *bs;
2860
2861 if (bs_snapshots) {
2862 return bs_snapshots;
2863 }
2864
2865 bs = NULL;
2866 while ((bs = bdrv_next(bs))) {
2867 if (bdrv_can_snapshot(bs)) {
2868 bs_snapshots = bs;
2869 return bs;
2870 }
2871 }
2872 return NULL;
2873 }
2874
2875 int bdrv_snapshot_create(BlockDriverState *bs,
2876 QEMUSnapshotInfo *sn_info)
2877 {
2878 BlockDriver *drv = bs->drv;
2879 if (!drv)
2880 return -ENOMEDIUM;
2881 if (drv->bdrv_snapshot_create)
2882 return drv->bdrv_snapshot_create(bs, sn_info);
2883 if (bs->file)
2884 return bdrv_snapshot_create(bs->file, sn_info);
2885 return -ENOTSUP;
2886 }
2887
2888 int bdrv_snapshot_goto(BlockDriverState *bs,
2889 const char *snapshot_id)
2890 {
2891 BlockDriver *drv = bs->drv;
2892 int ret, open_ret;
2893
2894 if (!drv)
2895 return -ENOMEDIUM;
2896 if (drv->bdrv_snapshot_goto)
2897 return drv->bdrv_snapshot_goto(bs, snapshot_id);
2898
2899 if (bs->file) {
2900 drv->bdrv_close(bs);
2901 ret = bdrv_snapshot_goto(bs->file, snapshot_id);
2902 open_ret = drv->bdrv_open(bs, bs->open_flags);
2903 if (open_ret < 0) {
2904 bdrv_delete(bs->file);
2905 bs->drv = NULL;
2906 return open_ret;
2907 }
2908 return ret;
2909 }
2910
2911 return -ENOTSUP;
2912 }
2913
2914 int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
2915 {
2916 BlockDriver *drv = bs->drv;
2917 if (!drv)
2918 return -ENOMEDIUM;
2919 if (drv->bdrv_snapshot_delete)
2920 return drv->bdrv_snapshot_delete(bs, snapshot_id);
2921 if (bs->file)
2922 return bdrv_snapshot_delete(bs->file, snapshot_id);
2923 return -ENOTSUP;
2924 }
2925
2926 int bdrv_snapshot_list(BlockDriverState *bs,
2927 QEMUSnapshotInfo **psn_info)
2928 {
2929 BlockDriver *drv = bs->drv;
2930 if (!drv)
2931 return -ENOMEDIUM;
2932 if (drv->bdrv_snapshot_list)
2933 return drv->bdrv_snapshot_list(bs, psn_info);
2934 if (bs->file)
2935 return bdrv_snapshot_list(bs->file, psn_info);
2936 return -ENOTSUP;
2937 }
2938
2939 int bdrv_snapshot_load_tmp(BlockDriverState *bs,
2940 const char *snapshot_name)
2941 {
2942 BlockDriver *drv = bs->drv;
2943 if (!drv) {
2944 return -ENOMEDIUM;
2945 }
2946 if (!bs->read_only) {
2947 return -EINVAL;
2948 }
2949 if (drv->bdrv_snapshot_load_tmp) {
2950 return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
2951 }
2952 return -ENOTSUP;
2953 }
2954
2955 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
2956 const char *backing_file)
2957 {
2958 if (!bs->drv) {
2959 return NULL;
2960 }
2961
2962 if (bs->backing_hd) {
2963 if (strcmp(bs->backing_file, backing_file) == 0) {
2964 return bs->backing_hd;
2965 } else {
2966 return bdrv_find_backing_image(bs->backing_hd, backing_file);
2967 }
2968 }
2969
2970 return NULL;
2971 }
2972
2973 #define NB_SUFFIXES 4
2974
2975 char *get_human_readable_size(char *buf, int buf_size, int64_t size)
2976 {
2977 static const char suffixes[NB_SUFFIXES] = "KMGT";
2978 int64_t base;
2979 int i;
2980
2981 if (size <= 999) {
2982 snprintf(buf, buf_size, "%" PRId64, size);
2983 } else {
2984 base = 1024;
2985 for(i = 0; i < NB_SUFFIXES; i++) {
2986 if (size < (10 * base)) {
2987 snprintf(buf, buf_size, "%0.1f%c",
2988 (double)size / base,
2989 suffixes[i]);
2990 break;
2991 } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) {
2992 snprintf(buf, buf_size, "%" PRId64 "%c",
2993 ((size + (base >> 1)) / base),
2994 suffixes[i]);
2995 break;
2996 }
2997 base = base * 1024;
2998 }
2999 }
3000 return buf;
3001 }
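/* Examples of the formatting above: 999 -> "999", 1536 -> "1.5K",
 * 10240 -> "10K" (rounded via (size + base/2) / base) and
 * 1572864 -> "1.5M".
 */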
3002
3003 char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
3004 {
3005 char buf1[128], date_buf[128], clock_buf[128];
3006 #ifdef _WIN32
3007 struct tm *ptm;
3008 #else
3009 struct tm tm;
3010 #endif
3011 time_t ti;
3012 int64_t secs;
3013
3014 if (!sn) {
3015 snprintf(buf, buf_size,
3016 "%-10s%-20s%7s%20s%15s",
3017 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
3018 } else {
3019 ti = sn->date_sec;
3020 #ifdef _WIN32
3021 ptm = localtime(&ti);
3022 strftime(date_buf, sizeof(date_buf),
3023 "%Y-%m-%d %H:%M:%S", ptm);
3024 #else
3025 localtime_r(&ti, &tm);
3026 strftime(date_buf, sizeof(date_buf),
3027 "%Y-%m-%d %H:%M:%S", &tm);
3028 #endif
3029 secs = sn->vm_clock_nsec / 1000000000;
3030 snprintf(clock_buf, sizeof(clock_buf),
3031 "%02d:%02d:%02d.%03d",
3032 (int)(secs / 3600),
3033 (int)((secs / 60) % 60),
3034 (int)(secs % 60),
3035 (int)((sn->vm_clock_nsec / 1000000) % 1000));
3036 snprintf(buf, buf_size,
3037 "%-10s%-20s%7s%20s%15s",
3038 sn->id_str, sn->name,
3039 get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
3040 date_buf,
3041 clock_buf);
3042 }
3043 return buf;
3044 }
3045
3046 /**************************************************************/
3047 /* async I/Os */
3048
3049 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
3050 QEMUIOVector *qiov, int nb_sectors,
3051 BlockDriverCompletionFunc *cb, void *opaque)
3052 {
3053 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
3054
3055 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
3056 cb, opaque, false);
3057 }
3058
3059 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
3060 QEMUIOVector *qiov, int nb_sectors,
3061 BlockDriverCompletionFunc *cb, void *opaque)
3062 {
3063 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
3064
3065 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
3066 cb, opaque, true);
3067 }
3068
3069
3070 typedef struct MultiwriteCB {
3071 int error;
3072 int num_requests;
3073 int num_callbacks;
3074 struct {
3075 BlockDriverCompletionFunc *cb;
3076 void *opaque;
3077 QEMUIOVector *free_qiov;
3078 } callbacks[];
3079 } MultiwriteCB;
3080
3081 static void multiwrite_user_cb(MultiwriteCB *mcb)
3082 {
3083 int i;
3084
3085 for (i = 0; i < mcb->num_callbacks; i++) {
3086 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
3087 if (mcb->callbacks[i].free_qiov) {
3088 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
3089 }
3090 g_free(mcb->callbacks[i].free_qiov);
3091 }
3092 }
3093
3094 static void multiwrite_cb(void *opaque, int ret)
3095 {
3096 MultiwriteCB *mcb = opaque;
3097
3098 trace_multiwrite_cb(mcb, ret);
3099
3100 if (ret < 0 && !mcb->error) {
3101 mcb->error = ret;
3102 }
3103
3104 mcb->num_requests--;
3105 if (mcb->num_requests == 0) {
3106 multiwrite_user_cb(mcb);
3107 g_free(mcb);
3108 }
3109 }
3110
3111 static int multiwrite_req_compare(const void *a, const void *b)
3112 {
3113 const BlockRequest *req1 = a, *req2 = b;
3114
3115 /*
3116 * Note that we can't simply subtract req2->sector from req1->sector
3117 * here as that could overflow the return value.
3118 */
3119 if (req1->sector > req2->sector) {
3120 return 1;
3121 } else if (req1->sector < req2->sector) {
3122 return -1;
3123 } else {
3124 return 0;
3125 }
3126 }
3127
3128 /*
3129 * Takes a bunch of requests and tries to merge them. Returns the number of
3130 * requests that remain after merging.
3131 */
3132 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
3133 int num_reqs, MultiwriteCB *mcb)
3134 {
3135 int i, outidx;
3136
3137 // Sort requests by start sector
3138 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
3139
3140 // Check if adjacent requests touch the same clusters. If so, combine them,
3141 // filling up gaps with zero sectors.
3142 outidx = 0;
3143 for (i = 1; i < num_reqs; i++) {
3144 int merge = 0;
3145 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
3146
3147 // Handle exactly sequential writes and overlapping writes.
3148 if (reqs[i].sector <= oldreq_last) {
3149 merge = 1;
3150 }
3151
3152 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
3153 merge = 0;
3154 }
3155
3156 if (merge) {
3157 size_t size;
3158 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
3159 qemu_iovec_init(qiov,
3160 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
3161
3162 // Add the first request to the merged one. If the requests are
3163 // overlapping, drop the last sectors of the first request.
3164 size = (reqs[i].sector - reqs[outidx].sector) << 9;
3165 qemu_iovec_concat(qiov, reqs[outidx].qiov, size);
3166
3167 // We shouldn't need to add any zeros between the two requests
3168 assert(reqs[i].sector <= oldreq_last);
3169
3170 // Add the second request
3171 qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);
3172
3173 reqs[outidx].nb_sectors = qiov->size >> 9;
3174 reqs[outidx].qiov = qiov;
3175
3176 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
3177 } else {
3178 outidx++;
3179 reqs[outidx].sector = reqs[i].sector;
3180 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
3181 reqs[outidx].qiov = reqs[i].qiov;
3182 }
3183 }
3184
3185 return outidx + 1;
3186 }
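/* Worked example of the merge: writes at sectors [0, 8) and [8, 24) are
 * sorted, found adjacent (reqs[1].sector == oldreq_last == 8) and combined
 * into one request covering [0, 24) whose qiov concatenates both original
 * vectors; multiwrite_merge() then returns 1.
 */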
3187
3188 /*
3189 * Submit multiple AIO write requests at once.
3190 *
3191 * On success, the function returns 0 and all requests in the reqs array have
3192 * been submitted. In the error case this function returns -1, and any of the
3193 * requests may or may not have been submitted yet. In particular, this means that the
3194 * callback will be called for some of the requests, for others it won't. The
3195 * caller must check the error field of the BlockRequest to wait for the right
3196 * callbacks (if error != 0, no callback will be called).
3197 *
3198 * The implementation may modify the contents of the reqs array, e.g. to merge
3199 * requests. However, the fields opaque and error are left unmodified as they
3200 * are used to signal failure for a single request to the caller.
3201 */
3202 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
3203 {
3204 MultiwriteCB *mcb;
3205 int i;
3206
3207 /* don't submit writes if we don't have a medium */
3208 if (bs->drv == NULL) {
3209 for (i = 0; i < num_reqs; i++) {
3210 reqs[i].error = -ENOMEDIUM;
3211 }
3212 return -1;
3213 }
3214
3215 if (num_reqs == 0) {
3216 return 0;
3217 }
3218
3219 // Create MultiwriteCB structure
3220 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
3221 mcb->num_requests = 0;
3222 mcb->num_callbacks = num_reqs;
3223
3224 for (i = 0; i < num_reqs; i++) {
3225 mcb->callbacks[i].cb = reqs[i].cb;
3226 mcb->callbacks[i].opaque = reqs[i].opaque;
3227 }
3228
3229 // Check for mergeable requests
3230 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
3231
3232 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
3233
3234 /* Run the aio requests. */
3235 mcb->num_requests = num_reqs;
3236 for (i = 0; i < num_reqs; i++) {
3237 bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
3238 reqs[i].nb_sectors, multiwrite_cb, mcb);
3239 }
3240
3241 return 0;
3242 }
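/* A minimal submission sketch; the qiovs, callback and contexts are
 * assumed to be set up elsewhere and the names are illustrative:
 *
 *     BlockRequest reqs[2] = {
 *         { .sector = 0, .nb_sectors = 8,  .qiov = &qiov0,
 *           .cb = done_cb, .opaque = ctx0 },
 *         { .sector = 8, .nb_sectors = 16, .qiov = &qiov1,
 *           .cb = done_cb, .opaque = ctx1 },
 *     };
 *     if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *         handle_submit_failure(reqs, 2);
 *     }
 *
 * On failure, reqs[i].error must be checked to see which callbacks will
 * still be invoked (handle_submit_failure is a hypothetical helper).
 */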
3243
3244 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
3245 {
3246 acb->pool->cancel(acb);
3247 }
3248
3249 /* block I/O throttling */
3250 static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
3251 bool is_write, double elapsed_time, uint64_t *wait)
3252 {
3253 uint64_t bps_limit = 0;
3254 double bytes_limit, bytes_base, bytes_res;
3255 double slice_time, wait_time;
3256
3257 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3258 bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
3259 } else if (bs->io_limits.bps[is_write]) {
3260 bps_limit = bs->io_limits.bps[is_write];
3261 } else {
3262 if (wait) {
3263 *wait = 0;
3264 }
3265
3266 return false;
3267 }
3268
3269 slice_time = bs->slice_end - bs->slice_start;
3270 slice_time /= (NANOSECONDS_PER_SECOND);
3271 bytes_limit = bps_limit * slice_time;
3272 bytes_base = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
3273 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3274 bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
3275 }
3276
3277 /* bytes_base: the bytes of data which have already been read/written,
3278 * obtained from the historical statistics.
3279 * bytes_res: the remaining bytes of data which need to be read/written.
3280 * (bytes_base + bytes_res) / bps_limit: used to calculate
3281 * the total time for completing the reading/writing of all data.
3282 */
3283 bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
3284
3285 if (bytes_base + bytes_res <= bytes_limit) {
3286 if (wait) {
3287 *wait = 0;
3288 }
3289
3290 return false;
3291 }
3292
3293 /* Calc approx time to dispatch */
3294 wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
3295
3296 /* When the I/O rate at runtime exceeds the limits,
3297 * bs->slice_end needs to be extended so that the current statistics
3298 * can be kept until the timer fires; the extension is increased and
3299 * tuned based on experimental results.
3300 */
3301 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3302 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3303 if (wait) {
3304 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3305 }
3306
3307 return true;
3308 }
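/* Numeric example of the check above: with bps_limit = 1 MB/s and a slice
 * of 0.1 s, bytes_limit is 100 KB. If 90 KB have already been accounted to
 * this slice and a 32 KB request arrives, bytes_base + bytes_res = 122 KB
 * exceeds the limit, so the request must wait roughly
 * (122 KB / 1 MB/s) - elapsed_time seconds before it can be dispatched.
 */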
3309
3310 static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
3311 double elapsed_time, uint64_t *wait)
3312 {
3313 uint64_t iops_limit = 0;
3314 double ios_limit, ios_base;
3315 double slice_time, wait_time;
3316
3317 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3318 iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
3319 } else if (bs->io_limits.iops[is_write]) {
3320 iops_limit = bs->io_limits.iops[is_write];
3321 } else {
3322 if (wait) {
3323 *wait = 0;
3324 }
3325
3326 return false;
3327 }
3328
3329 slice_time = bs->slice_end - bs->slice_start;
3330 slice_time /= (NANOSECONDS_PER_SECOND);
3331 ios_limit = iops_limit * slice_time;
3332 ios_base = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
3333 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3334 ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
3335 }
3336
3337 if (ios_base + 1 <= ios_limit) {
3338 if (wait) {
3339 *wait = 0;
3340 }
3341
3342 return false;
3343 }
3344
3345 /* Calc approx time to dispatch */
3346 wait_time = (ios_base + 1) / iops_limit;
3347 if (wait_time > elapsed_time) {
3348 wait_time = wait_time - elapsed_time;
3349 } else {
3350 wait_time = 0;
3351 }
3352
3353 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3354 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3355 if (wait) {
3356 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3357 }
3358
3359 return true;
3360 }
3361
3362 static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
3363 bool is_write, int64_t *wait)
3364 {
3365 int64_t now, max_wait;
3366 uint64_t bps_wait = 0, iops_wait = 0;
3367 double elapsed_time;
3368 int bps_ret, iops_ret;
3369
3370 now = qemu_get_clock_ns(vm_clock);
3371 if ((bs->slice_start < now)
3372 && (bs->slice_end > now)) {
3373 bs->slice_end = now + bs->slice_time;
3374 } else {
3375 bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
3376 bs->slice_start = now;
3377 bs->slice_end = now + bs->slice_time;
3378
3379 bs->io_base.bytes[is_write] = bs->nr_bytes[is_write];
3380 bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];
3381
3382 bs->io_base.ios[is_write] = bs->nr_ops[is_write];
3383 bs->io_base.ios[!is_write] = bs->nr_ops[!is_write];
3384 }
3385
3386 elapsed_time = now - bs->slice_start;
3387 elapsed_time /= (NANOSECONDS_PER_SECOND);
3388
3389 bps_ret = bdrv_exceed_bps_limits(bs, nb_sectors,
3390 is_write, elapsed_time, &bps_wait);
3391 iops_ret = bdrv_exceed_iops_limits(bs, is_write,
3392 elapsed_time, &iops_wait);
3393 if (bps_ret || iops_ret) {
3394 max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
3395 if (wait) {
3396 *wait = max_wait;
3397 }
3398
3399 now = qemu_get_clock_ns(vm_clock);
3400 if (bs->slice_end < now + max_wait) {
3401 bs->slice_end = now + max_wait;
3402 }
3403
3404 return true;
3405 }
3406
3407 if (wait) {
3408 *wait = 0;
3409 }
3410
3411 return false;
3412 }
3413
3414 /**************************************************************/
3415 /* async block device emulation */
3416
3417 typedef struct BlockDriverAIOCBSync {
3418 BlockDriverAIOCB common;
3419 QEMUBH *bh;
3420 int ret;
3421 /* vector translation state */
3422 QEMUIOVector *qiov;
3423 uint8_t *bounce;
3424 int is_write;
3425 } BlockDriverAIOCBSync;
3426
3427 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
3428 {
3429 BlockDriverAIOCBSync *acb =
3430 container_of(blockacb, BlockDriverAIOCBSync, common);
3431 qemu_bh_delete(acb->bh);
3432 acb->bh = NULL;
3433 qemu_aio_release(acb);
3434 }
3435
3436 static AIOPool bdrv_em_aio_pool = {
3437 .aiocb_size = sizeof(BlockDriverAIOCBSync),
3438 .cancel = bdrv_aio_cancel_em,
3439 };
3440
3441 static void bdrv_aio_bh_cb(void *opaque)
3442 {
3443 BlockDriverAIOCBSync *acb = opaque;
3444
3445 if (!acb->is_write)
3446 qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
3447 qemu_vfree(acb->bounce);
3448 acb->common.cb(acb->common.opaque, acb->ret);
3449 qemu_bh_delete(acb->bh);
3450 acb->bh = NULL;
3451 qemu_aio_release(acb);
3452 }
3453
3454 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
3455 int64_t sector_num,
3456 QEMUIOVector *qiov,
3457 int nb_sectors,
3458 BlockDriverCompletionFunc *cb,
3459 void *opaque,
3460 int is_write)
3462 {
3463 BlockDriverAIOCBSync *acb;
3464
3465 acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
3466 acb->is_write = is_write;
3467 acb->qiov = qiov;
3468 acb->bounce = qemu_blockalign(bs, qiov->size);
3469 acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
3470
3471 if (is_write) {
3472 qemu_iovec_to_buffer(acb->qiov, acb->bounce);
3473 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
3474 } else {
3475 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
3476 }
3477
3478 qemu_bh_schedule(acb->bh);
3479
3480 return &acb->common;
3481 }
3482
3483 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
3484 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3485 BlockDriverCompletionFunc *cb, void *opaque)
3486 {
3487 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
3488 }
3489
3490 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
3491 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3492 BlockDriverCompletionFunc *cb, void *opaque)
3493 {
3494 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
3495 }
3496
3497
3498 typedef struct BlockDriverAIOCBCoroutine {
3499 BlockDriverAIOCB common;
3500 BlockRequest req;
3501 bool is_write;
3502 QEMUBH* bh;
3503 } BlockDriverAIOCBCoroutine;
3504
3505 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
3506 {
3507 qemu_aio_flush();
3508 }
3509
3510 static AIOPool bdrv_em_co_aio_pool = {
3511 .aiocb_size = sizeof(BlockDriverAIOCBCoroutine),
3512 .cancel = bdrv_aio_co_cancel_em,
3513 };
3514
3515 static void bdrv_co_em_bh(void *opaque)
3516 {
3517 BlockDriverAIOCBCoroutine *acb = opaque;
3518
3519 acb->common.cb(acb->common.opaque, acb->req.error);
3520 qemu_bh_delete(acb->bh);
3521 qemu_aio_release(acb);
3522 }
3523
3524 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
3525 static void coroutine_fn bdrv_co_do_rw(void *opaque)
3526 {
3527 BlockDriverAIOCBCoroutine *acb = opaque;
3528 BlockDriverState *bs = acb->common.bs;
3529
3530 if (!acb->is_write) {
3531 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
3532 acb->req.nb_sectors, acb->req.qiov, 0);
3533 } else {
3534 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
3535 acb->req.nb_sectors, acb->req.qiov, 0);
3536 }
3537
3538 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3539 qemu_bh_schedule(acb->bh);
3540 }
3541
3542 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
3543 int64_t sector_num,
3544 QEMUIOVector *qiov,
3545 int nb_sectors,
3546 BlockDriverCompletionFunc *cb,
3547 void *opaque,
3548 bool is_write)
3549 {
3550 Coroutine *co;
3551 BlockDriverAIOCBCoroutine *acb;
3552
3553 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3554 acb->req.sector = sector_num;
3555 acb->req.nb_sectors = nb_sectors;
3556 acb->req.qiov = qiov;
3557 acb->is_write = is_write;
3558
3559 co = qemu_coroutine_create(bdrv_co_do_rw);
3560 qemu_coroutine_enter(co, acb);
3561
3562 return &acb->common;
3563 }
3564
3565 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
3566 {
3567 BlockDriverAIOCBCoroutine *acb = opaque;
3568 BlockDriverState *bs = acb->common.bs;
3569
3570 acb->req.error = bdrv_co_flush(bs);
3571 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3572 qemu_bh_schedule(acb->bh);
3573 }
3574
3575 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
3576 BlockDriverCompletionFunc *cb, void *opaque)
3577 {
3578 trace_bdrv_aio_flush(bs, opaque);
3579
3580 Coroutine *co;
3581 BlockDriverAIOCBCoroutine *acb;
3582
3583 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3584 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
3585 qemu_coroutine_enter(co, acb);
3586
3587 return &acb->common;
3588 }
3589
3590 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
3591 {
3592 BlockDriverAIOCBCoroutine *acb = opaque;
3593 BlockDriverState *bs = acb->common.bs;
3594
3595 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
3596 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3597 qemu_bh_schedule(acb->bh);
3598 }
3599
3600 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
3601 int64_t sector_num, int nb_sectors,
3602 BlockDriverCompletionFunc *cb, void *opaque)
3603 {
3604 Coroutine *co;
3605 BlockDriverAIOCBCoroutine *acb;
3606
3607 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
3608
3609 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3610 acb->req.sector = sector_num;
3611 acb->req.nb_sectors = nb_sectors;
3612 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
3613 qemu_coroutine_enter(co, acb);
3614
3615 return &acb->common;
3616 }
3617
3618 void bdrv_init(void)
3619 {
3620 module_call_init(MODULE_INIT_BLOCK);
3621 }
3622
3623 void bdrv_init_with_whitelist(void)
3624 {
3625 use_bdrv_whitelist = 1;
3626 bdrv_init();
3627 }
3628
3629 void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
3630 BlockDriverCompletionFunc *cb, void *opaque)
3631 {
3632 BlockDriverAIOCB *acb;
3633
3634 if (pool->free_aiocb) {
3635 acb = pool->free_aiocb;
3636 pool->free_aiocb = acb->next;
3637 } else {
3638 acb = g_malloc0(pool->aiocb_size);
3639 acb->pool = pool;
3640 }
3641 acb->bs = bs;
3642 acb->cb = cb;
3643 acb->opaque = opaque;
3644 return acb;
3645 }
3646
3647 void qemu_aio_release(void *p)
3648 {
3649 BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
3650 AIOPool *pool = acb->pool;
3651 acb->next = pool->free_aiocb;
3652 pool->free_aiocb = acb;
3653 }
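/* qemu_aio_get()/qemu_aio_release() implement a simple per-pool LIFO free
 * list: released AIOCBs are pushed onto pool->free_aiocb and reused by the
 * next allocation, avoiding a g_malloc0() per request on hot paths.
 */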
3654
3655 /**************************************************************/
3656 /* Coroutine block device emulation */
3657
3658 typedef struct CoroutineIOCompletion {
3659 Coroutine *coroutine;
3660 int ret;
3661 } CoroutineIOCompletion;
3662
3663 static void bdrv_co_io_em_complete(void *opaque, int ret)
3664 {
3665 CoroutineIOCompletion *co = opaque;
3666
3667 co->ret = ret;
3668 qemu_coroutine_enter(co->coroutine, NULL);
3669 }
3670
3671 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
3672 int nb_sectors, QEMUIOVector *iov,
3673 bool is_write)
3674 {
3675 CoroutineIOCompletion co = {
3676 .coroutine = qemu_coroutine_self(),
3677 };
3678 BlockDriverAIOCB *acb;
3679
3680 if (is_write) {
3681 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
3682 bdrv_co_io_em_complete, &co);
3683 } else {
3684 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
3685 bdrv_co_io_em_complete, &co);
3686 }
3687
3688 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
3689 if (!acb) {
3690 return -EIO;
3691 }
3692 qemu_coroutine_yield();
3693
3694 return co.ret;
3695 }
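/* The handshake above: the coroutine submits the AIO request with
 * bdrv_co_io_em_complete() as the completion callback and yields; when the
 * AIO layer invokes the callback, it stores the return value in co.ret and
 * re-enters the coroutine, which resumes after qemu_coroutine_yield() and
 * returns co.ret to its caller.
 */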
3696
3697 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
3698 int64_t sector_num, int nb_sectors,
3699 QEMUIOVector *iov)
3700 {
3701 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
3702 }
3703
3704 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
3705 int64_t sector_num, int nb_sectors,
3706 QEMUIOVector *iov)
3707 {
3708 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
3709 }
3710
3711 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
3712 {
3713 RwCo *rwco = opaque;
3714
3715 rwco->ret = bdrv_co_flush(rwco->bs);
3716 }
3717
3718 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
3719 {
3720 int ret;
3721
3722 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
3723 return 0;
3724 }
3725
3726 /* Write back cached data to the OS even with cache=unsafe */
3727 if (bs->drv->bdrv_co_flush_to_os) {
3728 ret = bs->drv->bdrv_co_flush_to_os(bs);
3729 if (ret < 0) {
3730 return ret;
3731 }
3732 }
3733
3734 /* But don't actually force it to the disk with cache=unsafe */
3735 if (bs->open_flags & BDRV_O_NO_FLUSH) {
3736 return 0;
3737 }
3738
3739 if (bs->drv->bdrv_co_flush_to_disk) {
3740 ret = bs->drv->bdrv_co_flush_to_disk(bs);
3741 } else if (bs->drv->bdrv_aio_flush) {
3742 BlockDriverAIOCB *acb;
3743 CoroutineIOCompletion co = {
3744 .coroutine = qemu_coroutine_self(),
3745 };
3746
3747 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
3748 if (acb == NULL) {
3749 ret = -EIO;
3750 } else {
3751 qemu_coroutine_yield();
3752 ret = co.ret;
3753 }
3754 } else {
3755 /*
3756 * Some block drivers always operate in either writethrough or unsafe
3757 * mode and therefore don't support bdrv_flush. Usually qemu doesn't
3758 * know how the server works (because the behaviour is hardcoded or
3759 * depends on server-side configuration), so we can't ensure that
3760 * everything is safe on disk. Returning an error doesn't work because
3761 * that would break guests even if the server operates in writethrough
3762 * mode.
3763 *
3764 * Let's hope the user knows what he's doing.
3765 */
3766 ret = 0;
3767 }
3768 if (ret < 0) {
3769 return ret;
3770 }
3771
3772 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
3773 * in the case of cache=unsafe, so there are no useless flushes.
3774 */
3775 return bdrv_co_flush(bs->file);
3776 }
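/* The resulting flush order is: format driver cache to the OS
 * (bdrv_co_flush_to_os), then, unless BDRV_O_NO_FLUSH is set, OS cache to
 * the disk (bdrv_co_flush_to_disk or bdrv_aio_flush), and finally the same
 * sequence recursively for the underlying protocol via bs->file.
 */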
3777
3778 void bdrv_invalidate_cache(BlockDriverState *bs)
3779 {
3780 if (bs->drv && bs->drv->bdrv_invalidate_cache) {
3781 bs->drv->bdrv_invalidate_cache(bs);
3782 }
3783 }
3784
3785 void bdrv_invalidate_cache_all(void)
3786 {
3787 BlockDriverState *bs;
3788
3789 QTAILQ_FOREACH(bs, &bdrv_states, list) {
3790 bdrv_invalidate_cache(bs);
3791 }
3792 }
3793
3794 void bdrv_clear_incoming_migration_all(void)
3795 {
3796 BlockDriverState *bs;
3797
3798 QTAILQ_FOREACH(bs, &bdrv_states, list) {
3799 bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
3800 }
3801 }
3802
3803 int bdrv_flush(BlockDriverState *bs)
3804 {
3805 Coroutine *co;
3806 RwCo rwco = {
3807 .bs = bs,
3808 .ret = NOT_DONE,
3809 };
3810
3811 if (qemu_in_coroutine()) {
3812 /* Fast-path if already in coroutine context */
3813 bdrv_flush_co_entry(&rwco);
3814 } else {
3815 co = qemu_coroutine_create(bdrv_flush_co_entry);
3816 qemu_coroutine_enter(co, &rwco);
3817 while (rwco.ret == NOT_DONE) {
3818 qemu_aio_wait();
3819 }
3820 }
3821
3822 return rwco.ret;
3823 }
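/* This create-enter-and-wait pattern is how the synchronous block API is
 * layered on top of the coroutine API: outside coroutine context the
 * caller spawns the coroutine and pumps the event loop with
 * qemu_aio_wait() until the RwCo result changes from NOT_DONE.
 */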
3824
3825 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
3826 {
3827 RwCo *rwco = opaque;
3828
3829 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
3830 }
3831
3832 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
3833 int nb_sectors)
3834 {
3835 if (!bs->drv) {
3836 return -ENOMEDIUM;
3837 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
3838 return -EIO;
3839 } else if (bs->read_only) {
3840 return -EROFS;
3841 } else if (bs->drv->bdrv_co_discard) {
3842 return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
3843 } else if (bs->drv->bdrv_aio_discard) {
3844 BlockDriverAIOCB *acb;
3845 CoroutineIOCompletion co = {
3846 .coroutine = qemu_coroutine_self(),
3847 };
3848
3849 acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
3850 bdrv_co_io_em_complete, &co);
3851 if (acb == NULL) {
3852 return -EIO;
3853 } else {
3854 qemu_coroutine_yield();
3855 return co.ret;
3856 }
3857 } else {
3858 return 0;
3859 }
3860 }
3861
3862 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
3863 {
3864 Coroutine *co;
3865 RwCo rwco = {
3866 .bs = bs,
3867 .sector_num = sector_num,
3868 .nb_sectors = nb_sectors,
3869 .ret = NOT_DONE,
3870 };
3871
3872 if (qemu_in_coroutine()) {
3873 /* Fast-path if already in coroutine context */
3874 bdrv_discard_co_entry(&rwco);
3875 } else {
3876 co = qemu_coroutine_create(bdrv_discard_co_entry);
3877 qemu_coroutine_enter(co, &rwco);
3878 while (rwco.ret == NOT_DONE) {
3879 qemu_aio_wait();
3880 }
3881 }
3882
3883 return rwco.ret;
3884 }
3885
3886 /**************************************************************/
3887 /* removable device support */
3888
3889 /**
3890 * Return TRUE if the media is present
3891 */
3892 int bdrv_is_inserted(BlockDriverState *bs)
3893 {
3894 BlockDriver *drv = bs->drv;
3895
3896 if (!drv)
3897 return 0;
3898 if (!drv->bdrv_is_inserted)
3899 return 1;
3900 return drv->bdrv_is_inserted(bs);
3901 }
3902
3903 /**
3904 * Return whether the media changed since the last call to this
3905 * function, or -ENOTSUP if we don't know. Most drivers don't know.
3906 */
3907 int bdrv_media_changed(BlockDriverState *bs)
3908 {
3909 BlockDriver *drv = bs->drv;
3910
3911 if (drv && drv->bdrv_media_changed) {
3912 return drv->bdrv_media_changed(bs);
3913 }
3914 return -ENOTSUP;
3915 }
3916
3917 /**
3918 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
3919 */
3920 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
3921 {
3922 BlockDriver *drv = bs->drv;
3923
3924 if (drv && drv->bdrv_eject) {
3925 drv->bdrv_eject(bs, eject_flag);
3926 }
3927
3928 if (bs->device_name[0] != '\0') {
3929 bdrv_emit_qmp_eject_event(bs, eject_flag);
3930 }
3931 }
3932
3933 /**
3934 * Lock or unlock the media (if it is locked, the user won't be able
3935 * to eject it manually).
3936 */
3937 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
3938 {
3939 BlockDriver *drv = bs->drv;
3940
3941 trace_bdrv_lock_medium(bs, locked);
3942
3943 if (drv && drv->bdrv_lock_medium) {
3944 drv->bdrv_lock_medium(bs, locked);
3945 }
3946 }
3947
3948 /* needed for generic scsi interface */
3949
3950 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
3951 {
3952 BlockDriver *drv = bs->drv;
3953
3954 if (drv && drv->bdrv_ioctl)
3955 return drv->bdrv_ioctl(bs, req, buf);
3956 return -ENOTSUP;
3957 }
3958
3959 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
3960 unsigned long int req, void *buf,
3961 BlockDriverCompletionFunc *cb, void *opaque)
3962 {
3963 BlockDriver *drv = bs->drv;
3964
3965 if (drv && drv->bdrv_aio_ioctl)
3966 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
3967 return NULL;
3968 }
3969
3970 void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
3971 {
3972 bs->buffer_alignment = align;
3973 }
3974
3975 void *qemu_blockalign(BlockDriverState *bs, size_t size)
3976 {
3977 return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
3978 }
3979
3980 void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
3981 {
3982 int64_t bitmap_size;
3983
3984 bs->dirty_count = 0;
3985 if (enable) {
3986 if (!bs->dirty_bitmap) {
3987 bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
3988 BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG - 1;
3989 bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG;
3990
3991 bs->dirty_bitmap = g_new0(unsigned long, bitmap_size);
3992 }
3993 } else {
3994 if (bs->dirty_bitmap) {
3995 g_free(bs->dirty_bitmap);
3996 bs->dirty_bitmap = NULL;
3997 }
3998 }
3999 }
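/* Sizing example for the bitmap above, assuming
 * BDRV_SECTORS_PER_DIRTY_CHUNK == 2048 and BITS_PER_LONG == 64: one
 * unsigned long covers 2048 * 64 = 131072 sectors (64 MiB of data), so a
 * 10 GiB image (20971520 sectors) needs 160 longs.
 */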
4000
4001 int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
4002 {
4003 int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
4004
4005 if (bs->dirty_bitmap &&
4006 (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
4007 return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
4008 (1UL << (chunk % (sizeof(unsigned long) * 8))));
4009 } else {
4010 return 0;
4011 }
4012 }
4013
4014 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
4015 int nr_sectors)
4016 {
4017 set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
4018 }
4019
4020 int64_t bdrv_get_dirty_count(BlockDriverState *bs)
4021 {
4022 return bs->dirty_count;
4023 }
4024
4025 void bdrv_set_in_use(BlockDriverState *bs, int in_use)
4026 {
4027 assert(bs->in_use != in_use);
4028 bs->in_use = in_use;
4029 }
4030
4031 int bdrv_in_use(BlockDriverState *bs)
4032 {
4033 return bs->in_use;
4034 }
4035
4036 void bdrv_iostatus_enable(BlockDriverState *bs)
4037 {
4038 bs->iostatus_enabled = true;
4039 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
4040 }
4041
4042 /* The I/O status is only enabled if the drive explicitly
4043 * enables it _and_ the VM is configured to stop on errors */
4044 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
4045 {
4046 return (bs->iostatus_enabled &&
4047 (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
4048 bs->on_write_error == BLOCK_ERR_STOP_ANY ||
4049 bs->on_read_error == BLOCK_ERR_STOP_ANY));
4050 }
4051
4052 void bdrv_iostatus_disable(BlockDriverState *bs)
4053 {
4054 bs->iostatus_enabled = false;
4055 }
4056
4057 void bdrv_iostatus_reset(BlockDriverState *bs)
4058 {
4059 if (bdrv_iostatus_is_enabled(bs)) {
4060 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
4061 }
4062 }
4063
4064 /* XXX: Today this is set by device models because it makes the implementation
4065 quite simple. However, the block layer knows about the error, so it's
4066 possible to implement this without device models being involved */
4067 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
4068 {
4069 if (bdrv_iostatus_is_enabled(bs) &&
4070 bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
4071 assert(error >= 0);
4072 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
4073 BLOCK_DEVICE_IO_STATUS_FAILED;
4074 }
4075 }

void
bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
        enum BlockAcctType type)
{
    assert(type < BDRV_MAX_IOTYPE);

    cookie->bytes = bytes;
    cookie->start_time_ns = get_clock();
    cookie->type = type;
}

void
bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
{
    assert(cookie->type < BDRV_MAX_IOTYPE);

    bs->nr_bytes[cookie->type] += cookie->bytes;
    bs->nr_ops[cookie->type]++;
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
}
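/*
 * Usage sketch (illustrative): device models bracket each request with a
 * start/done pair so query-blockstats can report per-type byte counts,
 * operation counts and cumulative latency:
 *
 *     BlockAcctCookie cookie;
 *
 *     bdrv_acct_start(bs, &cookie, nb_sectors * BDRV_SECTOR_SIZE,
 *                     BDRV_ACCT_READ);
 *     ret = bdrv_read(bs, sector_num, buf, nb_sectors);
 *     bdrv_acct_done(bs, &cookie);
 */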

int bdrv_img_create(const char *filename, const char *fmt,
                    const char *base_filename, const char *base_fmt,
                    char *options, uint64_t img_size, int flags)
{
    QEMUOptionParameter *param = NULL, *create_options = NULL;
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
    BlockDriverState *bs = NULL;
    BlockDriver *drv, *proto_drv;
    BlockDriver *backing_drv = NULL;
    int ret = 0;

    /* Find driver and parse its options */
    drv = bdrv_find_format(fmt);
    if (!drv) {
        error_report("Unknown file format '%s'", fmt);
        ret = -EINVAL;
        goto out;
    }

    proto_drv = bdrv_find_protocol(filename);
    if (!proto_drv) {
        error_report("Unknown protocol '%s'", filename);
        ret = -EINVAL;
        goto out;
    }

    create_options = append_option_parameters(create_options,
                                              drv->create_options);
    create_options = append_option_parameters(create_options,
                                              proto_drv->create_options);

    /* Create parameter list with default values */
    param = parse_option_parameters("", create_options, param);

    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);

    /* Parse -o options */
    if (options) {
        param = parse_option_parameters(options, create_options, param);
        if (param == NULL) {
            error_report("Invalid options for file format '%s'.", fmt);
            ret = -EINVAL;
            goto out;
        }
    }

    if (base_filename) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
                                 base_filename)) {
            error_report("Backing file not supported for file format '%s'",
                         fmt);
            ret = -EINVAL;
            goto out;
        }
    }

    if (base_fmt) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
            error_report("Backing file format not supported for file "
                         "format '%s'", fmt);
            ret = -EINVAL;
            goto out;
        }
    }

    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
    if (backing_file && backing_file->value.s) {
        if (!strcmp(filename, backing_file->value.s)) {
            error_report("Trying to create an image with the "
                         "same filename as the backing file");
            ret = -EINVAL;
            goto out;
        }
    }

    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
    if (backing_fmt && backing_fmt->value.s) {
        backing_drv = bdrv_find_format(backing_fmt->value.s);
        if (!backing_drv) {
            error_report("Unknown backing file format '%s'",
                         backing_fmt->value.s);
            ret = -EINVAL;
            goto out;
        }
    }

    /* The size for the image must always be specified, with one exception:
     * If we are using a backing file, we can obtain the size from there */
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
    if (size && size->value.n == -1) {
        if (backing_file && backing_file->value.s) {
            uint64_t backing_size;
            char buf[32];
            int back_flags;

            /* backing files always opened read-only */
            back_flags =
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

            bs = bdrv_new("");

            ret = bdrv_open(bs, backing_file->value.s, back_flags, backing_drv);
            if (ret < 0) {
                error_report("Could not open '%s'", backing_file->value.s);
                goto out;
            }
            bdrv_get_geometry(bs, &backing_size);   /* result is in sectors */
            backing_size *= BDRV_SECTOR_SIZE;

            snprintf(buf, sizeof(buf), "%" PRIu64, backing_size);
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);
        } else {
            error_report("Image creation needs a size parameter");
            ret = -EINVAL;
            goto out;
        }
    }

    printf("Formatting '%s', fmt=%s ", filename, fmt);
    print_option_parameters(param);
    puts("");

    ret = bdrv_create(drv, filename, param);

    if (ret < 0) {
        if (ret == -ENOTSUP) {
            error_report("Formatting or formatting option not supported for "
                         "file format '%s'", fmt);
        } else if (ret == -EFBIG) {
            error_report("The image size is too large for file format '%s'",
                         fmt);
        } else {
            error_report("%s: error while creating %s: %s", filename, fmt,
                         strerror(-ret));
        }
    }

out:
    free_option_parameters(create_options);
    free_option_parameters(param);

    if (bs) {
        bdrv_delete(bs);
    }

    return ret;
}
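/*
 * Usage sketch (illustrative; exact callers vary): qemu-img create and the
 * snapshot commands funnel into this helper.  A hypothetical call creating
 * a qcow2 overlay whose size is inherited from its backing file:
 *
 *     ret = bdrv_img_create("overlay.qcow2", "qcow2",
 *                           "base.img", NULL,   // backing file, fmt probed
 *                           NULL,               // no -o option string
 *                           -1,                 // size: inherit from backing
 *                           0);                 // flags
 *     if (ret < 0) {
 *         // the error has already been printed via error_report()
 *     }
 */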

void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs,
                       int64_t speed, BlockDriverCompletionFunc *cb,
                       void *opaque, Error **errp)
{
    BlockJob *job;

    if (bs->job || bdrv_in_use(bs)) {
        error_set(errp, QERR_DEVICE_IN_USE, bdrv_get_device_name(bs));
        return NULL;
    }
    bdrv_set_in_use(bs, 1);

    job = g_malloc0(job_type->instance_size);
    job->job_type = job_type;
    job->bs = bs;
    job->cb = cb;
    job->opaque = opaque;
    job->busy = true;
    bs->job = job;

    /* Only set speed when necessary to avoid NotSupported error */
    if (speed != 0) {
        Error *local_err = NULL;

        block_job_set_speed(job, speed, &local_err);
        if (error_is_set(&local_err)) {
            bs->job = NULL;
            g_free(job);
            bdrv_set_in_use(bs, 0);
            error_propagate(errp, local_err);
            return NULL;
        }
    }
    return job;
}
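/*
 * Usage sketch (illustrative; block/stream.c follows this pattern): a job
 * embeds BlockJob as the first member of its state struct and describes
 * itself with a BlockJobType:
 *
 *     typedef struct StreamBlockJob {
 *         BlockJob common;
 *         // ... job-specific state ...
 *     } StreamBlockJob;
 *
 *     static BlockJobType stream_job_type = {
 *         .instance_size = sizeof(StreamBlockJob),
 *         .job_type      = "stream",
 *         .set_speed     = stream_set_speed,
 *     };
 *
 *     s = block_job_create(&stream_job_type, bs, speed, cb, opaque, errp);
 *     if (!s) {
 *         // device already in use, errp set
 *     }
 */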

void block_job_complete(BlockJob *job, int ret)
{
    BlockDriverState *bs = job->bs;

    assert(bs->job == job);
    job->cb(job->opaque, ret);
    bs->job = NULL;
    g_free(job);
    bdrv_set_in_use(bs, 0);
}

void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
{
    Error *local_err = NULL;

    if (!job->job_type->set_speed) {
        error_set(errp, QERR_NOT_SUPPORTED);
        return;
    }
    job->job_type->set_speed(job, speed, &local_err);
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
        return;
    }

    job->speed = speed;
}

void block_job_cancel(BlockJob *job)
{
    job->cancelled = true;
    if (job->co && !job->busy) {
        /* The coroutine is sleeping in block_job_sleep_ns(); kick it so it
         * wakes up and notices the cancellation. */
        qemu_coroutine_enter(job->co, NULL);
    }
}

bool block_job_is_cancelled(BlockJob *job)
{
    return job->cancelled;
}

/* State shared between block_job_cancel_sync() and its interposed
 * completion callback. */
struct BlockCancelData {
    BlockJob *job;
    BlockDriverCompletionFunc *cb;
    void *opaque;
    bool cancelled;
    int ret;
};

static void block_job_cancel_cb(void *opaque, int ret)
{
    struct BlockCancelData *data = opaque;

    data->cancelled = block_job_is_cancelled(data->job);
    data->ret = ret;
    data->cb(data->opaque, ret);
}

int block_job_cancel_sync(BlockJob *job)
{
    struct BlockCancelData data;
    BlockDriverState *bs = job->bs;

    assert(bs->job == job);

    /* Set up our own callback to store the result and chain to
     * the original callback.
     */
    data.job = job;
    data.cb = job->cb;
    data.opaque = job->opaque;
    data.ret = -EINPROGRESS;
    job->cb = block_job_cancel_cb;
    job->opaque = &data;
    block_job_cancel(job);
    while (data.ret == -EINPROGRESS) {
        qemu_aio_wait();
    }
    return (data.cancelled && data.ret == 0) ? -ECANCELED : data.ret;
}
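/*
 * Usage sketch (illustrative): callers that must get rid of a job
 * synchronously, for instance before detaching a device, can do:
 *
 *     if (bs->job) {
 *         ret = block_job_cancel_sync(bs->job);
 *         // -ECANCELED: the job was cancelled before completing;
 *         // anything else is the job's own completion code.
 *     }
 */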

/* Yield and sleep for @ns nanoseconds on @clock, unless the job has already
 * been cancelled, in which case return immediately. */
void block_job_sleep_ns(BlockJob *job, QEMUClock *clock, int64_t ns)
{
    /* Check cancellation *before* setting busy = false, too! */
    if (!block_job_is_cancelled(job)) {
        job->busy = false;
        co_sleep_ns(clock, ns);
        job->busy = true;
    }
}
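/*
 * Usage sketch (illustrative): a job coroutine alternates work with
 * rate-limited sleeps and polls for cancellation on every iteration:
 *
 *     for (;;) {
 *         block_job_sleep_ns(job, rt_clock, delay_ns);
 *         if (block_job_is_cancelled(job)) {
 *             break;
 *         }
 *         // ... copy the next chunk of data ...
 *     }
 *     block_job_complete(job, ret);
 */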