1 /*
2 * QEMU System Emulator block driver
3 *
4 * Copyright (c) 2003 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24 #include "config-host.h"
25 #include "qemu-common.h"
26 #include "trace.h"
27 #include "monitor.h"
28 #include "block_int.h"
29 #include "module.h"
30 #include "qjson.h"
31 #include "qemu-coroutine.h"
32 #include "qmp-commands.h"
33 #include "qemu-timer.h"
34
35 #ifdef CONFIG_BSD
36 #include <sys/types.h>
37 #include <sys/stat.h>
38 #include <sys/ioctl.h>
39 #include <sys/queue.h>
40 #ifndef __DragonFly__
41 #include <sys/disk.h>
42 #endif
43 #endif
44
45 #ifdef _WIN32
46 #include <windows.h>
47 #endif
48
49 #define NOT_DONE 0x7fffffff /* used while an emulated sync operation is in progress */
50
51 typedef enum {
52 BDRV_REQ_COPY_ON_READ = 0x1,
53 BDRV_REQ_ZERO_WRITE = 0x2,
54 } BdrvRequestFlags;
55
56 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
57 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
58 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
59 BlockDriverCompletionFunc *cb, void *opaque);
60 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
61 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
62 BlockDriverCompletionFunc *cb, void *opaque);
63 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
64 int64_t sector_num, int nb_sectors,
65 QEMUIOVector *iov);
66 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
67 int64_t sector_num, int nb_sectors,
68 QEMUIOVector *iov);
69 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
70 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
71 BdrvRequestFlags flags);
72 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
73 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
74 BdrvRequestFlags flags);
75 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
76 int64_t sector_num,
77 QEMUIOVector *qiov,
78 int nb_sectors,
79 BlockDriverCompletionFunc *cb,
80 void *opaque,
81 bool is_write);
82 static void coroutine_fn bdrv_co_do_rw(void *opaque);
83 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
84 int64_t sector_num, int nb_sectors);
85
86 static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
87 bool is_write, double elapsed_time, uint64_t *wait);
88 static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
89 double elapsed_time, uint64_t *wait);
90 static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
91 bool is_write, int64_t *wait);
92
93 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
94 QTAILQ_HEAD_INITIALIZER(bdrv_states);
95
96 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
97 QLIST_HEAD_INITIALIZER(bdrv_drivers);
98
99 /* The device to use for VM snapshots */
100 static BlockDriverState *bs_snapshots;
101
102 /* If non-zero, use only whitelisted block drivers */
103 static int use_bdrv_whitelist;
104
105 #ifdef _WIN32
106 static int is_windows_drive_prefix(const char *filename)
107 {
108 return (((filename[0] >= 'a' && filename[0] <= 'z') ||
109 (filename[0] >= 'A' && filename[0] <= 'Z')) &&
110 filename[1] == ':');
111 }
112
113 int is_windows_drive(const char *filename)
114 {
115 if (is_windows_drive_prefix(filename) &&
116 filename[2] == '\0')
117 return 1;
118 if (strstart(filename, "\\\\.\\", NULL) ||
119 strstart(filename, "//./", NULL))
120 return 1;
121 return 0;
122 }
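/* Illustrative results (hypothetical names): "d:" and "\\.\PhysicalDrive0"
 * are treated as Windows drives, while "d:\images\disk.img" is not, since
 * only a bare drive letter or a device namespace prefix qualifies. */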
123 #endif
124
125 /* throttling disk I/O limits */
126 void bdrv_io_limits_disable(BlockDriverState *bs)
127 {
128 bs->io_limits_enabled = false;
129
130 while (qemu_co_queue_next(&bs->throttled_reqs));
131
132 if (bs->block_timer) {
133 qemu_del_timer(bs->block_timer);
134 qemu_free_timer(bs->block_timer);
135 bs->block_timer = NULL;
136 }
137
138 bs->slice_start = 0;
139 bs->slice_end = 0;
140 bs->slice_time = 0;
141 memset(&bs->io_base, 0, sizeof(bs->io_base));
142 }
143
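/* Timer callback for I/O throttling: wake the next request waiting in
 * the throttled_reqs queue. */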
144 static void bdrv_block_timer(void *opaque)
145 {
146 BlockDriverState *bs = opaque;
147
148 qemu_co_queue_next(&bs->throttled_reqs);
149 }
150
151 void bdrv_io_limits_enable(BlockDriverState *bs)
152 {
153 qemu_co_queue_init(&bs->throttled_reqs);
154 bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
155 bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
156 bs->slice_start = qemu_get_clock_ns(vm_clock);
157 bs->slice_end = bs->slice_start + bs->slice_time;
158 memset(&bs->io_base, 0, sizeof(bs->io_base));
159 bs->io_limits_enabled = true;
160 }
161
162 bool bdrv_io_limits_enabled(BlockDriverState *bs)
163 {
164 BlockIOLimit *io_limits = &bs->io_limits;
165 return io_limits->bps[BLOCK_IO_LIMIT_READ]
166 || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
167 || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
168 || io_limits->iops[BLOCK_IO_LIMIT_READ]
169 || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
170 || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
171 }
172
173 static void bdrv_io_limits_intercept(BlockDriverState *bs,
174 bool is_write, int nb_sectors)
175 {
176 int64_t wait_time = -1;
177
178 if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
179 qemu_co_queue_wait(&bs->throttled_reqs);
180 }
181
182 /* We aim to preserve each request's ordering, in FIFO fashion. The next
183 * throttled request will not be dequeued until the current request is
184 * allowed to be serviced. So if the current request still exceeds the
185 * limits, it is reinserted at the head of the queue; all requests that
186 * followed it remain in the throttled_reqs queue.
187 */
188
189 while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
190 qemu_mod_timer(bs->block_timer,
191 wait_time + qemu_get_clock_ns(vm_clock));
192 qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
193 }
194
195 qemu_co_queue_next(&bs->throttled_reqs);
196 }
197
198 /* check if the path starts with "<protocol>:" */
199 static int path_has_protocol(const char *path)
200 {
201 #ifdef _WIN32
202 if (is_windows_drive(path) ||
203 is_windows_drive_prefix(path)) {
204 return 0;
205 }
206 #endif
207
208 return strchr(path, ':') != NULL;
209 }
210
211 int path_is_absolute(const char *path)
212 {
213 const char *p;
214 #ifdef _WIN32
215 /* specific case for names like: "\\.\d:" */
216 if (*path == '/' || *path == '\\')
217 return 1;
218 #endif
219 p = strchr(path, ':');
220 if (p)
221 p++;
222 else
223 p = path;
224 #ifdef _WIN32
225 return (*p == '/' || *p == '\\');
226 #else
227 return (*p == '/');
228 #endif
229 }
230
231 /* if filename is absolute, just copy it to dest. Otherwise, build a
232 path to it by treating it as relative to base_path. URLs are
233 supported. */
234 void path_combine(char *dest, int dest_size,
235 const char *base_path,
236 const char *filename)
237 {
238 const char *p, *p1;
239 int len;
240
241 if (dest_size <= 0)
242 return;
243 if (path_is_absolute(filename)) {
244 pstrcpy(dest, dest_size, filename);
245 } else {
246 p = strchr(base_path, ':');
247 if (p)
248 p++;
249 else
250 p = base_path;
251 p1 = strrchr(base_path, '/');
252 #ifdef _WIN32
253 {
254 const char *p2;
255 p2 = strrchr(base_path, '\\');
256 if (!p1 || p2 > p1)
257 p1 = p2;
258 }
259 #endif
260 if (p1)
261 p1++;
262 else
263 p1 = base_path;
264 if (p1 > p)
265 p = p1;
266 len = p - base_path;
267 if (len > dest_size - 1)
268 len = dest_size - 1;
269 memcpy(dest, base_path, len);
270 dest[len] = '\0';
271 pstrcat(dest, dest_size, filename);
272 }
273 }
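/* A minimal worked example (hypothetical paths): a relative backing file
 * name is resolved against the directory of the image that refers to it:
 *
 *     char dest[PATH_MAX];
 *     path_combine(dest, sizeof(dest),
 *                  "/images/vm/disk.qcow2", "base.qcow2");
 *     // dest now holds "/images/vm/base.qcow2"
 */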
274
275 void bdrv_register(BlockDriver *bdrv)
276 {
277 /* Block drivers without coroutine functions need emulation */
278 if (!bdrv->bdrv_co_readv) {
279 bdrv->bdrv_co_readv = bdrv_co_readv_em;
280 bdrv->bdrv_co_writev = bdrv_co_writev_em;
281
282 /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
283 * the block driver lacks aio we need to emulate that too.
284 */
285 if (!bdrv->bdrv_aio_readv) {
286 /* add AIO emulation layer */
287 bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
288 bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
289 }
290 }
291
292 QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
293 }
294
295 /* create a new block device (by default it is empty) */
296 BlockDriverState *bdrv_new(const char *device_name)
297 {
298 BlockDriverState *bs;
299
300 bs = g_malloc0(sizeof(BlockDriverState));
301 pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
302 if (device_name[0] != '\0') {
303 QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
304 }
305 bdrv_iostatus_disable(bs);
306 return bs;
307 }
308
309 BlockDriver *bdrv_find_format(const char *format_name)
310 {
311 BlockDriver *drv1;
312 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
313 if (!strcmp(drv1->format_name, format_name)) {
314 return drv1;
315 }
316 }
317 return NULL;
318 }
319
320 static int bdrv_is_whitelisted(BlockDriver *drv)
321 {
322 static const char *whitelist[] = {
323 CONFIG_BDRV_WHITELIST
324 };
325 const char **p;
326
327 if (!whitelist[0])
328 return 1; /* no whitelist, anything goes */
329
330 for (p = whitelist; *p; p++) {
331 if (!strcmp(drv->format_name, *p)) {
332 return 1;
333 }
334 }
335 return 0;
336 }
337
338 BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
339 {
340 BlockDriver *drv = bdrv_find_format(format_name);
341 return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
342 }
343
344 typedef struct CreateCo {
345 BlockDriver *drv;
346 char *filename;
347 QEMUOptionParameter *options;
348 int ret;
349 } CreateCo;
350
351 static void coroutine_fn bdrv_create_co_entry(void *opaque)
352 {
353 CreateCo *cco = opaque;
354 assert(cco->drv);
355
356 cco->ret = cco->drv->bdrv_create(cco->filename, cco->options);
357 }
358
359 int bdrv_create(BlockDriver *drv, const char* filename,
360 QEMUOptionParameter *options)
361 {
362 int ret;
363
364 Coroutine *co;
365 CreateCo cco = {
366 .drv = drv,
367 .filename = g_strdup(filename),
368 .options = options,
369 .ret = NOT_DONE,
370 };
371
372 if (!drv->bdrv_create) {
373 return -ENOTSUP;
374 }
375
376 if (qemu_in_coroutine()) {
377 /* Fast-path if already in coroutine context */
378 bdrv_create_co_entry(&cco);
379 } else {
380 co = qemu_coroutine_create(bdrv_create_co_entry);
381 qemu_coroutine_enter(co, &cco);
382 while (cco.ret == NOT_DONE) {
383 qemu_aio_wait();
384 }
385 }
386
387 ret = cco.ret;
388 g_free(cco.filename);
389
390 return ret;
391 }
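/* A minimal sketch of a caller (hypothetical size and file name), using the
 * same option helpers that the snapshot path in bdrv_open() uses:
 *
 *     BlockDriver *drv = bdrv_find_format("qcow2");
 *     QEMUOptionParameter *opts =
 *         parse_option_parameters("", drv->create_options, NULL);
 *     set_option_parameter_int(opts, BLOCK_OPT_SIZE, 64 * 1024 * 1024);
 *     int ret = bdrv_create(drv, "/tmp/test.qcow2", opts);
 *     free_option_parameters(opts);
 */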
392
393 int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
394 {
395 BlockDriver *drv;
396
397 drv = bdrv_find_protocol(filename);
398 if (drv == NULL) {
399 return -ENOENT;
400 }
401
402 return bdrv_create(drv, filename, options);
403 }
404
405 #ifdef _WIN32
406 void get_tmp_filename(char *filename, int size)
407 {
408 char temp_dir[MAX_PATH];
409
410 GetTempPath(MAX_PATH, temp_dir);
411 GetTempFileName(temp_dir, "qem", 0, filename);
412 }
413 #else
414 void get_tmp_filename(char *filename, int size)
415 {
416 int fd;
417 const char *tmpdir;
418 /* XXX: race condition possible */
419 tmpdir = getenv("TMPDIR");
420 if (!tmpdir)
421 tmpdir = "/tmp";
422 snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);
423 fd = mkstemp(filename);
424 close(fd);
425 }
426 #endif
427
428 /*
429 * Detect host devices. By convention, /dev/cdrom[N] is always
430 * recognized as a host CDROM.
431 */
432 static BlockDriver *find_hdev_driver(const char *filename)
433 {
434 int score_max = 0, score;
435 BlockDriver *drv = NULL, *d;
436
437 QLIST_FOREACH(d, &bdrv_drivers, list) {
438 if (d->bdrv_probe_device) {
439 score = d->bdrv_probe_device(filename);
440 if (score > score_max) {
441 score_max = score;
442 drv = d;
443 }
444 }
445 }
446
447 return drv;
448 }
449
450 BlockDriver *bdrv_find_protocol(const char *filename)
451 {
452 BlockDriver *drv1;
453 char protocol[128];
454 int len;
455 const char *p;
456
457 /* TODO Drivers without bdrv_file_open must be specified explicitly */
458
459 /*
460 * XXX(hch): we really should not let host device detection
461 * override an explicit protocol specification, but moving this
462 * later breaks access to device names with colons in them.
463 * Thanks to the brain-dead persistent naming schemes on udev-
464 * based Linux systems those actually are quite common.
465 */
466 drv1 = find_hdev_driver(filename);
467 if (drv1) {
468 return drv1;
469 }
470
471 if (!path_has_protocol(filename)) {
472 return bdrv_find_format("file");
473 }
474 p = strchr(filename, ':');
475 assert(p != NULL);
476 len = p - filename;
477 if (len > sizeof(protocol) - 1)
478 len = sizeof(protocol) - 1;
479 memcpy(protocol, filename, len);
480 protocol[len] = '\0';
481 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
482 if (drv1->protocol_name &&
483 !strcmp(drv1->protocol_name, protocol)) {
484 return drv1;
485 }
486 }
487 return NULL;
488 }
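/* Illustrative lookups (hypothetical names): "nbd:localhost:10809" selects
 * the driver whose protocol_name is "nbd" (if registered), while a plain
 * path such as "/tmp/disk.img" has no protocol prefix and falls back to
 * the "file" driver. */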
489
490 static int find_image_format(const char *filename, BlockDriver **pdrv)
491 {
492 int ret, score, score_max;
493 BlockDriver *drv1, *drv;
494 uint8_t buf[2048];
495 BlockDriverState *bs;
496
497 ret = bdrv_file_open(&bs, filename, 0);
498 if (ret < 0) {
499 *pdrv = NULL;
500 return ret;
501 }
502
503 /* Return the raw BlockDriver * for scsi-generic devices or empty drives */
504 if (bs->sg || !bdrv_is_inserted(bs)) {
505 bdrv_delete(bs);
506 drv = bdrv_find_format("raw");
507 if (!drv) {
508 ret = -ENOENT;
509 }
510 *pdrv = drv;
511 return ret;
512 }
513
514 ret = bdrv_pread(bs, 0, buf, sizeof(buf));
515 bdrv_delete(bs);
516 if (ret < 0) {
517 *pdrv = NULL;
518 return ret;
519 }
520
521 score_max = 0;
522 drv = NULL;
523 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
524 if (drv1->bdrv_probe) {
525 score = drv1->bdrv_probe(buf, ret, filename);
526 if (score > score_max) {
527 score_max = score;
528 drv = drv1;
529 }
530 }
531 }
532 if (!drv) {
533 ret = -ENOENT;
534 }
535 *pdrv = drv;
536 return ret;
537 }
538
539 /**
540 * Set the current 'total_sectors' value
541 */
542 static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
543 {
544 BlockDriver *drv = bs->drv;
545
546 /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
547 if (bs->sg)
548 return 0;
549
550 /* query actual device if possible, otherwise just trust the hint */
551 if (drv->bdrv_getlength) {
552 int64_t length = drv->bdrv_getlength(bs);
553 if (length < 0) {
554 return length;
555 }
556 hint = length >> BDRV_SECTOR_BITS;
557 }
558
559 bs->total_sectors = hint;
560 return 0;
561 }
562
563 /**
564 * Set open flags for a given cache mode
565 *
566 * Return 0 on success, -1 if the cache mode was invalid.
567 */
568 int bdrv_parse_cache_flags(const char *mode, int *flags)
569 {
570 *flags &= ~BDRV_O_CACHE_MASK;
571
572 if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
573 *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
574 } else if (!strcmp(mode, "directsync")) {
575 *flags |= BDRV_O_NOCACHE;
576 } else if (!strcmp(mode, "writeback")) {
577 *flags |= BDRV_O_CACHE_WB;
578 } else if (!strcmp(mode, "unsafe")) {
579 *flags |= BDRV_O_CACHE_WB;
580 *flags |= BDRV_O_NO_FLUSH;
581 } else if (!strcmp(mode, "writethrough")) {
582 /* this is the default */
583 } else {
584 return -1;
585 }
586
587 return 0;
588 }
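/* For example (a minimal sketch): parsing the "none" cache mode yields
 * direct I/O with a writeback cache model:
 *
 *     int flags = 0;
 *     if (bdrv_parse_cache_flags("none", &flags) == 0) {
 *         assert(flags == (BDRV_O_NOCACHE | BDRV_O_CACHE_WB));
 *     }
 */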
589
590 /**
591 * The copy-on-read flag is actually a reference count so multiple users may
592 * use the feature without worrying about clobbering its previous state.
593 * Copy-on-read stays enabled until all users have called to disable it.
594 */
595 void bdrv_enable_copy_on_read(BlockDriverState *bs)
596 {
597 bs->copy_on_read++;
598 }
599
600 void bdrv_disable_copy_on_read(BlockDriverState *bs)
601 {
602 assert(bs->copy_on_read > 0);
603 bs->copy_on_read--;
604 }
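/* Illustrative pairing of the reference count, e.g. by a temporary user
 * such as a block job:
 *
 *     bdrv_enable_copy_on_read(bs);
 *     ... issue the reads that should populate the image ...
 *     bdrv_disable_copy_on_read(bs);
 */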
605
606 /*
607 * Common part for opening disk images and files
608 */
609 static int bdrv_open_common(BlockDriverState *bs, const char *filename,
610 int flags, BlockDriver *drv)
611 {
612 int ret, open_flags;
613
614 assert(drv != NULL);
615
616 trace_bdrv_open_common(bs, filename, flags, drv->format_name);
617
618 bs->file = NULL;
619 bs->total_sectors = 0;
620 bs->encrypted = 0;
621 bs->valid_key = 0;
622 bs->sg = 0;
623 bs->open_flags = flags;
624 bs->growable = 0;
625 bs->buffer_alignment = 512;
626
627 assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
628 if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
629 bdrv_enable_copy_on_read(bs);
630 }
631
632 pstrcpy(bs->filename, sizeof(bs->filename), filename);
633 bs->backing_file[0] = '\0';
634
635 if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
636 return -ENOTSUP;
637 }
638
639 bs->drv = drv;
640 bs->opaque = g_malloc0(drv->instance_size);
641
642 bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
643
644 /*
645 * Clear flags that are internal to the block layer before opening the
646 * image.
647 */
648 open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
649
650 /*
651 * Snapshots should be writable.
652 */
653 if (bs->is_temporary) {
654 open_flags |= BDRV_O_RDWR;
655 }
656
657 bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);
658
659 /* Open the image, either directly or using a protocol */
660 if (drv->bdrv_file_open) {
661 ret = drv->bdrv_file_open(bs, filename, open_flags);
662 } else {
663 ret = bdrv_file_open(&bs->file, filename, open_flags);
664 if (ret >= 0) {
665 ret = drv->bdrv_open(bs, open_flags);
666 }
667 }
668
669 if (ret < 0) {
670 goto free_and_fail;
671 }
672
673 ret = refresh_total_sectors(bs, bs->total_sectors);
674 if (ret < 0) {
675 goto free_and_fail;
676 }
677
678 #ifndef _WIN32
679 if (bs->is_temporary) {
680 unlink(filename);
681 }
682 #endif
683 return 0;
684
685 free_and_fail:
686 if (bs->file) {
687 bdrv_delete(bs->file);
688 bs->file = NULL;
689 }
690 g_free(bs->opaque);
691 bs->opaque = NULL;
692 bs->drv = NULL;
693 return ret;
694 }
695
696 /*
697 * Opens a file using a protocol (file, host_device, nbd, ...)
698 */
699 int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
700 {
701 BlockDriverState *bs;
702 BlockDriver *drv;
703 int ret;
704
705 drv = bdrv_find_protocol(filename);
706 if (!drv) {
707 return -ENOENT;
708 }
709
710 bs = bdrv_new("");
711 ret = bdrv_open_common(bs, filename, flags, drv);
712 if (ret < 0) {
713 bdrv_delete(bs);
714 return ret;
715 }
716 bs->growable = 1;
717 *pbs = bs;
718 return 0;
719 }
720
721 /*
722 * Opens a disk image (raw, qcow2, vmdk, ...)
723 */
724 int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
725 BlockDriver *drv)
726 {
727 int ret;
728 char tmp_filename[PATH_MAX];
729
730 if (flags & BDRV_O_SNAPSHOT) {
731 BlockDriverState *bs1;
732 int64_t total_size;
733 int is_protocol = 0;
734 BlockDriver *bdrv_qcow2;
735 QEMUOptionParameter *options;
736 char backing_filename[PATH_MAX];
737
738 /* if snapshot, we create a temporary backing file and open it
739 instead of opening 'filename' directly */
740
741 /* if there is a backing file, use it */
742 bs1 = bdrv_new("");
743 ret = bdrv_open(bs1, filename, 0, drv);
744 if (ret < 0) {
745 bdrv_delete(bs1);
746 return ret;
747 }
748 total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
749
750 if (bs1->drv && bs1->drv->protocol_name)
751 is_protocol = 1;
752
753 bdrv_delete(bs1);
754
755 get_tmp_filename(tmp_filename, sizeof(tmp_filename));
756
757 /* Real path is meaningless for protocols */
758 if (is_protocol)
759 snprintf(backing_filename, sizeof(backing_filename),
760 "%s", filename);
761 else if (!realpath(filename, backing_filename))
762 return -errno;
763
764 bdrv_qcow2 = bdrv_find_format("qcow2");
765 options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);
766
767 set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
768 set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
769 if (drv) {
770 set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
771 drv->format_name);
772 }
773
774 ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
775 free_option_parameters(options);
776 if (ret < 0) {
777 return ret;
778 }
779
780 filename = tmp_filename;
781 drv = bdrv_qcow2;
782 bs->is_temporary = 1;
783 }
784
785 /* Find the right image format driver */
786 if (!drv) {
787 ret = find_image_format(filename, &drv);
788 }
789
790 if (!drv) {
791 goto unlink_and_fail;
792 }
793
794 /* Open the image */
795 ret = bdrv_open_common(bs, filename, flags, drv);
796 if (ret < 0) {
797 goto unlink_and_fail;
798 }
799
800 /* If there is a backing file, use it */
801 if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
802 char backing_filename[PATH_MAX];
803 int back_flags;
804 BlockDriver *back_drv = NULL;
805
806 bs->backing_hd = bdrv_new("");
807
808 if (path_has_protocol(bs->backing_file)) {
809 pstrcpy(backing_filename, sizeof(backing_filename),
810 bs->backing_file);
811 } else {
812 path_combine(backing_filename, sizeof(backing_filename),
813 filename, bs->backing_file);
814 }
815
816 if (bs->backing_format[0] != '\0') {
817 back_drv = bdrv_find_format(bs->backing_format);
818 }
819
820 /* backing files are always opened read-only */
821 back_flags =
822 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
823
824 ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
825 if (ret < 0) {
826 bdrv_close(bs);
827 return ret;
828 }
829 if (bs->is_temporary) {
830 bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
831 } else {
832 /* base image inherits from "parent" */
833 bs->backing_hd->keep_read_only = bs->keep_read_only;
834 }
835 }
836
837 if (!bdrv_key_required(bs)) {
838 bdrv_dev_change_media_cb(bs, true);
839 }
840
841 /* throttling disk I/O limits */
842 if (bs->io_limits_enabled) {
843 bdrv_io_limits_enable(bs);
844 }
845
846 return 0;
847
848 unlink_and_fail:
849 if (bs->is_temporary) {
850 unlink(filename);
851 }
852 return ret;
853 }
854
855 void bdrv_close(BlockDriverState *bs)
856 {
857 bdrv_flush(bs);
858 if (bs->drv) {
859 if (bs->job) {
860 block_job_cancel_sync(bs->job);
861 }
862 bdrv_drain_all();
863
864 if (bs == bs_snapshots) {
865 bs_snapshots = NULL;
866 }
867 if (bs->backing_hd) {
868 bdrv_delete(bs->backing_hd);
869 bs->backing_hd = NULL;
870 }
871 bs->drv->bdrv_close(bs);
872 g_free(bs->opaque);
873 #ifdef _WIN32
874 if (bs->is_temporary) {
875 unlink(bs->filename);
876 }
877 #endif
878 bs->opaque = NULL;
879 bs->drv = NULL;
880 bs->copy_on_read = 0;
881 bs->backing_file[0] = '\0';
882 bs->backing_format[0] = '\0';
883
884 if (bs->file != NULL) {
885 bdrv_close(bs->file);
886 }
887
888 bdrv_dev_change_media_cb(bs, false);
889 }
890
891 /* throttling disk I/O limits */
892 if (bs->io_limits_enabled) {
893 bdrv_io_limits_disable(bs);
894 }
895 }
896
897 void bdrv_close_all(void)
898 {
899 BlockDriverState *bs;
900
901 QTAILQ_FOREACH(bs, &bdrv_states, list) {
902 bdrv_close(bs);
903 }
904 }
905
906 /*
907 * Wait for pending requests to complete across all BlockDriverStates
908 *
909 * This function does not flush data to disk, use bdrv_flush_all() for that
910 * after calling this function.
911 *
912 * Note that completion of an asynchronous I/O operation can trigger any
913 * number of other I/O operations on other devices---for example a coroutine
914 * can be arbitrarily complex, and a constant flow of I/O may keep arriving
915 * until the coroutine completes. Because of this, it is not possible to
916 * provide a function that drains a single device's I/O queue.
917 */
918 void bdrv_drain_all(void)
919 {
920 BlockDriverState *bs;
921 bool busy;
922
923 do {
924 busy = qemu_aio_wait();
925
926 /* FIXME: We do not have timer support here, so this is effectively
927 * a busy wait.
928 */
929 QTAILQ_FOREACH(bs, &bdrv_states, list) {
930 if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
931 qemu_co_queue_restart_all(&bs->throttled_reqs);
932 busy = true;
933 }
934 }
935 } while (busy);
936
937 /* If requests are still pending there is a bug somewhere */
938 QTAILQ_FOREACH(bs, &bdrv_states, list) {
939 assert(QLIST_EMPTY(&bs->tracked_requests));
940 assert(qemu_co_queue_empty(&bs->throttled_reqs));
941 }
942 }
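/* Typical use, per the comment above (a sketch): quiesce all in-flight
 * requests and only then flush data to disk:
 *
 *     bdrv_drain_all();
 *     bdrv_flush_all();
 */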
943
944 /* make a BlockDriverState anonymous by removing it from the bdrv_states
945 list. Also, empty device_name to prevent a double remove */
946 void bdrv_make_anon(BlockDriverState *bs)
947 {
948 if (bs->device_name[0] != '\0') {
949 QTAILQ_REMOVE(&bdrv_states, bs, list);
950 }
951 bs->device_name[0] = '\0';
952 }
953
954 static void bdrv_rebind(BlockDriverState *bs)
955 {
956 if (bs->drv && bs->drv->bdrv_rebind) {
957 bs->drv->bdrv_rebind(bs);
958 }
959 }
960
961 /*
962 * Add new bs contents at the top of an image chain while the chain is
963 * live, keeping the required fields on the top layer.
964 *
965 * This will modify the BlockDriverState fields, and swap contents
966 * between bs_new and bs_top. Both bs_new and bs_top are modified.
967 *
968 * bs_new is required to be anonymous.
969 *
970 * This function does not create any image files.
971 */
972 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
973 {
974 BlockDriverState tmp;
975
976 /* bs_new must be anonymous */
977 assert(bs_new->device_name[0] == '\0');
978
979 tmp = *bs_new;
980
981 /* there are some fields that need to stay on the top layer: */
982 tmp.open_flags = bs_top->open_flags;
983
984 /* dev info */
985 tmp.dev_ops = bs_top->dev_ops;
986 tmp.dev_opaque = bs_top->dev_opaque;
987 tmp.dev = bs_top->dev;
988 tmp.buffer_alignment = bs_top->buffer_alignment;
989 tmp.copy_on_read = bs_top->copy_on_read;
990
991 /* i/o timing parameters */
992 tmp.slice_time = bs_top->slice_time;
993 tmp.slice_start = bs_top->slice_start;
994 tmp.slice_end = bs_top->slice_end;
995 tmp.io_limits = bs_top->io_limits;
996 tmp.io_base = bs_top->io_base;
997 tmp.throttled_reqs = bs_top->throttled_reqs;
998 tmp.block_timer = bs_top->block_timer;
999 tmp.io_limits_enabled = bs_top->io_limits_enabled;
1000
1001 /* geometry */
1002 tmp.cyls = bs_top->cyls;
1003 tmp.heads = bs_top->heads;
1004 tmp.secs = bs_top->secs;
1005 tmp.translation = bs_top->translation;
1006
1007 /* r/w error */
1008 tmp.on_read_error = bs_top->on_read_error;
1009 tmp.on_write_error = bs_top->on_write_error;
1010
1011 /* i/o status */
1012 tmp.iostatus_enabled = bs_top->iostatus_enabled;
1013 tmp.iostatus = bs_top->iostatus;
1014
1015 /* keep the same entry in bdrv_states */
1016 pstrcpy(tmp.device_name, sizeof(tmp.device_name), bs_top->device_name);
1017 tmp.list = bs_top->list;
1018
1019 /* The contents of 'tmp' will become bs_top, as we are
1020 * swapping bs_new and bs_top contents. */
1021 tmp.backing_hd = bs_new;
1022 pstrcpy(tmp.backing_file, sizeof(tmp.backing_file), bs_top->filename);
1023 bdrv_get_format(bs_top, tmp.backing_format, sizeof(tmp.backing_format));
1024
1025 /* swap contents of the fixed new bs and the current top */
1026 *bs_new = *bs_top;
1027 *bs_top = tmp;
1028
1029 /* device_name[] was carried over from the old bs_top. bs_new
1030 * shouldn't be in bdrv_states, so we need to make device_name[]
1031 * reflect the anonymity of bs_new
1032 */
1033 bs_new->device_name[0] = '\0';
1034
1035 /* clear the copied fields in the new backing file */
1036 bdrv_detach_dev(bs_new, bs_new->dev);
1037
1038 qemu_co_queue_init(&bs_new->throttled_reqs);
1039 memset(&bs_new->io_base, 0, sizeof(bs_new->io_base));
1040 memset(&bs_new->io_limits, 0, sizeof(bs_new->io_limits));
1041 bdrv_iostatus_disable(bs_new);
1042
1043 /* we don't use bdrv_io_limits_disable() for this, because we don't want
1044 * to affect or delete the block_timer, as it has been moved to bs_top */
1045 bs_new->io_limits_enabled = false;
1046 bs_new->block_timer = NULL;
1047 bs_new->slice_time = 0;
1048 bs_new->slice_start = 0;
1049 bs_new->slice_end = 0;
1050
1051 bdrv_rebind(bs_new);
1052 bdrv_rebind(bs_top);
1053 }
1054
1055 void bdrv_delete(BlockDriverState *bs)
1056 {
1057 assert(!bs->dev);
1058 assert(!bs->job);
1059 assert(!bs->in_use);
1060
1061 /* remove from list, if necessary */
1062 bdrv_make_anon(bs);
1063
1064 bdrv_close(bs);
1065 if (bs->file != NULL) {
1066 bdrv_delete(bs->file);
1067 }
1068
1069 assert(bs != bs_snapshots);
1070 g_free(bs);
1071 }
1072
1073 int bdrv_attach_dev(BlockDriverState *bs, void *dev)
1074 /* TODO change to DeviceState *dev when all users are qdevified */
1075 {
1076 if (bs->dev) {
1077 return -EBUSY;
1078 }
1079 bs->dev = dev;
1080 bdrv_iostatus_reset(bs);
1081 return 0;
1082 }
1083
1084 /* TODO qdevified devices don't use this, remove when devices are qdevified */
1085 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
1086 {
1087 if (bdrv_attach_dev(bs, dev) < 0) {
1088 abort();
1089 }
1090 }
1091
1092 void bdrv_detach_dev(BlockDriverState *bs, void *dev)
1093 /* TODO change to DeviceState *dev when all users are qdevified */
1094 {
1095 assert(bs->dev == dev);
1096 bs->dev = NULL;
1097 bs->dev_ops = NULL;
1098 bs->dev_opaque = NULL;
1099 bs->buffer_alignment = 512;
1100 }
1101
1102 /* TODO change to return DeviceState * when all users are qdevified */
1103 void *bdrv_get_attached_dev(BlockDriverState *bs)
1104 {
1105 return bs->dev;
1106 }
1107
1108 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
1109 void *opaque)
1110 {
1111 bs->dev_ops = ops;
1112 bs->dev_opaque = opaque;
1113 if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
1114 bs_snapshots = NULL;
1115 }
1116 }
1117
1118 void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
1119 BlockQMPEventAction action, int is_read)
1120 {
1121 QObject *data;
1122 const char *action_str;
1123
1124 switch (action) {
1125 case BDRV_ACTION_REPORT:
1126 action_str = "report";
1127 break;
1128 case BDRV_ACTION_IGNORE:
1129 action_str = "ignore";
1130 break;
1131 case BDRV_ACTION_STOP:
1132 action_str = "stop";
1133 break;
1134 default:
1135 abort();
1136 }
1137
1138 data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1139 bdrv->device_name,
1140 action_str,
1141 is_read ? "read" : "write");
1142 monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);
1143
1144 qobject_decref(data);
1145 }
1146
1147 static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
1148 {
1149 QObject *data;
1150
1151 data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
1152 bdrv_get_device_name(bs), ejected);
1153 monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
1154
1155 qobject_decref(data);
1156 }
1157
1158 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
1159 {
1160 if (bs->dev_ops && bs->dev_ops->change_media_cb) {
1161 bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
1162 bs->dev_ops->change_media_cb(bs->dev_opaque, load);
1163 if (tray_was_closed) {
1164 /* tray open */
1165 bdrv_emit_qmp_eject_event(bs, true);
1166 }
1167 if (load) {
1168 /* tray close */
1169 bdrv_emit_qmp_eject_event(bs, false);
1170 }
1171 }
1172 }
1173
1174 bool bdrv_dev_has_removable_media(BlockDriverState *bs)
1175 {
1176 return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
1177 }
1178
1179 void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
1180 {
1181 if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
1182 bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
1183 }
1184 }
1185
1186 bool bdrv_dev_is_tray_open(BlockDriverState *bs)
1187 {
1188 if (bs->dev_ops && bs->dev_ops->is_tray_open) {
1189 return bs->dev_ops->is_tray_open(bs->dev_opaque);
1190 }
1191 return false;
1192 }
1193
1194 static void bdrv_dev_resize_cb(BlockDriverState *bs)
1195 {
1196 if (bs->dev_ops && bs->dev_ops->resize_cb) {
1197 bs->dev_ops->resize_cb(bs->dev_opaque);
1198 }
1199 }
1200
1201 bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
1202 {
1203 if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
1204 return bs->dev_ops->is_medium_locked(bs->dev_opaque);
1205 }
1206 return false;
1207 }
1208
1209 /*
1210 * Run consistency checks on an image
1211 *
1212 * Returns 0 if the check could be completed (which does not mean that the
1213 * image is free of errors) or -errno when an internal error occurred. The
1214 * results of the check are stored in res.
1215 */
1216 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res)
1217 {
1218 if (bs->drv->bdrv_check == NULL) {
1219 return -ENOTSUP;
1220 }
1221
1222 memset(res, 0, sizeof(*res));
1223 return bs->drv->bdrv_check(bs, res);
1224 }
1225
1226 #define COMMIT_BUF_SECTORS 2048
1227
1228 /* commit COW file into the raw image */
1229 int bdrv_commit(BlockDriverState *bs)
1230 {
1231 BlockDriver *drv = bs->drv;
1232 BlockDriver *backing_drv;
1233 int64_t sector, total_sectors;
1234 int n, ro, open_flags;
1235 int ret = 0, rw_ret = 0;
1236 uint8_t *buf;
1237 char filename[1024];
1238 BlockDriverState *bs_rw, *bs_ro;
1239
1240 if (!drv)
1241 return -ENOMEDIUM;
1242
1243 if (!bs->backing_hd) {
1244 return -ENOTSUP;
1245 }
1246
1247 if (bs->backing_hd->keep_read_only) {
1248 return -EACCES;
1249 }
1250
1251 if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
1252 return -EBUSY;
1253 }
1254
1255 backing_drv = bs->backing_hd->drv;
1256 ro = bs->backing_hd->read_only;
1257 strncpy(filename, bs->backing_hd->filename, sizeof(filename));
1258 open_flags = bs->backing_hd->open_flags;
1259
1260 if (ro) {
1261 /* re-open as RW */
1262 bdrv_delete(bs->backing_hd);
1263 bs->backing_hd = NULL;
1264 bs_rw = bdrv_new("");
1265 rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
1266 backing_drv);
1267 if (rw_ret < 0) {
1268 bdrv_delete(bs_rw);
1269 /* try to re-open read-only */
1270 bs_ro = bdrv_new("");
1271 ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1272 backing_drv);
1273 if (ret < 0) {
1274 bdrv_delete(bs_ro);
1275 /* drive not functional anymore */
1276 bs->drv = NULL;
1277 return ret;
1278 }
1279 bs->backing_hd = bs_ro;
1280 return rw_ret;
1281 }
1282 bs->backing_hd = bs_rw;
1283 }
1284
1285 total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
1286 buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
1287
1288 for (sector = 0; sector < total_sectors; sector += n) {
1289 if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
1290
1291 if (bdrv_read(bs, sector, buf, n) != 0) {
1292 ret = -EIO;
1293 goto ro_cleanup;
1294 }
1295
1296 if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
1297 ret = -EIO;
1298 goto ro_cleanup;
1299 }
1300 }
1301 }
1302
1303 if (drv->bdrv_make_empty) {
1304 ret = drv->bdrv_make_empty(bs);
1305 bdrv_flush(bs);
1306 }
1307
1308 /*
1309 * Make sure all data we wrote to the backing device is actually
1310 * stable on disk.
1311 */
1312 if (bs->backing_hd)
1313 bdrv_flush(bs->backing_hd);
1314
1315 ro_cleanup:
1316 g_free(buf);
1317
1318 if (ro) {
1319 /* re-open as RO */
1320 bdrv_delete(bs->backing_hd);
1321 bs->backing_hd = NULL;
1322 bs_ro = bdrv_new("");
1323 ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1324 backing_drv);
1325 if (ret < 0) {
1326 bdrv_delete(bs_ro);
1327 /* drive not functional anymore */
1328 bs->drv = NULL;
1329 return ret;
1330 }
1331 bs->backing_hd = bs_ro;
1332 bs->backing_hd->keep_read_only = 0;
1333 }
1334
1335 return ret;
1336 }
1337
1338 int bdrv_commit_all(void)
1339 {
1340 BlockDriverState *bs;
1341
1342 QTAILQ_FOREACH(bs, &bdrv_states, list) {
1343 int ret = bdrv_commit(bs);
1344 if (ret < 0) {
1345 return ret;
1346 }
1347 }
1348 return 0;
1349 }
1350
1351 struct BdrvTrackedRequest {
1352 BlockDriverState *bs;
1353 int64_t sector_num;
1354 int nb_sectors;
1355 bool is_write;
1356 QLIST_ENTRY(BdrvTrackedRequest) list;
1357 Coroutine *co; /* owner, used for deadlock detection */
1358 CoQueue wait_queue; /* coroutines blocked on this request */
1359 };
1360
1361 /**
1362 * Remove an active request from the tracked requests list
1363 *
1364 * This function should be called when a tracked request is completing.
1365 */
1366 static void tracked_request_end(BdrvTrackedRequest *req)
1367 {
1368 QLIST_REMOVE(req, list);
1369 qemu_co_queue_restart_all(&req->wait_queue);
1370 }
1371
1372 /**
1373 * Add an active request to the tracked requests list
1374 */
1375 static void tracked_request_begin(BdrvTrackedRequest *req,
1376 BlockDriverState *bs,
1377 int64_t sector_num,
1378 int nb_sectors, bool is_write)
1379 {
1380 *req = (BdrvTrackedRequest){
1381 .bs = bs,
1382 .sector_num = sector_num,
1383 .nb_sectors = nb_sectors,
1384 .is_write = is_write,
1385 .co = qemu_coroutine_self(),
1386 };
1387
1388 qemu_co_queue_init(&req->wait_queue);
1389
1390 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
1391 }
1392
1393 /**
1394 * Round a region to cluster boundaries
1395 */
1396 static void round_to_clusters(BlockDriverState *bs,
1397 int64_t sector_num, int nb_sectors,
1398 int64_t *cluster_sector_num,
1399 int *cluster_nb_sectors)
1400 {
1401 BlockDriverInfo bdi;
1402
1403 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
1404 *cluster_sector_num = sector_num;
1405 *cluster_nb_sectors = nb_sectors;
1406 } else {
1407 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
1408 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
1409 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
1410 nb_sectors, c);
1411 }
1412 }
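/* Worked example: with a 64 KiB cluster size, c = 65536 / 512 = 128, so a
 * request at sector_num = 130 with nb_sectors = 10 is rounded to
 * cluster_sector_num = 128 and cluster_nb_sectors = 128, i.e. exactly the
 * cluster [128, 256). */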
1413
1414 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
1415 int64_t sector_num, int nb_sectors) {
1416 /* aaaa bbbb */
1417 if (sector_num >= req->sector_num + req->nb_sectors) {
1418 return false;
1419 }
1420 /* bbbb aaaa */
1421 if (req->sector_num >= sector_num + nb_sectors) {
1422 return false;
1423 }
1424 return true;
1425 }
1426
1427 static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
1428 int64_t sector_num, int nb_sectors)
1429 {
1430 BdrvTrackedRequest *req;
1431 int64_t cluster_sector_num;
1432 int cluster_nb_sectors;
1433 bool retry;
1434
1435 /* If we touch the same cluster it counts as an overlap. This guarantees
1436 * that allocating writes will be serialized and not race with each other
1437 * for the same cluster. For example, in copy-on-read it ensures that the
1438 * CoR read and write operations are atomic and guest writes cannot
1439 * interleave between them.
1440 */
1441 round_to_clusters(bs, sector_num, nb_sectors,
1442 &cluster_sector_num, &cluster_nb_sectors);
1443
1444 do {
1445 retry = false;
1446 QLIST_FOREACH(req, &bs->tracked_requests, list) {
1447 if (tracked_request_overlaps(req, cluster_sector_num,
1448 cluster_nb_sectors)) {
1449 /* Hitting this means there was a reentrant request, for
1450 * example, a block driver issuing nested requests. This must
1451 * never happen since it means deadlock.
1452 */
1453 assert(qemu_coroutine_self() != req->co);
1454
1455 qemu_co_queue_wait(&req->wait_queue);
1456 retry = true;
1457 break;
1458 }
1459 }
1460 } while (retry);
1461 }
1462
1463 /*
1464 * Return values:
1465 * 0 - success
1466 * -EINVAL - backing format specified, but no file
1467 * -ENOSPC - can't update the backing file because no space is left in the
1468 * image file header
1469 * -ENOTSUP - format driver doesn't support changing the backing file
1470 */
1471 int bdrv_change_backing_file(BlockDriverState *bs,
1472 const char *backing_file, const char *backing_fmt)
1473 {
1474 BlockDriver *drv = bs->drv;
1475 int ret;
1476
1477 /* Backing file format doesn't make sense without a backing file */
1478 if (backing_fmt && !backing_file) {
1479 return -EINVAL;
1480 }
1481
1482 if (drv->bdrv_change_backing_file != NULL) {
1483 ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
1484 } else {
1485 ret = -ENOTSUP;
1486 }
1487
1488 if (ret == 0) {
1489 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
1490 pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
1491 }
1492 return ret;
1493 }
1494
1495 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
1496 size_t size)
1497 {
1498 int64_t len;
1499
1500 if (!bdrv_is_inserted(bs))
1501 return -ENOMEDIUM;
1502
1503 if (bs->growable)
1504 return 0;
1505
1506 len = bdrv_getlength(bs);
1507
1508 if (offset < 0)
1509 return -EIO;
1510
1511 if ((offset > len) || (len - offset < size))
1512 return -EIO;
1513
1514 return 0;
1515 }
1516
1517 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
1518 int nb_sectors)
1519 {
1520 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
1521 nb_sectors * BDRV_SECTOR_SIZE);
1522 }
1523
1524 typedef struct RwCo {
1525 BlockDriverState *bs;
1526 int64_t sector_num;
1527 int nb_sectors;
1528 QEMUIOVector *qiov;
1529 bool is_write;
1530 int ret;
1531 } RwCo;
1532
1533 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
1534 {
1535 RwCo *rwco = opaque;
1536
1537 if (!rwco->is_write) {
1538 rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
1539 rwco->nb_sectors, rwco->qiov, 0);
1540 } else {
1541 rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
1542 rwco->nb_sectors, rwco->qiov, 0);
1543 }
1544 }
1545
1546 /*
1547 * Process a synchronous request using coroutines
1548 */
1549 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
1550 int nb_sectors, bool is_write)
1551 {
1552 QEMUIOVector qiov;
1553 struct iovec iov = {
1554 .iov_base = (void *)buf,
1555 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
1556 };
1557 Coroutine *co;
1558 RwCo rwco = {
1559 .bs = bs,
1560 .sector_num = sector_num,
1561 .nb_sectors = nb_sectors,
1562 .qiov = &qiov,
1563 .is_write = is_write,
1564 .ret = NOT_DONE,
1565 };
1566
1567 qemu_iovec_init_external(&qiov, &iov, 1);
1568
1569 /**
1570 * In a synchronous call context the vcpu is blocked, so this throttling
1571 * timer will never fire; therefore I/O throttling has to be disabled
1572 * here if it has been enabled.
1573 */
1574 if (bs->io_limits_enabled) {
1575 fprintf(stderr, "Disabling I/O throttling on '%s' due "
1576 "to synchronous I/O.\n", bdrv_get_device_name(bs));
1577 bdrv_io_limits_disable(bs);
1578 }
1579
1580 if (qemu_in_coroutine()) {
1581 /* Fast-path if already in coroutine context */
1582 bdrv_rw_co_entry(&rwco);
1583 } else {
1584 co = qemu_coroutine_create(bdrv_rw_co_entry);
1585 qemu_coroutine_enter(co, &rwco);
1586 while (rwco.ret == NOT_DONE) {
1587 qemu_aio_wait();
1588 }
1589 }
1590 return rwco.ret;
1591 }
1592
1593 /* return < 0 if error. See bdrv_write() for the return codes */
1594 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
1595 uint8_t *buf, int nb_sectors)
1596 {
1597 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
1598 }
1599
1600 #define BITS_PER_LONG (sizeof(unsigned long) * 8)
1601
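/* Mark or clear a range of sectors in the dirty bitmap. The bitmap keeps
 * one bit per chunk of BDRV_SECTORS_PER_DIRTY_CHUNK sectors, packed into
 * unsigned longs; dirty_count tracks the number of dirty chunks. */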
1602 static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
1603 int nb_sectors, int dirty)
1604 {
1605 int64_t start, end;
1606 unsigned long val, idx, bit;
1607
1608 start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
1609 end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
1610
1611 for (; start <= end; start++) {
1612 idx = start / BITS_PER_LONG;
1613 bit = start % BITS_PER_LONG;
1614 val = bs->dirty_bitmap[idx];
1615 if (dirty) {
1616 if (!(val & (1UL << bit))) {
1617 bs->dirty_count++;
1618 val |= 1UL << bit;
1619 }
1620 } else {
1621 if (val & (1UL << bit)) {
1622 bs->dirty_count--;
1623 val &= ~(1UL << bit);
1624 }
1625 }
1626 bs->dirty_bitmap[idx] = val;
1627 }
1628 }
1629
1630 /* Return < 0 if error. Important errors are:
1631 -EIO generic I/O error (may happen for all errors)
1632 -ENOMEDIUM No media inserted.
1633 -EINVAL Invalid sector number or nb_sectors
1634 -EACCES Trying to write a read-only device
1635 */
1636 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
1637 const uint8_t *buf, int nb_sectors)
1638 {
1639 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
1640 }
1641
1642 int bdrv_pread(BlockDriverState *bs, int64_t offset,
1643 void *buf, int count1)
1644 {
1645 uint8_t tmp_buf[BDRV_SECTOR_SIZE];
1646 int len, nb_sectors, count;
1647 int64_t sector_num;
1648 int ret;
1649
1650 count = count1;
1651 /* first read to align to sector start */
1652 len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
1653 if (len > count)
1654 len = count;
1655 sector_num = offset >> BDRV_SECTOR_BITS;
1656 if (len > 0) {
1657 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1658 return ret;
1659 memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
1660 count -= len;
1661 if (count == 0)
1662 return count1;
1663 sector_num++;
1664 buf += len;
1665 }
1666
1667 /* read the sectors "in place" */
1668 nb_sectors = count >> BDRV_SECTOR_BITS;
1669 if (nb_sectors > 0) {
1670 if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
1671 return ret;
1672 sector_num += nb_sectors;
1673 len = nb_sectors << BDRV_SECTOR_BITS;
1674 buf += len;
1675 count -= len;
1676 }
1677
1678 /* add data from the last sector */
1679 if (count > 0) {
1680 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1681 return ret;
1682 memcpy(buf, tmp_buf, count);
1683 }
1684 return count1;
1685 }
1686
1687 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
1688 const void *buf, int count1)
1689 {
1690 uint8_t tmp_buf[BDRV_SECTOR_SIZE];
1691 int len, nb_sectors, count;
1692 int64_t sector_num;
1693 int ret;
1694
1695 count = count1;
1696 /* first write to align to sector start */
1697 len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
1698 if (len > count)
1699 len = count;
1700 sector_num = offset >> BDRV_SECTOR_BITS;
1701 if (len > 0) {
1702 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1703 return ret;
1704 memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
1705 if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1706 return ret;
1707 count -= len;
1708 if (count == 0)
1709 return count1;
1710 sector_num++;
1711 buf += len;
1712 }
1713
1714 /* write the sectors "in place" */
1715 nb_sectors = count >> BDRV_SECTOR_BITS;
1716 if (nb_sectors > 0) {
1717 if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
1718 return ret;
1719 sector_num += nb_sectors;
1720 len = nb_sectors << BDRV_SECTOR_BITS;
1721 buf += len;
1722 count -= len;
1723 }
1724
1725 /* add data from the last sector */
1726 if (count > 0) {
1727 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1728 return ret;
1729 memcpy(tmp_buf, buf, count);
1730 if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1731 return ret;
1732 }
1733 return count1;
1734 }
1735
1736 /*
1737 * Writes to the file and ensures that no writes are reordered across this
1738 * request (acts as a barrier)
1739 *
1740 * Returns 0 on success, -errno in error cases.
1741 */
1742 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
1743 const void *buf, int count)
1744 {
1745 int ret;
1746
1747 ret = bdrv_pwrite(bs, offset, buf, count);
1748 if (ret < 0) {
1749 return ret;
1750 }
1751
1752 /* No flush needed for cache modes that use O_DSYNC */
1753 if ((bs->open_flags & BDRV_O_CACHE_WB) != 0) {
1754 bdrv_flush(bs);
1755 }
1756
1757 return 0;
1758 }
1759
1760 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
1761 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1762 {
1763 /* Perform I/O through a temporary buffer so that users who scribble over
1764 * their read buffer while the operation is in progress do not end up
1765 * modifying the image file. This is critical for zero-copy guest I/O
1766 * where anything might happen inside guest memory.
1767 */
1768 void *bounce_buffer;
1769
1770 BlockDriver *drv = bs->drv;
1771 struct iovec iov;
1772 QEMUIOVector bounce_qiov;
1773 int64_t cluster_sector_num;
1774 int cluster_nb_sectors;
1775 size_t skip_bytes;
1776 int ret;
1777
1778 /* Cover the entire cluster so no additional backing file I/O is required
1779 * when allocating the cluster in the image file.
1780 */
1781 round_to_clusters(bs, sector_num, nb_sectors,
1782 &cluster_sector_num, &cluster_nb_sectors);
1783
1784 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
1785 cluster_sector_num, cluster_nb_sectors);
1786
1787 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
1788 iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
1789 qemu_iovec_init_external(&bounce_qiov, &iov, 1);
1790
1791 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
1792 &bounce_qiov);
1793 if (ret < 0) {
1794 goto err;
1795 }
1796
1797 if (drv->bdrv_co_write_zeroes &&
1798 buffer_is_zero(bounce_buffer, iov.iov_len)) {
1799 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
1800 cluster_nb_sectors);
1801 } else {
1802 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
1803 &bounce_qiov);
1804 }
1805
1806 if (ret < 0) {
1807 /* It might be okay to ignore write errors for guest requests. If this
1808 * is a deliberate copy-on-read then we don't want to ignore the error.
1809 * Simply report it in all cases.
1810 */
1811 goto err;
1812 }
1813
1814 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
1815 qemu_iovec_from_buffer(qiov, bounce_buffer + skip_bytes,
1816 nb_sectors * BDRV_SECTOR_SIZE);
1817
1818 err:
1819 qemu_vfree(bounce_buffer);
1820 return ret;
1821 }
1822
1823 /*
1824 * Handle a read request in coroutine context
1825 */
1826 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
1827 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1828 BdrvRequestFlags flags)
1829 {
1830 BlockDriver *drv = bs->drv;
1831 BdrvTrackedRequest req;
1832 int ret;
1833
1834 if (!drv) {
1835 return -ENOMEDIUM;
1836 }
1837 if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1838 return -EIO;
1839 }
1840
1841 /* throttling disk read I/O */
1842 if (bs->io_limits_enabled) {
1843 bdrv_io_limits_intercept(bs, false, nb_sectors);
1844 }
1845
1846 if (bs->copy_on_read) {
1847 flags |= BDRV_REQ_COPY_ON_READ;
1848 }
1849 if (flags & BDRV_REQ_COPY_ON_READ) {
1850 bs->copy_on_read_in_flight++;
1851 }
1852
1853 if (bs->copy_on_read_in_flight) {
1854 wait_for_overlapping_requests(bs, sector_num, nb_sectors);
1855 }
1856
1857 tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
1858
1859 if (flags & BDRV_REQ_COPY_ON_READ) {
1860 int pnum;
1861
1862 ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
1863 if (ret < 0) {
1864 goto out;
1865 }
1866
1867 if (!ret || pnum != nb_sectors) {
1868 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
1869 goto out;
1870 }
1871 }
1872
1873 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
1874
1875 out:
1876 tracked_request_end(&req);
1877
1878 if (flags & BDRV_REQ_COPY_ON_READ) {
1879 bs->copy_on_read_in_flight--;
1880 }
1881
1882 return ret;
1883 }
1884
1885 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
1886 int nb_sectors, QEMUIOVector *qiov)
1887 {
1888 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
1889
1890 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
1891 }
1892
1893 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
1894 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1895 {
1896 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
1897
1898 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
1899 BDRV_REQ_COPY_ON_READ);
1900 }
1901
1902 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
1903 int64_t sector_num, int nb_sectors)
1904 {
1905 BlockDriver *drv = bs->drv;
1906 QEMUIOVector qiov;
1907 struct iovec iov;
1908 int ret;
1909
1910 /* TODO Emulate only part of misaligned requests instead of letting block
1911 * drivers return -ENOTSUP and emulate everything */
1912
1913 /* First try the efficient write zeroes operation */
1914 if (drv->bdrv_co_write_zeroes) {
1915 ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
1916 if (ret != -ENOTSUP) {
1917 return ret;
1918 }
1919 }
1920
1921 /* Fall back to bounce buffer if write zeroes is unsupported */
1922 iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
1923 iov.iov_base = qemu_blockalign(bs, iov.iov_len);
1924 memset(iov.iov_base, 0, iov.iov_len);
1925 qemu_iovec_init_external(&qiov, &iov, 1);
1926
1927 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);
1928
1929 qemu_vfree(iov.iov_base);
1930 return ret;
1931 }
1932
1933 /*
1934 * Handle a write request in coroutine context
1935 */
1936 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
1937 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1938 BdrvRequestFlags flags)
1939 {
1940 BlockDriver *drv = bs->drv;
1941 BdrvTrackedRequest req;
1942 int ret;
1943
1944 if (!bs->drv) {
1945 return -ENOMEDIUM;
1946 }
1947 if (bs->read_only) {
1948 return -EACCES;
1949 }
1950 if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1951 return -EIO;
1952 }
1953
1954 /* throttling disk write I/O */
1955 if (bs->io_limits_enabled) {
1956 bdrv_io_limits_intercept(bs, true, nb_sectors);
1957 }
1958
1959 if (bs->copy_on_read_in_flight) {
1960 wait_for_overlapping_requests(bs, sector_num, nb_sectors);
1961 }
1962
1963 tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
1964
1965 if (flags & BDRV_REQ_ZERO_WRITE) {
1966 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
1967 } else {
1968 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
1969 }
1970
1971 if (bs->dirty_bitmap) {
1972 set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
1973 }
1974
1975 if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
1976 bs->wr_highest_sector = sector_num + nb_sectors - 1;
1977 }
1978
1979 tracked_request_end(&req);
1980
1981 return ret;
1982 }
1983
1984 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
1985 int nb_sectors, QEMUIOVector *qiov)
1986 {
1987 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
1988
1989 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
1990 }
1991
1992 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
1993 int64_t sector_num, int nb_sectors)
1994 {
1995 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
1996
1997 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
1998 BDRV_REQ_ZERO_WRITE);
1999 }
2000
2001 /**
2002 * Truncate file to 'offset' bytes (needed only for file protocols)
2003 */
2004 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
2005 {
2006 BlockDriver *drv = bs->drv;
2007 int ret;
2008 if (!drv)
2009 return -ENOMEDIUM;
2010 if (!drv->bdrv_truncate)
2011 return -ENOTSUP;
2012 if (bs->read_only)
2013 return -EACCES;
2014 if (bdrv_in_use(bs))
2015 return -EBUSY;
2016 ret = drv->bdrv_truncate(bs, offset);
2017 if (ret == 0) {
2018 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
2019 bdrv_dev_resize_cb(bs);
2020 }
2021 return ret;
2022 }
2023
2024 /**
2025 * Length of an allocated file in bytes. Sparse files are counted by actual
2026 * allocated space. Return < 0 if error or unknown.
2027 */
2028 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
2029 {
2030 BlockDriver *drv = bs->drv;
2031 if (!drv) {
2032 return -ENOMEDIUM;
2033 }
2034 if (drv->bdrv_get_allocated_file_size) {
2035 return drv->bdrv_get_allocated_file_size(bs);
2036 }
2037 if (bs->file) {
2038 return bdrv_get_allocated_file_size(bs->file);
2039 }
2040 return -ENOTSUP;
2041 }
2042
2043 /**
2044 * Length of a file in bytes. Return < 0 if error or unknown.
2045 */
2046 int64_t bdrv_getlength(BlockDriverState *bs)
2047 {
2048 BlockDriver *drv = bs->drv;
2049 if (!drv)
2050 return -ENOMEDIUM;
2051
2052 if (bs->growable || bdrv_dev_has_removable_media(bs)) {
2053 if (drv->bdrv_getlength) {
2054 return drv->bdrv_getlength(bs);
2055 }
2056 }
2057 return bs->total_sectors * BDRV_SECTOR_SIZE;
2058 }
2059
2060 /* return 0 as the number of sectors if no device is present or on error */
2061 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
2062 {
2063 int64_t length;
2064 length = bdrv_getlength(bs);
2065 if (length < 0)
2066 length = 0;
2067 else
2068 length = length >> BDRV_SECTOR_BITS;
2069 *nb_sectors_ptr = length;
2070 }
2071
2072 struct partition {
2073 uint8_t boot_ind; /* 0x80 - active */
2074 uint8_t head; /* starting head */
2075 uint8_t sector; /* starting sector */
2076 uint8_t cyl; /* starting cylinder */
2077 uint8_t sys_ind; /* What partition type */
2078 uint8_t end_head; /* end head */
2079 uint8_t end_sector; /* end sector */
2080 uint8_t end_cyl; /* end cylinder */
2081 uint32_t start_sect; /* starting sector counting from 0 */
2082 uint32_t nr_sects; /* nr of sectors in partition */
2083 } QEMU_PACKED;
2084
2085 /* Try to guess the disk logical geometry from the MS-DOS partition table. Return 0 if OK, -1 if the geometry could not be guessed. */
2086 static int guess_disk_lchs(BlockDriverState *bs,
2087 int *pcylinders, int *pheads, int *psectors)
2088 {
2089 uint8_t buf[BDRV_SECTOR_SIZE];
2090 int ret, i, heads, sectors, cylinders;
2091 struct partition *p;
2092 uint32_t nr_sects;
2093 uint64_t nb_sectors;
2094 bool enabled;
2095
2096 bdrv_get_geometry(bs, &nb_sectors);
2097
2098 /**
2099 * This function can be invoked during startup in both sync and async
2100 * I/O mode, so the I/O throttling function has to be disabled here
2101 * temporarily, not permanently.
2102 */
2103 enabled = bs->io_limits_enabled;
2104 bs->io_limits_enabled = false;
2105 ret = bdrv_read(bs, 0, buf, 1);
2106 bs->io_limits_enabled = enabled;
2107 if (ret < 0)
2108 return -1;
2109 /* test msdos magic */
2110 if (buf[510] != 0x55 || buf[511] != 0xaa)
2111 return -1;
2112 for(i = 0; i < 4; i++) {
2113 p = ((struct partition *)(buf + 0x1be)) + i;
2114 nr_sects = le32_to_cpu(p->nr_sects);
2115 if (nr_sects && p->end_head) {
2116 /* We make the assumption that the partition terminates on
2117 a cylinder boundary */
2118 heads = p->end_head + 1;
2119 sectors = p->end_sector & 63;
2120 if (sectors == 0)
2121 continue;
2122 cylinders = nb_sectors / (heads * sectors);
2123 if (cylinders < 1 || cylinders > 16383)
2124 continue;
2125 *pheads = heads;
2126 *psectors = sectors;
2127 *pcylinders = cylinders;
2128 #if 0
2129 printf("guessed geometry: LCHS=%d %d %d\n",
2130 cylinders, heads, sectors);
2131 #endif
2132 return 0;
2133 }
2134 }
2135 return -1;
2136 }
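
/* Worked example (hypothetical partition entry, for illustration): if a
 * partition's end CHS is head 15, sector 63, then heads = 15 + 1 = 16 and
 * sectors = 63 & 63 = 63, so a disk of 1032192 sectors yields
 * cylinders = 1032192 / (16 * 63) = 1024, i.e. a guessed LCHS of
 * 1024/16/63.
 */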
2137
2138 void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
2139 {
2140 int translation, lba_detected = 0;
2141 int cylinders, heads, secs;
2142 uint64_t nb_sectors;
2143
2144 /* if a geometry hint is available, use it */
2145 bdrv_get_geometry(bs, &nb_sectors);
2146 bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
2147 translation = bdrv_get_translation_hint(bs);
2148 if (cylinders != 0) {
2149 *pcyls = cylinders;
2150 *pheads = heads;
2151 *psecs = secs;
2152 } else {
2153 if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
2154 if (heads > 16) {
2155 /* if heads > 16, it means that a BIOS LBA
2156 translation was active, so the default
2157 hardware geometry is OK */
2158 lba_detected = 1;
2159 goto default_geometry;
2160 } else {
2161 *pcyls = cylinders;
2162 *pheads = heads;
2163 *psecs = secs;
2164 /* disable any translation to be in sync with
2165 the logical geometry */
2166 if (translation == BIOS_ATA_TRANSLATION_AUTO) {
2167 bdrv_set_translation_hint(bs,
2168 BIOS_ATA_TRANSLATION_NONE);
2169 }
2170 }
2171 } else {
2172 default_geometry:
2173 /* if no geometry, use a standard physical disk geometry */
2174 cylinders = nb_sectors / (16 * 63);
2175
2176 if (cylinders > 16383)
2177 cylinders = 16383;
2178 else if (cylinders < 2)
2179 cylinders = 2;
2180 *pcyls = cylinders;
2181 *pheads = 16;
2182 *psecs = 63;
2183 if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
2184 if ((*pcyls * *pheads) <= 131072) {
2185 bdrv_set_translation_hint(bs,
2186 BIOS_ATA_TRANSLATION_LARGE);
2187 } else {
2188 bdrv_set_translation_hint(bs,
2189 BIOS_ATA_TRANSLATION_LBA);
2190 }
2191 }
2192 }
2193 bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
2194 }
2195 }
2196
2197 void bdrv_set_geometry_hint(BlockDriverState *bs,
2198 int cyls, int heads, int secs)
2199 {
2200 bs->cyls = cyls;
2201 bs->heads = heads;
2202 bs->secs = secs;
2203 }
2204
2205 void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
2206 {
2207 bs->translation = translation;
2208 }
2209
2210 void bdrv_get_geometry_hint(BlockDriverState *bs,
2211 int *pcyls, int *pheads, int *psecs)
2212 {
2213 *pcyls = bs->cyls;
2214 *pheads = bs->heads;
2215 *psecs = bs->secs;
2216 }
2217
2218 /* throttling disk io limits */
2219 void bdrv_set_io_limits(BlockDriverState *bs,
2220 BlockIOLimit *io_limits)
2221 {
2222 bs->io_limits = *io_limits;
2223 bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
2224 }
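
/* A minimal configuration sketch (illustrative only; "bs" is a
 * hypothetical open BlockDriverState and the BlockIOLimit layout is the
 * array-indexed one used elsewhere in this file): cap a drive at 1 MB/s
 * and 100 IOPS in total.
 */
#if 0
static void example_set_limits(BlockDriverState *bs)
{
    BlockIOLimit limits = {
        .bps  = { [BLOCK_IO_LIMIT_TOTAL] = 1024 * 1024 },  /* 1 MB/s */
        .iops = { [BLOCK_IO_LIMIT_TOTAL] = 100 },          /* 100 IOPS */
    };

    bdrv_set_io_limits(bs, &limits);
}
#endif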
2225
2226 /* Recognize floppy formats */
2227 typedef struct FDFormat {
2228 FDriveType drive;
2229 uint8_t last_sect;
2230 uint8_t max_track;
2231 uint8_t max_head;
2232 FDriveRate rate;
2233 } FDFormat;
2234
2235 static const FDFormat fd_formats[] = {
2236 /* First entry is default format */
2237 /* 1.44 MB 3"1/2 floppy disks */
2238 { FDRIVE_DRV_144, 18, 80, 1, FDRIVE_RATE_500K, },
2239 { FDRIVE_DRV_144, 20, 80, 1, FDRIVE_RATE_500K, },
2240 { FDRIVE_DRV_144, 21, 80, 1, FDRIVE_RATE_500K, },
2241 { FDRIVE_DRV_144, 21, 82, 1, FDRIVE_RATE_500K, },
2242 { FDRIVE_DRV_144, 21, 83, 1, FDRIVE_RATE_500K, },
2243 { FDRIVE_DRV_144, 22, 80, 1, FDRIVE_RATE_500K, },
2244 { FDRIVE_DRV_144, 23, 80, 1, FDRIVE_RATE_500K, },
2245 { FDRIVE_DRV_144, 24, 80, 1, FDRIVE_RATE_500K, },
2246 /* 2.88 MB 3"1/2 floppy disks */
2247 { FDRIVE_DRV_288, 36, 80, 1, FDRIVE_RATE_1M, },
2248 { FDRIVE_DRV_288, 39, 80, 1, FDRIVE_RATE_1M, },
2249 { FDRIVE_DRV_288, 40, 80, 1, FDRIVE_RATE_1M, },
2250 { FDRIVE_DRV_288, 44, 80, 1, FDRIVE_RATE_1M, },
2251 { FDRIVE_DRV_288, 48, 80, 1, FDRIVE_RATE_1M, },
2252 /* 720 kB 3"1/2 floppy disks */
2253 { FDRIVE_DRV_144, 9, 80, 1, FDRIVE_RATE_250K, },
2254 { FDRIVE_DRV_144, 10, 80, 1, FDRIVE_RATE_250K, },
2255 { FDRIVE_DRV_144, 10, 82, 1, FDRIVE_RATE_250K, },
2256 { FDRIVE_DRV_144, 10, 83, 1, FDRIVE_RATE_250K, },
2257 { FDRIVE_DRV_144, 13, 80, 1, FDRIVE_RATE_250K, },
2258 { FDRIVE_DRV_144, 14, 80, 1, FDRIVE_RATE_250K, },
2259 /* 1.2 MB 5"1/4 floppy disks */
2260 { FDRIVE_DRV_120, 15, 80, 1, FDRIVE_RATE_500K, },
2261 { FDRIVE_DRV_120, 18, 80, 1, FDRIVE_RATE_500K, },
2262 { FDRIVE_DRV_120, 18, 82, 1, FDRIVE_RATE_500K, },
2263 { FDRIVE_DRV_120, 18, 83, 1, FDRIVE_RATE_500K, },
2264 { FDRIVE_DRV_120, 20, 80, 1, FDRIVE_RATE_500K, },
2265 /* 720 kB 5"1/4 floppy disks */
2266 { FDRIVE_DRV_120, 9, 80, 1, FDRIVE_RATE_250K, },
2267 { FDRIVE_DRV_120, 11, 80, 1, FDRIVE_RATE_250K, },
2268 /* 360 kB 5"1/4 floppy disks */
2269 { FDRIVE_DRV_120, 9, 40, 1, FDRIVE_RATE_300K, },
2270 { FDRIVE_DRV_120, 9, 40, 0, FDRIVE_RATE_300K, },
2271 { FDRIVE_DRV_120, 10, 41, 1, FDRIVE_RATE_300K, },
2272 { FDRIVE_DRV_120, 10, 42, 1, FDRIVE_RATE_300K, },
2273 /* 320 kB 5"1/4 floppy disks */
2274 { FDRIVE_DRV_120, 8, 40, 1, FDRIVE_RATE_250K, },
2275 { FDRIVE_DRV_120, 8, 40, 0, FDRIVE_RATE_250K, },
2276 /* 360 kB must match 5"1/4 better than 3"1/2... */
2277 { FDRIVE_DRV_144, 9, 80, 0, FDRIVE_RATE_250K, },
2278 /* end */
2279 { FDRIVE_DRV_NONE, -1, -1, 0, 0, },
2280 };
2281
2282 void bdrv_get_floppy_geometry_hint(BlockDriverState *bs, int *nb_heads,
2283 int *max_track, int *last_sect,
2284 FDriveType drive_in, FDriveType *drive,
2285 FDriveRate *rate)
2286 {
2287 const FDFormat *parse;
2288 uint64_t nb_sectors, size;
2289 int i, first_match, match;
2290
2291 bdrv_get_geometry_hint(bs, nb_heads, max_track, last_sect);
2292 if (*nb_heads != 0 && *max_track != 0 && *last_sect != 0) {
2293 /* User-defined disk */
2294 *rate = FDRIVE_RATE_500K;
2295 } else {
2296 bdrv_get_geometry(bs, &nb_sectors);
2297 match = -1;
2298 first_match = -1;
2299 for (i = 0; ; i++) {
2300 parse = &fd_formats[i];
2301 if (parse->drive == FDRIVE_DRV_NONE) {
2302 break;
2303 }
2304 if (drive_in == parse->drive ||
2305 drive_in == FDRIVE_DRV_NONE) {
2306 size = (parse->max_head + 1) * parse->max_track *
2307 parse->last_sect;
2308 if (nb_sectors == size) {
2309 match = i;
2310 break;
2311 }
2312 if (first_match == -1) {
2313 first_match = i;
2314 }
2315 }
2316 }
2317 if (match == -1) {
2318 if (first_match == -1) {
2319 match = 1;
2320 } else {
2321 match = first_match;
2322 }
2323 parse = &fd_formats[match];
2324 }
2325 *nb_heads = parse->max_head + 1;
2326 *max_track = parse->max_track;
2327 *last_sect = parse->last_sect;
2328 *drive = parse->drive;
2329 *rate = parse->rate;
2330 }
2331 }
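
/* Worked example: a raw 1.44 MB image is 2880 512-byte sectors; the first
 * fd_formats entry gives size = (max_head + 1) * max_track * last_sect =
 * 2 * 80 * 18 = 2880, an exact match, so the drive is reported as
 * FDRIVE_DRV_144 at FDRIVE_RATE_500K.
 */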
2332
2333 int bdrv_get_translation_hint(BlockDriverState *bs)
2334 {
2335 return bs->translation;
2336 }
2337
2338 void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
2339 BlockErrorAction on_write_error)
2340 {
2341 bs->on_read_error = on_read_error;
2342 bs->on_write_error = on_write_error;
2343 }
2344
2345 BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
2346 {
2347 return is_read ? bs->on_read_error : bs->on_write_error;
2348 }
2349
2350 int bdrv_is_read_only(BlockDriverState *bs)
2351 {
2352 return bs->read_only;
2353 }
2354
2355 int bdrv_is_sg(BlockDriverState *bs)
2356 {
2357 return bs->sg;
2358 }
2359
2360 int bdrv_enable_write_cache(BlockDriverState *bs)
2361 {
2362 return bs->enable_write_cache;
2363 }
2364
2365 int bdrv_is_encrypted(BlockDriverState *bs)
2366 {
2367 if (bs->backing_hd && bs->backing_hd->encrypted)
2368 return 1;
2369 return bs->encrypted;
2370 }
2371
2372 int bdrv_key_required(BlockDriverState *bs)
2373 {
2374 BlockDriverState *backing_hd = bs->backing_hd;
2375
2376 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
2377 return 1;
2378 return (bs->encrypted && !bs->valid_key);
2379 }
2380
2381 int bdrv_set_key(BlockDriverState *bs, const char *key)
2382 {
2383 int ret;
2384 if (bs->backing_hd && bs->backing_hd->encrypted) {
2385 ret = bdrv_set_key(bs->backing_hd, key);
2386 if (ret < 0)
2387 return ret;
2388 if (!bs->encrypted)
2389 return 0;
2390 }
2391 if (!bs->encrypted) {
2392 return -EINVAL;
2393 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
2394 return -ENOMEDIUM;
2395 }
2396 ret = bs->drv->bdrv_set_key(bs, key);
2397 if (ret < 0) {
2398 bs->valid_key = 0;
2399 } else if (!bs->valid_key) {
2400 bs->valid_key = 1;
2401 /* call the change callback now, we skipped it on open */
2402 bdrv_dev_change_media_cb(bs, true);
2403 }
2404 return ret;
2405 }
2406
2407 void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
2408 {
2409 if (!bs->drv) {
2410 buf[0] = '\0';
2411 } else {
2412 pstrcpy(buf, buf_size, bs->drv->format_name);
2413 }
2414 }
2415
2416 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
2417 void *opaque)
2418 {
2419 BlockDriver *drv;
2420
2421 QLIST_FOREACH(drv, &bdrv_drivers, list) {
2422 it(opaque, drv->format_name);
2423 }
2424 }
2425
2426 BlockDriverState *bdrv_find(const char *name)
2427 {
2428 BlockDriverState *bs;
2429
2430 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2431 if (!strcmp(name, bs->device_name)) {
2432 return bs;
2433 }
2434 }
2435 return NULL;
2436 }
2437
2438 BlockDriverState *bdrv_next(BlockDriverState *bs)
2439 {
2440 if (!bs) {
2441 return QTAILQ_FIRST(&bdrv_states);
2442 }
2443 return QTAILQ_NEXT(bs, list);
2444 }
2445
2446 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
2447 {
2448 BlockDriverState *bs;
2449
2450 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2451 it(opaque, bs);
2452 }
2453 }
2454
2455 const char *bdrv_get_device_name(BlockDriverState *bs)
2456 {
2457 return bs->device_name;
2458 }
2459
2460 void bdrv_flush_all(void)
2461 {
2462 BlockDriverState *bs;
2463
2464 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2465 bdrv_flush(bs);
2466 }
2467 }
2468
2469 int bdrv_has_zero_init(BlockDriverState *bs)
2470 {
2471 assert(bs->drv);
2472
2473 if (bs->drv->bdrv_has_zero_init) {
2474 return bs->drv->bdrv_has_zero_init(bs);
2475 }
2476
2477 return 1;
2478 }
2479
2480 typedef struct BdrvCoIsAllocatedData {
2481 BlockDriverState *bs;
2482 int64_t sector_num;
2483 int nb_sectors;
2484 int *pnum;
2485 int ret;
2486 bool done;
2487 } BdrvCoIsAllocatedData;
2488
2489 /*
2490 * Returns true iff the specified sector is present in the disk image. Drivers
2491 * not implementing the functionality are assumed to not support backing files,
2492 * hence all their sectors are reported as allocated.
2493 *
2494 * If 'sector_num' is beyond the end of the disk image the return value is 0
2495 * and 'pnum' is set to 0.
2496 *
2497 * 'pnum' is set to the number of sectors (including and immediately following
2498 * the specified sector) that are known to be in the same
2499 * allocated/unallocated state.
2500 *
2501 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
2502 * beyond the end of the disk image it will be clamped.
2503 */
2504 int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
2505 int nb_sectors, int *pnum)
2506 {
2507 int64_t n;
2508
2509 if (sector_num >= bs->total_sectors) {
2510 *pnum = 0;
2511 return 0;
2512 }
2513
2514 n = bs->total_sectors - sector_num;
2515 if (n < nb_sectors) {
2516 nb_sectors = n;
2517 }
2518
2519 if (!bs->drv->bdrv_co_is_allocated) {
2520 *pnum = nb_sectors;
2521 return 1;
2522 }
2523
2524 return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
2525 }
2526
2527 /* Coroutine wrapper for bdrv_is_allocated() */
2528 static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
2529 {
2530 BdrvCoIsAllocatedData *data = opaque;
2531 BlockDriverState *bs = data->bs;
2532
2533 data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
2534 data->pnum);
2535 data->done = true;
2536 }
2537
2538 /*
2539 * Synchronous wrapper around bdrv_co_is_allocated().
2540 *
2541 * See bdrv_co_is_allocated() for details.
2542 */
2543 int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
2544 int *pnum)
2545 {
2546 Coroutine *co;
2547 BdrvCoIsAllocatedData data = {
2548 .bs = bs,
2549 .sector_num = sector_num,
2550 .nb_sectors = nb_sectors,
2551 .pnum = pnum,
2552 .done = false,
2553 };
2554
2555 co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
2556 qemu_coroutine_enter(co, &data);
2557 while (!data.done) {
2558 qemu_aio_wait();
2559 }
2560 return data.ret;
2561 }
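
/* A minimal sketch of walking an image's allocation map with the
 * synchronous wrapper (illustrative only; "bs" is a hypothetical open
 * BlockDriverState, and MIN comes from qemu-common.h):
 */
#if 0
static void example_dump_allocation_map(BlockDriverState *bs)
{
    int64_t sector = 0;
    int64_t total = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;

    while (sector < total) {
        int num;
        int ret = bdrv_is_allocated(bs, sector,
                                    MIN(total - sector, INT_MAX), &num);
        printf("%" PRId64 "+%d: %s\n", sector, num,
               ret ? "allocated" : "unallocated");
        sector += num;
    }
}
#endif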
2562
2563 BlockInfoList *qmp_query_block(Error **errp)
2564 {
2565 BlockInfoList *head = NULL, *cur_item = NULL;
2566 BlockDriverState *bs;
2567
2568 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2569 BlockInfoList *info = g_malloc0(sizeof(*info));
2570
2571 info->value = g_malloc0(sizeof(*info->value));
2572 info->value->device = g_strdup(bs->device_name);
2573 info->value->type = g_strdup("unknown");
2574 info->value->locked = bdrv_dev_is_medium_locked(bs);
2575 info->value->removable = bdrv_dev_has_removable_media(bs);
2576
2577 if (bdrv_dev_has_removable_media(bs)) {
2578 info->value->has_tray_open = true;
2579 info->value->tray_open = bdrv_dev_is_tray_open(bs);
2580 }
2581
2582 if (bdrv_iostatus_is_enabled(bs)) {
2583 info->value->has_io_status = true;
2584 info->value->io_status = bs->iostatus;
2585 }
2586
2587 if (bs->drv) {
2588 info->value->has_inserted = true;
2589 info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
2590 info->value->inserted->file = g_strdup(bs->filename);
2591 info->value->inserted->ro = bs->read_only;
2592 info->value->inserted->drv = g_strdup(bs->drv->format_name);
2593 info->value->inserted->encrypted = bs->encrypted;
2594 if (bs->backing_file[0]) {
2595 info->value->inserted->has_backing_file = true;
2596 info->value->inserted->backing_file = g_strdup(bs->backing_file);
2597 }
2598
2599 if (bs->io_limits_enabled) {
2600 info->value->inserted->bps =
2601 bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
2602 info->value->inserted->bps_rd =
2603 bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
2604 info->value->inserted->bps_wr =
2605 bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
2606 info->value->inserted->iops =
2607 bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
2608 info->value->inserted->iops_rd =
2609 bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
2610 info->value->inserted->iops_wr =
2611 bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
2612 }
2613 }
2614
2615 /* XXX: waiting for the qapi to support GSList */
2616 if (!cur_item) {
2617 head = cur_item = info;
2618 } else {
2619 cur_item->next = info;
2620 cur_item = info;
2621 }
2622 }
2623
2624 return head;
2625 }
2626
2627 /* Consider exposing this as a full fledged QMP command */
2628 static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
2629 {
2630 BlockStats *s;
2631
2632 s = g_malloc0(sizeof(*s));
2633
2634 if (bs->device_name[0]) {
2635 s->has_device = true;
2636 s->device = g_strdup(bs->device_name);
2637 }
2638
2639 s->stats = g_malloc0(sizeof(*s->stats));
2640 s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
2641 s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
2642 s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
2643 s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
2644 s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
2645 s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
2646 s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
2647 s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
2648 s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];
2649
2650 if (bs->file) {
2651 s->has_parent = true;
2652 s->parent = qmp_query_blockstat(bs->file, NULL);
2653 }
2654
2655 return s;
2656 }
2657
2658 BlockStatsList *qmp_query_blockstats(Error **errp)
2659 {
2660 BlockStatsList *head = NULL, *cur_item = NULL;
2661 BlockDriverState *bs;
2662
2663 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2664 BlockStatsList *info = g_malloc0(sizeof(*info));
2665 info->value = qmp_query_blockstat(bs, NULL);
2666
2667 /* XXX: waiting for the qapi to support GSList */
2668 if (!cur_item) {
2669 head = cur_item = info;
2670 } else {
2671 cur_item->next = info;
2672 cur_item = info;
2673 }
2674 }
2675
2676 return head;
2677 }
2678
2679 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
2680 {
2681 if (bs->backing_hd && bs->backing_hd->encrypted)
2682 return bs->backing_file;
2683 else if (bs->encrypted)
2684 return bs->filename;
2685 else
2686 return NULL;
2687 }
2688
2689 void bdrv_get_backing_filename(BlockDriverState *bs,
2690 char *filename, int filename_size)
2691 {
2692 pstrcpy(filename, filename_size, bs->backing_file);
2693 }
2694
2695 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
2696 const uint8_t *buf, int nb_sectors)
2697 {
2698 BlockDriver *drv = bs->drv;
2699 if (!drv)
2700 return -ENOMEDIUM;
2701 if (!drv->bdrv_write_compressed)
2702 return -ENOTSUP;
2703 if (bdrv_check_request(bs, sector_num, nb_sectors))
2704 return -EIO;
2705
2706 if (bs->dirty_bitmap) {
2707 set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2708 }
2709
2710 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
2711 }
2712
2713 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2714 {
2715 BlockDriver *drv = bs->drv;
2716 if (!drv)
2717 return -ENOMEDIUM;
2718 if (!drv->bdrv_get_info)
2719 return -ENOTSUP;
2720 memset(bdi, 0, sizeof(*bdi));
2721 return drv->bdrv_get_info(bs, bdi);
2722 }
2723
2724 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2725 int64_t pos, int size)
2726 {
2727 BlockDriver *drv = bs->drv;
2728 if (!drv)
2729 return -ENOMEDIUM;
2730 if (drv->bdrv_save_vmstate)
2731 return drv->bdrv_save_vmstate(bs, buf, pos, size);
2732 if (bs->file)
2733 return bdrv_save_vmstate(bs->file, buf, pos, size);
2734 return -ENOTSUP;
2735 }
2736
2737 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2738 int64_t pos, int size)
2739 {
2740 BlockDriver *drv = bs->drv;
2741 if (!drv)
2742 return -ENOMEDIUM;
2743 if (drv->bdrv_load_vmstate)
2744 return drv->bdrv_load_vmstate(bs, buf, pos, size);
2745 if (bs->file)
2746 return bdrv_load_vmstate(bs->file, buf, pos, size);
2747 return -ENOTSUP;
2748 }
2749
2750 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
2751 {
2752 BlockDriver *drv = bs->drv;
2753
2754 if (!drv || !drv->bdrv_debug_event) {
2755 return;
2756 }
2757
2758 drv->bdrv_debug_event(bs, event);
2760 }
2761
2762 /**************************************************************/
2763 /* handling of snapshots */
2764
2765 int bdrv_can_snapshot(BlockDriverState *bs)
2766 {
2767 BlockDriver *drv = bs->drv;
2768 if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
2769 return 0;
2770 }
2771
2772 if (!drv->bdrv_snapshot_create) {
2773 if (bs->file != NULL) {
2774 return bdrv_can_snapshot(bs->file);
2775 }
2776 return 0;
2777 }
2778
2779 return 1;
2780 }
2781
2782 int bdrv_is_snapshot(BlockDriverState *bs)
2783 {
2784 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
2785 }
2786
2787 BlockDriverState *bdrv_snapshots(void)
2788 {
2789 BlockDriverState *bs;
2790
2791 if (bs_snapshots) {
2792 return bs_snapshots;
2793 }
2794
2795 bs = NULL;
2796 while ((bs = bdrv_next(bs))) {
2797 if (bdrv_can_snapshot(bs)) {
2798 bs_snapshots = bs;
2799 return bs;
2800 }
2801 }
2802 return NULL;
2803 }
2804
2805 int bdrv_snapshot_create(BlockDriverState *bs,
2806 QEMUSnapshotInfo *sn_info)
2807 {
2808 BlockDriver *drv = bs->drv;
2809 if (!drv)
2810 return -ENOMEDIUM;
2811 if (drv->bdrv_snapshot_create)
2812 return drv->bdrv_snapshot_create(bs, sn_info);
2813 if (bs->file)
2814 return bdrv_snapshot_create(bs->file, sn_info);
2815 return -ENOTSUP;
2816 }
2817
2818 int bdrv_snapshot_goto(BlockDriverState *bs,
2819 const char *snapshot_id)
2820 {
2821 BlockDriver *drv = bs->drv;
2822 int ret, open_ret;
2823
2824 if (!drv)
2825 return -ENOMEDIUM;
2826 if (drv->bdrv_snapshot_goto)
2827 return drv->bdrv_snapshot_goto(bs, snapshot_id);
2828
2829 if (bs->file) {
2830 drv->bdrv_close(bs);
2831 ret = bdrv_snapshot_goto(bs->file, snapshot_id);
2832 open_ret = drv->bdrv_open(bs, bs->open_flags);
2833 if (open_ret < 0) {
2834 bdrv_delete(bs->file);
2835 bs->drv = NULL;
2836 return open_ret;
2837 }
2838 return ret;
2839 }
2840
2841 return -ENOTSUP;
2842 }
2843
2844 int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
2845 {
2846 BlockDriver *drv = bs->drv;
2847 if (!drv)
2848 return -ENOMEDIUM;
2849 if (drv->bdrv_snapshot_delete)
2850 return drv->bdrv_snapshot_delete(bs, snapshot_id);
2851 if (bs->file)
2852 return bdrv_snapshot_delete(bs->file, snapshot_id);
2853 return -ENOTSUP;
2854 }
2855
2856 int bdrv_snapshot_list(BlockDriverState *bs,
2857 QEMUSnapshotInfo **psn_info)
2858 {
2859 BlockDriver *drv = bs->drv;
2860 if (!drv)
2861 return -ENOMEDIUM;
2862 if (drv->bdrv_snapshot_list)
2863 return drv->bdrv_snapshot_list(bs, psn_info);
2864 if (bs->file)
2865 return bdrv_snapshot_list(bs->file, psn_info);
2866 return -ENOTSUP;
2867 }
2868
2869 int bdrv_snapshot_load_tmp(BlockDriverState *bs,
2870 const char *snapshot_name)
2871 {
2872 BlockDriver *drv = bs->drv;
2873 if (!drv) {
2874 return -ENOMEDIUM;
2875 }
2876 if (!bs->read_only) {
2877 return -EINVAL;
2878 }
2879 if (drv->bdrv_snapshot_load_tmp) {
2880 return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
2881 }
2882 return -ENOTSUP;
2883 }
2884
2885 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
2886 const char *backing_file)
2887 {
2888 if (!bs->drv) {
2889 return NULL;
2890 }
2891
2892 if (bs->backing_hd) {
2893 if (strcmp(bs->backing_file, backing_file) == 0) {
2894 return bs->backing_hd;
2895 } else {
2896 return bdrv_find_backing_image(bs->backing_hd, backing_file);
2897 }
2898 }
2899
2900 return NULL;
2901 }
2902
2903 #define NB_SUFFIXES 4
2904
2905 char *get_human_readable_size(char *buf, int buf_size, int64_t size)
2906 {
2907 static const char suffixes[NB_SUFFIXES] = "KMGT";
2908 int64_t base;
2909 int i;
2910
2911 if (size <= 999) {
2912 snprintf(buf, buf_size, "%" PRId64, size);
2913 } else {
2914 base = 1024;
2915 for(i = 0; i < NB_SUFFIXES; i++) {
2916 if (size < (10 * base)) {
2917 snprintf(buf, buf_size, "%0.1f%c",
2918 (double)size / base,
2919 suffixes[i]);
2920 break;
2921 } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) {
2922 snprintf(buf, buf_size, "%" PRId64 "%c",
2923 ((size + (base >> 1)) / base),
2924 suffixes[i]);
2925 break;
2926 }
2927 base = base * 1024;
2928 }
2929 }
2930 return buf;
2931 }
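
/* Example outputs: 999 -> "999"; 1536 -> "1.5K"; 1048576 -> "1.0M".
 * Values of ten units or more are rounded to a whole number of units,
 * e.g. 15360 -> "15K".
 */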
2932
2933 char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
2934 {
2935 char buf1[128], date_buf[128], clock_buf[128];
2936 #ifdef _WIN32
2937 struct tm *ptm;
2938 #else
2939 struct tm tm;
2940 #endif
2941 time_t ti;
2942 int64_t secs;
2943
2944 if (!sn) {
2945 snprintf(buf, buf_size,
2946 "%-10s%-20s%7s%20s%15s",
2947 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
2948 } else {
2949 ti = sn->date_sec;
2950 #ifdef _WIN32
2951 ptm = localtime(&ti);
2952 strftime(date_buf, sizeof(date_buf),
2953 "%Y-%m-%d %H:%M:%S", ptm);
2954 #else
2955 localtime_r(&ti, &tm);
2956 strftime(date_buf, sizeof(date_buf),
2957 "%Y-%m-%d %H:%M:%S", &tm);
2958 #endif
2959 secs = sn->vm_clock_nsec / 1000000000;
2960 snprintf(clock_buf, sizeof(clock_buf),
2961 "%02d:%02d:%02d.%03d",
2962 (int)(secs / 3600),
2963 (int)((secs / 60) % 60),
2964 (int)(secs % 60),
2965 (int)((sn->vm_clock_nsec / 1000000) % 1000));
2966 snprintf(buf, buf_size,
2967 "%-10s%-20s%7s%20s%15s",
2968 sn->id_str, sn->name,
2969 get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
2970 date_buf,
2971 clock_buf);
2972 }
2973 return buf;
2974 }
2975
2976 /**************************************************************/
2977 /* async I/Os */
2978
2979 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
2980 QEMUIOVector *qiov, int nb_sectors,
2981 BlockDriverCompletionFunc *cb, void *opaque)
2982 {
2983 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
2984
2985 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
2986 cb, opaque, false);
2987 }
2988
2989 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
2990 QEMUIOVector *qiov, int nb_sectors,
2991 BlockDriverCompletionFunc *cb, void *opaque)
2992 {
2993 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
2994
2995 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
2996 cb, opaque, true);
2997 }
2998
2999
3000 typedef struct MultiwriteCB {
3001 int error;
3002 int num_requests;
3003 int num_callbacks;
3004 struct {
3005 BlockDriverCompletionFunc *cb;
3006 void *opaque;
3007 QEMUIOVector *free_qiov;
3008 } callbacks[];
3009 } MultiwriteCB;
3010
3011 static void multiwrite_user_cb(MultiwriteCB *mcb)
3012 {
3013 int i;
3014
3015 for (i = 0; i < mcb->num_callbacks; i++) {
3016 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
3017 if (mcb->callbacks[i].free_qiov) {
3018 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
3019 }
3020 g_free(mcb->callbacks[i].free_qiov);
3021 }
3022 }
3023
3024 static void multiwrite_cb(void *opaque, int ret)
3025 {
3026 MultiwriteCB *mcb = opaque;
3027
3028 trace_multiwrite_cb(mcb, ret);
3029
3030 if (ret < 0 && !mcb->error) {
3031 mcb->error = ret;
3032 }
3033
3034 mcb->num_requests--;
3035 if (mcb->num_requests == 0) {
3036 multiwrite_user_cb(mcb);
3037 g_free(mcb);
3038 }
3039 }
3040
3041 static int multiwrite_req_compare(const void *a, const void *b)
3042 {
3043 const BlockRequest *req1 = a, *req2 = b;
3044
3045 /*
3046 * Note that we can't simply subtract req2->sector from req1->sector
3047 * here as that could overflow the return value.
3048 */
3049 if (req1->sector > req2->sector) {
3050 return 1;
3051 } else if (req1->sector < req2->sector) {
3052 return -1;
3053 } else {
3054 return 0;
3055 }
3056 }
3057
3058 /*
3059 * Takes a bunch of requests and tries to merge them. Returns the number of
3060 * requests that remain after merging.
3061 */
3062 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
3063 int num_reqs, MultiwriteCB *mcb)
3064 {
3065 int i, outidx;
3066
3067 // Sort requests by start sector
3068 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
3069
3070 // Check if adjacent requests are exactly sequential or overlapping.
3071 // If so, combine them into a single request.
3072 outidx = 0;
3073 for (i = 1; i < num_reqs; i++) {
3074 int merge = 0;
3075 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
3076
3077 // Handle exactly sequential writes and overlapping writes.
3078 if (reqs[i].sector <= oldreq_last) {
3079 merge = 1;
3080 }
3081
3082 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
3083 merge = 0;
3084 }
3085
3086 if (merge) {
3087 size_t size;
3088 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
3089 qemu_iovec_init(qiov,
3090 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
3091
3092 // Add the first request to the merged one. If the requests are
3093 // overlapping, drop the last sectors of the first request.
3094 size = (reqs[i].sector - reqs[outidx].sector) << 9;
3095 qemu_iovec_concat(qiov, reqs[outidx].qiov, size);
3096
3097 // We should not need to add any zeros between the two requests
3098 assert (reqs[i].sector <= oldreq_last);
3099
3100 // Add the second request
3101 qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);
3102
3103 reqs[outidx].nb_sectors = qiov->size >> 9;
3104 reqs[outidx].qiov = qiov;
3105
3106 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
3107 } else {
3108 outidx++;
3109 reqs[outidx].sector = reqs[i].sector;
3110 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
3111 reqs[outidx].qiov = reqs[i].qiov;
3112 }
3113 }
3114
3115 return outidx + 1;
3116 }
3117
3118 /*
3119 * Submit multiple AIO write requests at once.
3120 *
3121 * On success, the function returns 0 and all requests in the reqs array have
3122 * been submitted. On error, this function returns -1 and any of the
3123 * requests may or may not have been submitted yet. In particular, the
3124 * callback will be called for some of the requests but not for others. The
3125 * caller must check the error field of the BlockRequest to wait for the right
3126 * callbacks (if error != 0, no callback will be called).
3127 *
3128 * The implementation may modify the contents of the reqs array, e.g. to merge
3129 * requests. However, the fields opaque and error are left unmodified as they
3130 * are used to signal failure for a single request to the caller.
3131 */
3132 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
3133 {
3134 MultiwriteCB *mcb;
3135 int i;
3136
3137 /* don't submit writes if we don't have a medium */
3138 if (bs->drv == NULL) {
3139 for (i = 0; i < num_reqs; i++) {
3140 reqs[i].error = -ENOMEDIUM;
3141 }
3142 return -1;
3143 }
3144
3145 if (num_reqs == 0) {
3146 return 0;
3147 }
3148
3149 // Create MultiwriteCB structure
3150 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
3151 mcb->num_requests = 0;
3152 mcb->num_callbacks = num_reqs;
3153
3154 for (i = 0; i < num_reqs; i++) {
3155 mcb->callbacks[i].cb = reqs[i].cb;
3156 mcb->callbacks[i].opaque = reqs[i].opaque;
3157 }
3158
3159 // Check for mergeable requests
3160 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
3161
3162 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
3163
3164 /* Run the aio requests. */
3165 mcb->num_requests = num_reqs;
3166 for (i = 0; i < num_reqs; i++) {
3167 bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
3168 reqs[i].nb_sectors, multiwrite_cb, mcb);
3169 }
3170
3171 return 0;
3172 }
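
/* A minimal submission sketch (illustrative only; "bs", "qiov1", "qiov2"
 * and "my_cb" are hypothetical).  The two requests below are exactly
 * sequential, so multiwrite_merge() would combine them before issuing:
 */
#if 0
static void example_multiwrite(BlockDriverState *bs, QEMUIOVector *qiov1,
                               QEMUIOVector *qiov2,
                               BlockDriverCompletionFunc *my_cb)
{
    BlockRequest reqs[2] = {
        { .sector = 0, .nb_sectors = 8, .qiov = qiov1,
          .cb = my_cb, .opaque = NULL },
        { .sector = 8, .nb_sectors = 8, .qiov = qiov2,
          .cb = my_cb, .opaque = NULL },
    };

    if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
        /* Inspect reqs[i].error: callbacks only run where error == 0. */
    }
}
#endif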
3173
3174 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
3175 {
3176 acb->pool->cancel(acb);
3177 }
3178
3179 /* block I/O throttling */
3180 static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
3181 bool is_write, double elapsed_time, uint64_t *wait)
3182 {
3183 uint64_t bps_limit = 0;
3184 double bytes_limit, bytes_base, bytes_res;
3185 double slice_time, wait_time;
3186
3187 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3188 bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
3189 } else if (bs->io_limits.bps[is_write]) {
3190 bps_limit = bs->io_limits.bps[is_write];
3191 } else {
3192 if (wait) {
3193 *wait = 0;
3194 }
3195
3196 return false;
3197 }
3198
3199 slice_time = bs->slice_end - bs->slice_start;
3200 slice_time /= (NANOSECONDS_PER_SECOND);
3201 bytes_limit = bps_limit * slice_time;
3202 bytes_base = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
3203 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3204 bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
3205 }
3206
3207 /* bytes_base: the number of bytes already read/written, obtained
3208 * from the accumulated statistics.
3209 * bytes_res: the remaining bytes of data which need to be read/written.
3210 * (bytes_base + bytes_res) / bps_limit: used to calculate
3211 * the total time for completing reading/writing all data.
3212 */
3213 bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
3214
3215 if (bytes_base + bytes_res <= bytes_limit) {
3216 if (wait) {
3217 *wait = 0;
3218 }
3219
3220 return false;
3221 }
3222
3223 /* Calc approx time to dispatch */
3224 wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
3225
3226 /* When the I/O rate at runtime exceeds the limits,
3227 * bs->slice_end needs to be extended so that the current statistics
3228 * can be kept until the timer fires; the extension is increased and
3229 * tuned based on experimental results.
3230 */
3231 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3232 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3233 if (wait) {
3234 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3235 }
3236
3237 return true;
3238 }
3239
3240 static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
3241 double elapsed_time, uint64_t *wait)
3242 {
3243 uint64_t iops_limit = 0;
3244 double ios_limit, ios_base;
3245 double slice_time, wait_time;
3246
3247 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3248 iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
3249 } else if (bs->io_limits.iops[is_write]) {
3250 iops_limit = bs->io_limits.iops[is_write];
3251 } else {
3252 if (wait) {
3253 *wait = 0;
3254 }
3255
3256 return false;
3257 }
3258
3259 slice_time = bs->slice_end - bs->slice_start;
3260 slice_time /= (NANOSECONDS_PER_SECOND);
3261 ios_limit = iops_limit * slice_time;
3262 ios_base = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
3263 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3264 ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
3265 }
3266
3267 if (ios_base + 1 <= ios_limit) {
3268 if (wait) {
3269 *wait = 0;
3270 }
3271
3272 return false;
3273 }
3274
3275 /* Calc approx time to dispatch */
3276 wait_time = (ios_base + 1) / iops_limit;
3277 if (wait_time > elapsed_time) {
3278 wait_time = wait_time - elapsed_time;
3279 } else {
3280 wait_time = 0;
3281 }
3282
3283 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3284 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3285 if (wait) {
3286 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3287 }
3288
3289 return true;
3290 }
3291
3292 static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
3293 bool is_write, int64_t *wait)
3294 {
3295 int64_t now, max_wait;
3296 uint64_t bps_wait = 0, iops_wait = 0;
3297 double elapsed_time;
3298 int bps_ret, iops_ret;
3299
3300 now = qemu_get_clock_ns(vm_clock);
3301 if ((bs->slice_start < now)
3302 && (bs->slice_end > now)) {
3303 bs->slice_end = now + bs->slice_time;
3304 } else {
3305 bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
3306 bs->slice_start = now;
3307 bs->slice_end = now + bs->slice_time;
3308
3309 bs->io_base.bytes[is_write] = bs->nr_bytes[is_write];
3310 bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];
3311
3312 bs->io_base.ios[is_write] = bs->nr_ops[is_write];
3313 bs->io_base.ios[!is_write] = bs->nr_ops[!is_write];
3314 }
3315
3316 elapsed_time = now - bs->slice_start;
3317 elapsed_time /= (NANOSECONDS_PER_SECOND);
3318
3319 bps_ret = bdrv_exceed_bps_limits(bs, nb_sectors,
3320 is_write, elapsed_time, &bps_wait);
3321 iops_ret = bdrv_exceed_iops_limits(bs, is_write,
3322 elapsed_time, &iops_wait);
3323 if (bps_ret || iops_ret) {
3324 max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
3325 if (wait) {
3326 *wait = max_wait;
3327 }
3328
3329 now = qemu_get_clock_ns(vm_clock);
3330 if (bs->slice_end < now + max_wait) {
3331 bs->slice_end = now + max_wait;
3332 }
3333
3334 return true;
3335 }
3336
3337 if (wait) {
3338 *wait = 0;
3339 }
3340
3341 return false;
3342 }
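
/* Worked example (illustrative numbers, assuming the default 0.5 s
 * slice): a total limit of bps = 1048576 (1 MB/s) gives bytes_limit =
 * 524288.  If 460800 bytes were already transferred in this slice
 * (bytes_base) and a 131072-byte request arrives (bytes_res), then
 * bytes_base + bytes_res = 591872 > bytes_limit, so the request is
 * delayed for roughly 591872 / 1048576 - elapsed_time seconds, i.e.
 * about 0.56 s minus the time already spent in the slice.
 */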
3343
3344 /**************************************************************/
3345 /* async block device emulation */
3346
3347 typedef struct BlockDriverAIOCBSync {
3348 BlockDriverAIOCB common;
3349 QEMUBH *bh;
3350 int ret;
3351 /* vector translation state */
3352 QEMUIOVector *qiov;
3353 uint8_t *bounce;
3354 int is_write;
3355 } BlockDriverAIOCBSync;
3356
3357 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
3358 {
3359 BlockDriverAIOCBSync *acb =
3360 container_of(blockacb, BlockDriverAIOCBSync, common);
3361 qemu_bh_delete(acb->bh);
3362 acb->bh = NULL;
3363 qemu_aio_release(acb);
3364 }
3365
3366 static AIOPool bdrv_em_aio_pool = {
3367 .aiocb_size = sizeof(BlockDriverAIOCBSync),
3368 .cancel = bdrv_aio_cancel_em,
3369 };
3370
3371 static void bdrv_aio_bh_cb(void *opaque)
3372 {
3373 BlockDriverAIOCBSync *acb = opaque;
3374
3375 if (!acb->is_write)
3376 qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
3377 qemu_vfree(acb->bounce);
3378 acb->common.cb(acb->common.opaque, acb->ret);
3379 qemu_bh_delete(acb->bh);
3380 acb->bh = NULL;
3381 qemu_aio_release(acb);
3382 }
3383
3384 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
3385 int64_t sector_num,
3386 QEMUIOVector *qiov,
3387 int nb_sectors,
3388 BlockDriverCompletionFunc *cb,
3389 void *opaque,
3390 int is_write)
3391
3392 {
3393 BlockDriverAIOCBSync *acb;
3394
3395 acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
3396 acb->is_write = is_write;
3397 acb->qiov = qiov;
3398 acb->bounce = qemu_blockalign(bs, qiov->size);
3399 acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
3400
3401 if (is_write) {
3402 qemu_iovec_to_buffer(acb->qiov, acb->bounce);
3403 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
3404 } else {
3405 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
3406 }
3407
3408 qemu_bh_schedule(acb->bh);
3409
3410 return &acb->common;
3411 }
3412
3413 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
3414 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3415 BlockDriverCompletionFunc *cb, void *opaque)
3416 {
3417 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
3418 }
3419
3420 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
3421 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3422 BlockDriverCompletionFunc *cb, void *opaque)
3423 {
3424 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
3425 }
3426
3427
3428 typedef struct BlockDriverAIOCBCoroutine {
3429 BlockDriverAIOCB common;
3430 BlockRequest req;
3431 bool is_write;
3432 QEMUBH* bh;
3433 } BlockDriverAIOCBCoroutine;
3434
3435 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
3436 {
3437 qemu_aio_flush();
3438 }
3439
3440 static AIOPool bdrv_em_co_aio_pool = {
3441 .aiocb_size = sizeof(BlockDriverAIOCBCoroutine),
3442 .cancel = bdrv_aio_co_cancel_em,
3443 };
3444
3445 static void bdrv_co_em_bh(void *opaque)
3446 {
3447 BlockDriverAIOCBCoroutine *acb = opaque;
3448
3449 acb->common.cb(acb->common.opaque, acb->req.error);
3450 qemu_bh_delete(acb->bh);
3451 qemu_aio_release(acb);
3452 }
3453
3454 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
3455 static void coroutine_fn bdrv_co_do_rw(void *opaque)
3456 {
3457 BlockDriverAIOCBCoroutine *acb = opaque;
3458 BlockDriverState *bs = acb->common.bs;
3459
3460 if (!acb->is_write) {
3461 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
3462 acb->req.nb_sectors, acb->req.qiov, 0);
3463 } else {
3464 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
3465 acb->req.nb_sectors, acb->req.qiov, 0);
3466 }
3467
3468 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3469 qemu_bh_schedule(acb->bh);
3470 }
3471
3472 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
3473 int64_t sector_num,
3474 QEMUIOVector *qiov,
3475 int nb_sectors,
3476 BlockDriverCompletionFunc *cb,
3477 void *opaque,
3478 bool is_write)
3479 {
3480 Coroutine *co;
3481 BlockDriverAIOCBCoroutine *acb;
3482
3483 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3484 acb->req.sector = sector_num;
3485 acb->req.nb_sectors = nb_sectors;
3486 acb->req.qiov = qiov;
3487 acb->is_write = is_write;
3488
3489 co = qemu_coroutine_create(bdrv_co_do_rw);
3490 qemu_coroutine_enter(co, acb);
3491
3492 return &acb->common;
3493 }
3494
3495 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
3496 {
3497 BlockDriverAIOCBCoroutine *acb = opaque;
3498 BlockDriverState *bs = acb->common.bs;
3499
3500 acb->req.error = bdrv_co_flush(bs);
3501 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3502 qemu_bh_schedule(acb->bh);
3503 }
3504
3505 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
3506 BlockDriverCompletionFunc *cb, void *opaque)
3507 {
3508 trace_bdrv_aio_flush(bs, opaque);
3509
3510 Coroutine *co;
3511 BlockDriverAIOCBCoroutine *acb;
3512
3513 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3514 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
3515 qemu_coroutine_enter(co, acb);
3516
3517 return &acb->common;
3518 }
3519
3520 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
3521 {
3522 BlockDriverAIOCBCoroutine *acb = opaque;
3523 BlockDriverState *bs = acb->common.bs;
3524
3525 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
3526 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3527 qemu_bh_schedule(acb->bh);
3528 }
3529
3530 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
3531 int64_t sector_num, int nb_sectors,
3532 BlockDriverCompletionFunc *cb, void *opaque)
3533 {
3534 Coroutine *co;
3535 BlockDriverAIOCBCoroutine *acb;
3536
3537 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
3538
3539 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3540 acb->req.sector = sector_num;
3541 acb->req.nb_sectors = nb_sectors;
3542 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
3543 qemu_coroutine_enter(co, acb);
3544
3545 return &acb->common;
3546 }
3547
3548 void bdrv_init(void)
3549 {
3550 module_call_init(MODULE_INIT_BLOCK);
3551 }
3552
3553 void bdrv_init_with_whitelist(void)
3554 {
3555 use_bdrv_whitelist = 1;
3556 bdrv_init();
3557 }
3558
3559 void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
3560 BlockDriverCompletionFunc *cb, void *opaque)
3561 {
3562 BlockDriverAIOCB *acb;
3563
3564 if (pool->free_aiocb) {
3565 acb = pool->free_aiocb;
3566 pool->free_aiocb = acb->next;
3567 } else {
3568 acb = g_malloc0(pool->aiocb_size);
3569 acb->pool = pool;
3570 }
3571 acb->bs = bs;
3572 acb->cb = cb;
3573 acb->opaque = opaque;
3574 return acb;
3575 }
3576
3577 void qemu_aio_release(void *p)
3578 {
3579 BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
3580 AIOPool *pool = acb->pool;
3581 acb->next = pool->free_aiocb;
3582 pool->free_aiocb = acb;
3583 }
3584
3585 /**************************************************************/
3586 /* Coroutine block device emulation */
3587
3588 typedef struct CoroutineIOCompletion {
3589 Coroutine *coroutine;
3590 int ret;
3591 } CoroutineIOCompletion;
3592
3593 static void bdrv_co_io_em_complete(void *opaque, int ret)
3594 {
3595 CoroutineIOCompletion *co = opaque;
3596
3597 co->ret = ret;
3598 qemu_coroutine_enter(co->coroutine, NULL);
3599 }
3600
3601 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
3602 int nb_sectors, QEMUIOVector *iov,
3603 bool is_write)
3604 {
3605 CoroutineIOCompletion co = {
3606 .coroutine = qemu_coroutine_self(),
3607 };
3608 BlockDriverAIOCB *acb;
3609
3610 if (is_write) {
3611 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
3612 bdrv_co_io_em_complete, &co);
3613 } else {
3614 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
3615 bdrv_co_io_em_complete, &co);
3616 }
3617
3618 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
3619 if (!acb) {
3620 return -EIO;
3621 }
3622 qemu_coroutine_yield();
3623
3624 return co.ret;
3625 }
3626
3627 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
3628 int64_t sector_num, int nb_sectors,
3629 QEMUIOVector *iov)
3630 {
3631 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
3632 }
3633
3634 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
3635 int64_t sector_num, int nb_sectors,
3636 QEMUIOVector *iov)
3637 {
3638 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
3639 }
3640
3641 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
3642 {
3643 RwCo *rwco = opaque;
3644
3645 rwco->ret = bdrv_co_flush(rwco->bs);
3646 }
3647
3648 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
3649 {
3650 int ret;
3651
3652 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
3653 return 0;
3654 }
3655
3656 /* Write back cached data to the OS even with cache=unsafe */
3657 if (bs->drv->bdrv_co_flush_to_os) {
3658 ret = bs->drv->bdrv_co_flush_to_os(bs);
3659 if (ret < 0) {
3660 return ret;
3661 }
3662 }
3663
3664 /* But don't actually force it to the disk with cache=unsafe */
3665 if (bs->open_flags & BDRV_O_NO_FLUSH) {
3666 return 0;
3667 }
3668
3669 if (bs->drv->bdrv_co_flush_to_disk) {
3670 ret = bs->drv->bdrv_co_flush_to_disk(bs);
3671 } else if (bs->drv->bdrv_aio_flush) {
3672 BlockDriverAIOCB *acb;
3673 CoroutineIOCompletion co = {
3674 .coroutine = qemu_coroutine_self(),
3675 };
3676
3677 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
3678 if (acb == NULL) {
3679 ret = -EIO;
3680 } else {
3681 qemu_coroutine_yield();
3682 ret = co.ret;
3683 }
3684 } else {
3685 /*
3686 * Some block drivers always operate in either writethrough or unsafe
3687 * mode and therefore don't support bdrv_flush. Usually qemu doesn't
3688 * know how the server works (because the behaviour is hardcoded or
3689 * depends on server-side configuration), so we can't ensure that
3690 * everything is safe on disk. Returning an error doesn't work because
3691 * that would break guests even if the server operates in writethrough
3692 * mode.
3693 *
3694 * Let's hope the user knows what he's doing.
3695 */
3696 ret = 0;
3697 }
3698 if (ret < 0) {
3699 return ret;
3700 }
3701
3702 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
3703 * in the case of cache=unsafe, so there are no useless flushes.
3704 */
3705 return bdrv_co_flush(bs->file);
3706 }
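
/* A sketch of the resulting cascade, assuming a qcow2 image over a raw
 * file: bdrv_co_flush(bs) first lets the format driver write back cached
 * metadata (bdrv_co_flush_to_os), then forces it to the disk
 * (bdrv_co_flush_to_disk or the fallbacks above), and finally recurses
 * into bs->file so the protocol driver syncs the host file as well.
 */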
3707
3708 void bdrv_invalidate_cache(BlockDriverState *bs)
3709 {
3710 if (bs->drv && bs->drv->bdrv_invalidate_cache) {
3711 bs->drv->bdrv_invalidate_cache(bs);
3712 }
3713 }
3714
3715 void bdrv_invalidate_cache_all(void)
3716 {
3717 BlockDriverState *bs;
3718
3719 QTAILQ_FOREACH(bs, &bdrv_states, list) {
3720 bdrv_invalidate_cache(bs);
3721 }
3722 }
3723
3724 void bdrv_clear_incoming_migration_all(void)
3725 {
3726 BlockDriverState *bs;
3727
3728 QTAILQ_FOREACH(bs, &bdrv_states, list) {
3729 bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
3730 }
3731 }
3732
3733 int bdrv_flush(BlockDriverState *bs)
3734 {
3735 Coroutine *co;
3736 RwCo rwco = {
3737 .bs = bs,
3738 .ret = NOT_DONE,
3739 };
3740
3741 if (qemu_in_coroutine()) {
3742 /* Fast-path if already in coroutine context */
3743 bdrv_flush_co_entry(&rwco);
3744 } else {
3745 co = qemu_coroutine_create(bdrv_flush_co_entry);
3746 qemu_coroutine_enter(co, &rwco);
3747 while (rwco.ret == NOT_DONE) {
3748 qemu_aio_wait();
3749 }
3750 }
3751
3752 return rwco.ret;
3753 }
3754
3755 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
3756 {
3757 RwCo *rwco = opaque;
3758
3759 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
3760 }
3761
3762 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
3763 int nb_sectors)
3764 {
3765 if (!bs->drv) {
3766 return -ENOMEDIUM;
3767 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
3768 return -EIO;
3769 } else if (bs->read_only) {
3770 return -EROFS;
3771 } else if (bs->drv->bdrv_co_discard) {
3772 return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
3773 } else if (bs->drv->bdrv_aio_discard) {
3774 BlockDriverAIOCB *acb;
3775 CoroutineIOCompletion co = {
3776 .coroutine = qemu_coroutine_self(),
3777 };
3778
3779 acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
3780 bdrv_co_io_em_complete, &co);
3781 if (acb == NULL) {
3782 return -EIO;
3783 } else {
3784 qemu_coroutine_yield();
3785 return co.ret;
3786 }
3787 } else {
3788 return 0;
3789 }
3790 }
3791
3792 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
3793 {
3794 Coroutine *co;
3795 RwCo rwco = {
3796 .bs = bs,
3797 .sector_num = sector_num,
3798 .nb_sectors = nb_sectors,
3799 .ret = NOT_DONE,
3800 };
3801
3802 if (qemu_in_coroutine()) {
3803 /* Fast-path if already in coroutine context */
3804 bdrv_discard_co_entry(&rwco);
3805 } else {
3806 co = qemu_coroutine_create(bdrv_discard_co_entry);
3807 qemu_coroutine_enter(co, &rwco);
3808 while (rwco.ret == NOT_DONE) {
3809 qemu_aio_wait();
3810 }
3811 }
3812
3813 return rwco.ret;
3814 }
3815
3816 /**************************************************************/
3817 /* removable device support */
3818
3819 /**
3820 * Return TRUE if the media is present
3821 */
3822 int bdrv_is_inserted(BlockDriverState *bs)
3823 {
3824 BlockDriver *drv = bs->drv;
3825
3826 if (!drv)
3827 return 0;
3828 if (!drv->bdrv_is_inserted)
3829 return 1;
3830 return drv->bdrv_is_inserted(bs);
3831 }
3832
3833 /**
3834 * Return whether the media changed since the last call to this
3835 * function, or -ENOTSUP if we don't know. Most drivers don't know.
3836 */
3837 int bdrv_media_changed(BlockDriverState *bs)
3838 {
3839 BlockDriver *drv = bs->drv;
3840
3841 if (drv && drv->bdrv_media_changed) {
3842 return drv->bdrv_media_changed(bs);
3843 }
3844 return -ENOTSUP;
3845 }
3846
3847 /**
3848 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
3849 */
3850 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
3851 {
3852 BlockDriver *drv = bs->drv;
3853
3854 if (drv && drv->bdrv_eject) {
3855 drv->bdrv_eject(bs, eject_flag);
3856 }
3857
3858 if (bs->device_name[0] != '\0') {
3859 bdrv_emit_qmp_eject_event(bs, eject_flag);
3860 }
3861 }
3862
3863 /**
3864 * Lock or unlock the media (if it is locked, the user won't be able
3865 * to eject it manually).
3866 */
3867 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
3868 {
3869 BlockDriver *drv = bs->drv;
3870
3871 trace_bdrv_lock_medium(bs, locked);
3872
3873 if (drv && drv->bdrv_lock_medium) {
3874 drv->bdrv_lock_medium(bs, locked);
3875 }
3876 }
3877
3878 /* needed for generic scsi interface */
3879
3880 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
3881 {
3882 BlockDriver *drv = bs->drv;
3883
3884 if (drv && drv->bdrv_ioctl)
3885 return drv->bdrv_ioctl(bs, req, buf);
3886 return -ENOTSUP;
3887 }
3888
3889 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
3890 unsigned long int req, void *buf,
3891 BlockDriverCompletionFunc *cb, void *opaque)
3892 {
3893 BlockDriver *drv = bs->drv;
3894
3895 if (drv && drv->bdrv_aio_ioctl)
3896 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
3897 return NULL;
3898 }
3899
3900 void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
3901 {
3902 bs->buffer_alignment = align;
3903 }
3904
3905 void *qemu_blockalign(BlockDriverState *bs, size_t size)
3906 {
3907 return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
3908 }
3909
3910 void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
3911 {
3912 int64_t bitmap_size;
3913
3914 bs->dirty_count = 0;
3915 if (enable) {
3916 if (!bs->dirty_bitmap) {
3917 bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
3918 BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG - 1;
3919 bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG;
3920
3921 bs->dirty_bitmap = g_new0(unsigned long, bitmap_size);
3922 }
3923 } else {
3924 if (bs->dirty_bitmap) {
3925 g_free(bs->dirty_bitmap);
3926 bs->dirty_bitmap = NULL;
3927 }
3928 }
3929 }
3930
3931 int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
3932 {
3933 int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
3934
3935 if (bs->dirty_bitmap &&
3936 (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
3937 return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
3938 (1UL << (chunk % (sizeof(unsigned long) * 8))));
3939 } else {
3940 return 0;
3941 }
3942 }
3943
3944 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
3945 int nr_sectors)
3946 {
3947 set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
3948 }
3949
3950 int64_t bdrv_get_dirty_count(BlockDriverState *bs)
3951 {
3952 return bs->dirty_count;
3953 }
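
/* A minimal dirty-tracking sketch (illustrative only; "bs" is a
 * hypothetical open BlockDriverState).  This is the kind of pattern a
 * dirty-bitmap client such as block migration can use: enable tracking,
 * then repeatedly scan for dirty chunks and reset them once copied.
 */
#if 0
static void example_sync_dirty_chunks(BlockDriverState *bs)
{
    int64_t sector;
    int64_t total = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;

    bdrv_set_dirty_tracking(bs, 1);
    /* ... guest writes happen; the write path marks chunks dirty ... */
    for (sector = 0; sector < total;
         sector += BDRV_SECTORS_PER_DIRTY_CHUNK) {
        if (bdrv_get_dirty(bs, sector)) {
            /* copy this chunk out, then clear its dirty bit */
            bdrv_reset_dirty(bs, sector, BDRV_SECTORS_PER_DIRTY_CHUNK);
        }
    }
    bdrv_set_dirty_tracking(bs, 0);
}
#endif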
3954
3955 void bdrv_set_in_use(BlockDriverState *bs, int in_use)
3956 {
3957 assert(bs->in_use != in_use);
3958 bs->in_use = in_use;
3959 }
3960
3961 int bdrv_in_use(BlockDriverState *bs)
3962 {
3963 return bs->in_use;
3964 }
3965
3966 void bdrv_iostatus_enable(BlockDriverState *bs)
3967 {
3968 bs->iostatus_enabled = true;
3969 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
3970 }
3971
3972 /* The I/O status is only enabled if the drive explicitly
3973 * enables it _and_ the VM is configured to stop on errors */
3974 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
3975 {
3976 return (bs->iostatus_enabled &&
3977 (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
3978 bs->on_write_error == BLOCK_ERR_STOP_ANY ||
3979 bs->on_read_error == BLOCK_ERR_STOP_ANY));
3980 }
3981
3982 void bdrv_iostatus_disable(BlockDriverState *bs)
3983 {
3984 bs->iostatus_enabled = false;
3985 }
3986
3987 void bdrv_iostatus_reset(BlockDriverState *bs)
3988 {
3989 if (bdrv_iostatus_is_enabled(bs)) {
3990 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
3991 }
3992 }
3993
3994 /* XXX: Today this is set by device models because it makes the implementation
3995 quite simple. However, the block layer knows about the error, so it's
3996 possible to implement this without device models being involved */
3997 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
3998 {
3999 if (bdrv_iostatus_is_enabled(bs) &&
4000 bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
4001 assert(error >= 0);
4002 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
4003 BLOCK_DEVICE_IO_STATUS_FAILED;
4004 }
4005 }
4006
4007 void
4008 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
4009 enum BlockAcctType type)
4010 {
4011 assert(type < BDRV_MAX_IOTYPE);
4012
4013 cookie->bytes = bytes;
4014 cookie->start_time_ns = get_clock();
4015 cookie->type = type;
4016 }
4017
4018 void
4019 bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
4020 {
4021 assert(cookie->type < BDRV_MAX_IOTYPE);
4022
4023 bs->nr_bytes[cookie->type] += cookie->bytes;
4024 bs->nr_ops[cookie->type]++;
4025 bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
4026 }
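
/* A minimal accounting sketch (illustrative only; "bs" and "buf" are
 * hypothetical): wrap an I/O operation in a cookie so the byte, op and
 * latency counters reported via query-blockstats are updated.
 */
#if 0
static int example_timed_read(BlockDriverState *bs, uint8_t *buf)
{
    BlockAcctCookie cookie;
    int ret;

    bdrv_acct_start(bs, &cookie, 4096, BDRV_ACCT_READ);
    ret = bdrv_read(bs, 0, buf, 4096 >> BDRV_SECTOR_BITS);
    bdrv_acct_done(bs, &cookie);
    return ret;
}
#endif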
4027
4028 int bdrv_img_create(const char *filename, const char *fmt,
4029 const char *base_filename, const char *base_fmt,
4030 char *options, uint64_t img_size, int flags)
4031 {
4032 QEMUOptionParameter *param = NULL, *create_options = NULL;
4033 QEMUOptionParameter *backing_fmt, *backing_file, *size;
4034 BlockDriverState *bs = NULL;
4035 BlockDriver *drv, *proto_drv;
4036 BlockDriver *backing_drv = NULL;
4037 int ret = 0;
4038
4039 /* Find driver and parse its options */
4040 drv = bdrv_find_format(fmt);
4041 if (!drv) {
4042 error_report("Unknown file format '%s'", fmt);
4043 ret = -EINVAL;
4044 goto out;
4045 }
4046
4047 proto_drv = bdrv_find_protocol(filename);
4048 if (!proto_drv) {
4049 error_report("Unknown protocol '%s'", filename);
4050 ret = -EINVAL;
4051 goto out;
4052 }
4053
4054 create_options = append_option_parameters(create_options,
4055 drv->create_options);
4056 create_options = append_option_parameters(create_options,
4057 proto_drv->create_options);
4058
4059 /* Create parameter list with default values */
4060 param = parse_option_parameters("", create_options, param);
4061
4062 set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
4063
4064 /* Parse -o options */
4065 if (options) {
4066 param = parse_option_parameters(options, create_options, param);
4067 if (param == NULL) {
4068 error_report("Invalid options for file format '%s'.", fmt);
4069 ret = -EINVAL;
4070 goto out;
4071 }
4072 }
4073
4074 if (base_filename) {
4075 if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
4076 base_filename)) {
4077 error_report("Backing file not supported for file format '%s'",
4078 fmt);
4079 ret = -EINVAL;
4080 goto out;
4081 }
4082 }
4083
4084 if (base_fmt) {
4085 if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
4086 error_report("Backing file format not supported for file "
4087 "format '%s'", fmt);
4088 ret = -EINVAL;
4089 goto out;
4090 }
4091 }
4092
4093 backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
4094 if (backing_file && backing_file->value.s) {
4095 if (!strcmp(filename, backing_file->value.s)) {
4096 error_report("Error: Trying to create an image with the "
4097 "same filename as the backing file");
4098 ret = -EINVAL;
4099 goto out;
4100 }
4101 }
4102
4103 backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
4104 if (backing_fmt && backing_fmt->value.s) {
4105 backing_drv = bdrv_find_format(backing_fmt->value.s);
4106 if (!backing_drv) {
4107 error_report("Unknown backing file format '%s'",
4108 backing_fmt->value.s);
4109 ret = -EINVAL;
4110 goto out;
4111 }
4112 }
4113
4114 /* The size for the image must always be specified, with one exception:
4115  * if we are using a backing file, we can obtain the size from there. */
4116 size = get_option_parameter(param, BLOCK_OPT_SIZE);
4117 if (size && size->value.n == -1) {
4118 if (backing_file && backing_file->value.s) {
4119 uint64_t backing_size; /* renamed from 'size' to avoid shadowing the option above */
4120 char buf[32];
4121 int back_flags;
4122
4123 /* backing files always opened read-only */
4124 back_flags =
4125 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
4126
4127 bs = bdrv_new("");
4128
4129 ret = bdrv_open(bs, backing_file->value.s, back_flags, backing_drv);
4130 if (ret < 0) {
4131 error_report("Could not open '%s'", backing_file->value.s);
4132 goto out;
4133 }
4134 bdrv_get_geometry(bs, &backing_size);
4135 backing_size *= 512; /* bdrv_get_geometry() counts 512-byte sectors */
4136 
4137 snprintf(buf, sizeof(buf), "%" PRIu64, backing_size);
4138 set_option_parameter(param, BLOCK_OPT_SIZE, buf);
4139 } else {
4140 error_report("Image creation needs a size parameter");
4141 ret = -EINVAL;
4142 goto out;
4143 }
4144 }
4145
4146 printf("Formatting '%s', fmt=%s ", filename, fmt);
4147 print_option_parameters(param);
4148 puts("");
4149
4150 ret = bdrv_create(drv, filename, param);
4151
4152 if (ret < 0) {
4153 if (ret == -ENOTSUP) {
4154 error_report("Formatting or formatting option not supported for "
4155 "file format '%s'", fmt);
4156 } else if (ret == -EFBIG) {
4157 error_report("The image size is too large for file format '%s'",
4158 fmt);
4159 } else {
4160 error_report("%s: error while creating %s: %s", filename, fmt,
4161 strerror(-ret));
4162 }
4163 }
4164
4165 out:
4166 free_option_parameters(create_options);
4167 free_option_parameters(param);
4168
4169 if (bs) {
4170 bdrv_delete(bs);
4171 }
4172
4173 return ret;
4174 }
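/*
 * Editor's sketch, not part of the original file: roughly how a caller such
 * as qemu-img would invoke bdrv_img_create() above.  The filenames are
 * illustrative.
 */
#if 0
static int example_create_overlay(void)
{
    /* passing -1 as img_size requests "inherit the size from the backing
     * file", exercising the branch above that opens the backing image */
    return bdrv_img_create("overlay.qcow2", "qcow2",
                           "base.qcow2", NULL,   /* backing file, no -F */
                           NULL,                 /* no extra -o options */
                           (uint64_t)-1, 0);
}
#endif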
4175
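/* Allocate and attach a new job of the given type to bs.  Fails with
 * DeviceInUse if the device already has a job or another exclusive user;
 * returns NULL (with errp set) on any failure. */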
4176 void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs,
4177 int64_t speed, BlockDriverCompletionFunc *cb,
4178 void *opaque, Error **errp)
4179 {
4180 BlockJob *job;
4181
4182 if (bs->job || bdrv_in_use(bs)) {
4183 error_set(errp, QERR_DEVICE_IN_USE, bdrv_get_device_name(bs));
4184 return NULL;
4185 }
4186 bdrv_set_in_use(bs, 1);
4187
4188 job = g_malloc0(job_type->instance_size);
4189 job->job_type = job_type;
4190 job->bs = bs;
4191 job->cb = cb;
4192 job->opaque = opaque;
4193 bs->job = job;
4194
4195 /* Only call set_speed when a limit was requested; job types without a set_speed callback would fail with NotSupported */
4196 if (speed != 0) {
4197 Error *local_err = NULL;
4198
4199 block_job_set_speed(job, speed, &local_err);
4200 if (error_is_set(&local_err)) {
4201 bs->job = NULL;
4202 g_free(job);
4203 bdrv_set_in_use(bs, 0);
4204 error_propagate(errp, local_err);
4205 return NULL;
4206 }
4207 }
4208 return job;
4209 }
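/*
 * Editor's sketch, not part of the original file: the shape of a concrete
 * job built on the helpers above, modelled loosely on the image-streaming
 * job.  ExampleJob, example_job_type and example_run are illustrative names.
 */
#if 0
typedef struct ExampleJob {
    BlockJob common;    /* must be first: block_job_create() allocates
                         * instance_size bytes and initializes this header */
} ExampleJob;

static const BlockJobType example_job_type = {
    .instance_size = sizeof(ExampleJob),
    .job_type      = "example",
    /* no set_speed callback: block_job_set_speed() reports NotSupported */
};

static void example_run(BlockDriverState *bs, BlockDriverCompletionFunc *cb,
                        void *opaque, Error **errp)
{
    ExampleJob *s = block_job_create(&example_job_type, bs, 0, cb, opaque,
                                     errp);
    if (!s) {
        return; /* bs is busy; errp carries DeviceInUse */
    }
    /* a real job now starts a coroutine that polls
     * block_job_is_cancelled(&s->common) and eventually calls
     * block_job_complete(&s->common, ret) */
}
#endif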
4210
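/* Finish a job: deliver ret to its completion callback, free it, and give
 * up the exclusive use of the device taken in block_job_create(). */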
4211 void block_job_complete(BlockJob *job, int ret)
4212 {
4213 BlockDriverState *bs = job->bs;
4214
4215 assert(bs->job == job);
4216 job->cb(job->opaque, ret);
4217 bs->job = NULL;
4218 g_free(job);
4219 bdrv_set_in_use(bs, 0);
4220 }
4221
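/* Adjust the job's speed limit; fails with NotSupported when the job type
 * provides no set_speed callback. */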
4222 void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
4223 {
4224 Error *local_err = NULL;
4225
4226 if (!job->job_type->set_speed) {
4227 error_set(errp, QERR_NOT_SUPPORTED);
4228 return;
4229 }
4230 job->job_type->set_speed(job, speed, &local_err);
4231 if (error_is_set(&local_err)) {
4232 error_propagate(errp, local_err);
4233 return;
4234 }
4235
4236 job->speed = speed;
4237 }
4238
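/* Request cancellation; the job itself must notice the flag via
 * block_job_is_cancelled() and then finish through block_job_complete(). */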
4239 void block_job_cancel(BlockJob *job)
4240 {
4241 job->cancelled = true;
4242 }
4243
4244 bool block_job_is_cancelled(BlockJob *job)
4245 {
4246 return job->cancelled;
4247 }
4248
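/* Cancel the job, then service AIO completions while it is still attached
 * to bs and busy. */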
4249 void block_job_cancel_sync(BlockJob *job)
4250 {
4251 BlockDriverState *bs = job->bs;
4252
4253 assert(bs->job == job);
4254 block_job_cancel(job);
4255 while (bs->job != NULL && bs->job->busy) {
4256 qemu_aio_wait();
4257 }
4258 }
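/*
 * Editor's sketch, not part of the original file: tearing down any active
 * job before a device goes away, using the synchronous cancel above.  The
 * function name is illustrative.
 */
#if 0
static void example_detach_device(BlockDriverState *bs)
{
    if (bs->job) {
        /* dispatches AIO callbacks while the job is still busy */
        block_job_cancel_sync(bs->job);
    }
}
#endif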