]>
Commit | Line | Data |
---|---|---|
3db00caa FG |
1 | From 9b9b09f452a469458451c221debfbab944e7f081 Mon Sep 17 00:00:00 2001 |
2 | From: Rob N <robn@despairlabs.com> | |
3 | Date: Wed, 29 Nov 2023 04:15:48 +1100 | |
4 | Subject: [PATCH] dnode_is_dirty: check dnode and its data for dirtiness | |
5 | MIME-Version: 1.0 | |
6 | Content-Type: text/plain; charset=UTF-8 | |
7 | Content-Transfer-Encoding: 8bit | |
8 | ||
9 | Over its history this the dirty dnode test has been changed between | |
10 | checking for a dnodes being on `os_dirty_dnodes` (`dn_dirty_link`) and | |
11 | `dn_dirty_record`. | |
12 | ||
13 | de198f2d9 Fix lseek(SEEK_DATA/SEEK_HOLE) mmap consistency | |
14 | 2531ce372 Revert "Report holes when there are only metadata changes" | |
15 | ec4f9b8f3 Report holes when there are only metadata changes | |
16 | 454365bba Fix dirty check in dmu_offset_next() | |
17 | 66aca2473 SEEK_HOLE should not block on txg_wait_synced() | |
18 | ||
19 | Also illumos/illumos-gate@c543ec060d illumos/illumos-gate@2bcf0248e9 | |
20 | ||
21 | It turns out both are actually required. | |
22 | ||
23 | In the case of appending data to a newly created file, the dnode proper | |
24 | is dirtied (at least to change the blocksize) and dirty records are | |
25 | added. Thus, a single logical operation is represented by separate | |
26 | dirty indicators, and must not be separated. | |
27 | ||
28 | The incorrect dirty check becomes a problem when the first block of a | |
29 | file is being appended to while another process is calling lseek to skip | |
30 | holes. There is a small window where the dnode part is undirtied while | |
31 | there are still dirty records. In this case, `lseek(fd, 0, SEEK_DATA)` | |
32 | would not know that the file is dirty, and would go to | |
33 | `dnode_next_offset()`. Since the object has no data blocks yet, it | |
34 | returns `ESRCH`, indicating no data found, which results in `ENXIO` | |
35 | being returned to `lseek()`'s caller. | |
36 | ||
37 | Since coreutils 9.2, `cp` performs sparse copies by default, that is, it | |
38 | uses `SEEK_DATA` and `SEEK_HOLE` against the source file and attempts to | |
39 | replicate the holes in the target. When it hits the bug, its initial | |
40 | search for data fails, and it goes on to call `fallocate()` to create a | |
41 | hole over the entire destination file. | |
42 | ||
43 | This has come up more recently as users upgrade their systems, getting | |
44 | OpenZFS 2.2 as well as a newer coreutils. However, this problem has been | |
45 | reproduced against 2.1, as well as on FreeBSD 13 and 14. | |
46 | ||
47 | This change simply updates the dirty check to check both types of dirty. | |
48 | If there's anything dirty at all, we immediately go to the "wait for | |
49 | sync" stage, It doesn't really matter after that; both changes are on | |
50 | disk, so the dirty fields should be correct. | |
51 | ||
52 | Sponsored-by: Klara, Inc. | |
53 | Sponsored-by: Wasabi Technology, Inc. | |
54 | Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> | |
55 | Reviewed-by: Alexander Motin <mav@FreeBSD.org> | |
56 | Reviewed-by: Rich Ercolani <rincebrain@gmail.com> | |
57 | Signed-off-by: Rob Norris <rob.norris@klarasystems.com> | |
58 | Closes #15571 | |
59 | Closes #15526 | |
60 | Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com> | |
61 | --- | |
62 | module/zfs/dnode.c | 12 ++++++++++-- | |
63 | 1 file changed, 10 insertions(+), 2 deletions(-) | |
64 | ||
65 | diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c | |
66 | index 7cf03264d..ad9988366 100644 | |
67 | --- a/module/zfs/dnode.c | |
68 | +++ b/module/zfs/dnode.c | |
69 | @@ -1764,7 +1764,14 @@ dnode_try_claim(objset_t *os, uint64_t object, int slots) | |
70 | } | |
71 | ||
72 | /* | |
73 | - * Checks if the dnode contains any uncommitted dirty records. | |
74 | + * Checks if the dnode itself is dirty, or is carrying any uncommitted records. | |
75 | + * It is important to check both conditions, as some operations (eg appending | |
76 | + * to a file) can dirty both as a single logical unit, but they are not synced | |
77 | + * out atomically, so checking one and not the other can result in an object | |
78 | + * appearing to be clean mid-way through a commit. | |
79 | + * | |
80 | + * Do not change this lightly! If you get it wrong, dmu_offset_next() can | |
81 | + * detect a hole where there is really data, leading to silent corruption. | |
82 | */ | |
83 | boolean_t | |
84 | dnode_is_dirty(dnode_t *dn) | |
85 | @@ -1772,7 +1779,8 @@ dnode_is_dirty(dnode_t *dn) | |
86 | mutex_enter(&dn->dn_mtx); | |
87 | ||
88 | for (int i = 0; i < TXG_SIZE; i++) { | |
89 | - if (multilist_link_active(&dn->dn_dirty_link[i])) { | |
90 | + if (multilist_link_active(&dn->dn_dirty_link[i]) || | |
91 | + !list_is_empty(&dn->dn_dirty_records[i])) { | |
92 | mutex_exit(&dn->dn_mtx); | |
93 | return (B_TRUE); | |
94 | } | |
95 | -- | |
96 | 2.39.2 | |
97 |