]>
Commit | Line | Data |
---|---|---|
fe0a9b74 JR |
1 | /* |
2 | * linux/fs/nfs/blocklayout/blocklayoutdev.c | |
3 | * | |
4 | * Device operations for the pnfs nfs4 file layout driver. | |
5 | * | |
6 | * Copyright (c) 2006 The Regents of the University of Michigan. | |
7 | * All rights reserved. | |
8 | * | |
9 | * Andy Adamson <andros@citi.umich.edu> | |
10 | * Fred Isaman <iisaman@umich.edu> | |
11 | * | |
12 | * permission is granted to use, copy, create derivative works and | |
13 | * redistribute this software and such derivative works for any purpose, | |
14 | * so long as the name of the university of michigan is not used in | |
15 | * any advertising or publicity pertaining to the use or distribution | |
16 | * of this software without specific, written prior authorization. if | |
17 | * the above copyright notice or any other identification of the | |
18 | * university of michigan is included in any copy of any portion of | |
19 | * this software, then the disclaimer below must also be included. | |
20 | * | |
21 | * this software is provided as is, without representation from the | |
22 | * university of michigan as to its fitness for any purpose, and without | |
23 | * warranty by the university of michigan of any kind, either express | |
24 | * or implied, including without limitation the implied warranties of | |
25 | * merchantability and fitness for a particular purpose. the regents | |
26 | * of the university of michigan shall not be liable for any damages, | |
27 | * including special, indirect, incidental, or consequential damages, | |
28 | * with respect to any claim arising out or in connection with the use | |
29 | * of the software, even if it has been or is hereafter advised of the | |
30 | * possibility of such damages. | |
31 | */ | |
32 | #include <linux/module.h> | |
33 | #include <linux/buffer_head.h> /* __bread */ | |
34 | ||
35 | #include <linux/genhd.h> | |
36 | #include <linux/blkdev.h> | |
37 | #include <linux/hash.h> | |
38 | ||
39 | #include "blocklayout.h" | |
40 | ||
41 | #define NFSDBG_FACILITY NFSDBG_PNFS_LD | |
42 | ||
e9437cce FI |
43 | static int decode_sector_number(__be32 **rp, sector_t *sp) |
44 | { | |
45 | uint64_t s; | |
46 | ||
47 | *rp = xdr_decode_hyper(*rp, &s); | |
48 | if (s & 0x1ff) { | |
a030889a | 49 | printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__); |
e9437cce FI |
50 | return -1; |
51 | } | |
52 | *sp = s >> SECTOR_SHIFT; | |
53 | return 0; | |
54 | } | |
55 | ||
fe0a9b74 JR |
56 | /* |
57 | * Release the block device | |
58 | */ | |
4385bab1 | 59 | void nfs4_blkdev_put(struct block_device *bdev) |
fe0a9b74 JR |
60 | { |
61 | dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev), | |
62 | MINOR(bdev->bd_dev)); | |
4385bab1 | 63 | blkdev_put(bdev, FMODE_READ); |
fe0a9b74 JR |
64 | } |
65 | ||
fe0a9b74 JR |
66 | ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, |
67 | size_t mlen) | |
68 | { | |
cb9c1c4a SK |
69 | struct nfs_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info, |
70 | nfs_net_id); | |
71 | ||
fe0a9b74 JR |
72 | if (mlen != sizeof (struct bl_dev_msg)) |
73 | return -EINVAL; | |
74 | ||
cb9c1c4a | 75 | if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0) |
fe0a9b74 JR |
76 | return -EFAULT; |
77 | ||
5ffaf855 | 78 | wake_up(&nn->bl_wq); |
fe0a9b74 JR |
79 | |
80 | return mlen; | |
81 | } | |
82 | ||
83 | void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg) | |
84 | { | |
5ffaf855 SK |
85 | struct bl_pipe_msg *bl_pipe_msg = container_of(msg, struct bl_pipe_msg, msg); |
86 | ||
fe0a9b74 JR |
87 | if (msg->errno >= 0) |
88 | return; | |
5ffaf855 | 89 | wake_up(bl_pipe_msg->bl_wq); |
fe0a9b74 JR |
90 | } |
91 | ||
92 | /* | |
93 | * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf. | |
94 | */ | |
95 | struct pnfs_block_dev * | |
96 | nfs4_blk_decode_device(struct nfs_server *server, | |
2f9fd182 | 97 | struct pnfs_device *dev) |
fe0a9b74 | 98 | { |
516f2e24 | 99 | struct pnfs_block_dev *rv; |
fe0a9b74 | 100 | struct block_device *bd = NULL; |
5ffaf855 SK |
101 | struct bl_pipe_msg bl_pipe_msg; |
102 | struct rpc_pipe_msg *msg = &bl_pipe_msg.msg; | |
fe0a9b74 JR |
103 | struct bl_msg_hdr bl_msg = { |
104 | .type = BL_DEVICE_MOUNT, | |
105 | .totallen = dev->mincount, | |
106 | }; | |
107 | uint8_t *dataptr; | |
108 | DECLARE_WAITQUEUE(wq, current); | |
516f2e24 | 109 | int offset, len, i, rc; |
73ea666c | 110 | struct net *net = server->nfs_client->cl_net; |
9e2e74db | 111 | struct nfs_net *nn = net_generic(net, nfs_net_id); |
cb9c1c4a | 112 | struct bl_dev_msg *reply = &nn->bl_mount_reply; |
fe0a9b74 JR |
113 | |
114 | dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); | |
115 | dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data, | |
116 | dev->mincount); | |
117 | ||
5ffaf855 SK |
118 | bl_pipe_msg.bl_wq = &nn->bl_wq; |
119 | memset(msg, 0, sizeof(*msg)); | |
120 | msg->data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS); | |
121 | if (!msg->data) { | |
fe0a9b74 JR |
122 | rv = ERR_PTR(-ENOMEM); |
123 | goto out; | |
124 | } | |
125 | ||
5ffaf855 SK |
126 | memcpy(msg->data, &bl_msg, sizeof(bl_msg)); |
127 | dataptr = (uint8_t *) msg->data; | |
2f9fd182 FI |
128 | len = dev->mincount; |
129 | offset = sizeof(bl_msg); | |
130 | for (i = 0; len > 0; i++) { | |
131 | memcpy(&dataptr[offset], page_address(dev->pages[i]), | |
132 | len < PAGE_CACHE_SIZE ? len : PAGE_CACHE_SIZE); | |
133 | len -= PAGE_CACHE_SIZE; | |
134 | offset += PAGE_CACHE_SIZE; | |
135 | } | |
5ffaf855 | 136 | msg->len = sizeof(bl_msg) + dev->mincount; |
fe0a9b74 JR |
137 | |
138 | dprintk("%s CALLING USERSPACE DAEMON\n", __func__); | |
5ffaf855 SK |
139 | add_wait_queue(&nn->bl_wq, &wq); |
140 | rc = rpc_queue_upcall(nn->bl_device_pipe, msg); | |
516f2e24 | 141 | if (rc < 0) { |
5ffaf855 | 142 | remove_wait_queue(&nn->bl_wq, &wq); |
516f2e24 | 143 | rv = ERR_PTR(rc); |
fe0a9b74 JR |
144 | goto out; |
145 | } | |
146 | ||
147 | set_current_state(TASK_UNINTERRUPTIBLE); | |
148 | schedule(); | |
149 | __set_current_state(TASK_RUNNING); | |
5ffaf855 | 150 | remove_wait_queue(&nn->bl_wq, &wq); |
fe0a9b74 JR |
151 | |
152 | if (reply->status != BL_DEVICE_REQUEST_PROC) { | |
153 | dprintk("%s failed to open device: %d\n", | |
154 | __func__, reply->status); | |
155 | rv = ERR_PTR(-EINVAL); | |
156 | goto out; | |
157 | } | |
158 | ||
af283885 PT |
159 | bd = blkdev_get_by_dev(MKDEV(reply->major, reply->minor), |
160 | FMODE_READ, NULL); | |
fe0a9b74 | 161 | if (IS_ERR(bd)) { |
af283885 PT |
162 | dprintk("%s failed to open device : %ld\n", __func__, |
163 | PTR_ERR(bd)); | |
164 | rv = ERR_CAST(bd); | |
fe0a9b74 JR |
165 | goto out; |
166 | } | |
167 | ||
168 | rv = kzalloc(sizeof(*rv), GFP_NOFS); | |
169 | if (!rv) { | |
170 | rv = ERR_PTR(-ENOMEM); | |
171 | goto out; | |
172 | } | |
173 | ||
174 | rv->bm_mdev = bd; | |
175 | memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid)); | |
9e2e74db | 176 | rv->net = net; |
fe0a9b74 JR |
177 | dprintk("%s Created device %s with bd_block_size %u\n", |
178 | __func__, | |
179 | bd->bd_disk->disk_name, | |
180 | bd->bd_block_size); | |
181 | ||
182 | out: | |
5ffaf855 | 183 | kfree(msg->data); |
fe0a9b74 JR |
184 | return rv; |
185 | } | |
a60d2ebd | 186 | |
e9437cce FI |
187 | /* Map deviceid returned by the server to constructed block_device */ |
188 | static struct block_device *translate_devid(struct pnfs_layout_hdr *lo, | |
189 | struct nfs4_deviceid *id) | |
190 | { | |
191 | struct block_device *rv = NULL; | |
192 | struct block_mount_id *mid; | |
193 | struct pnfs_block_dev *dev; | |
194 | ||
195 | dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id); | |
196 | mid = BLK_ID(lo); | |
197 | spin_lock(&mid->bm_lock); | |
198 | list_for_each_entry(dev, &mid->bm_devlist, bm_node) { | |
199 | if (memcmp(id->data, dev->bm_mdevid.data, | |
200 | NFS4_DEVICEID4_SIZE) == 0) { | |
201 | rv = dev->bm_mdev; | |
202 | goto out; | |
203 | } | |
204 | } | |
205 | out: | |
206 | spin_unlock(&mid->bm_lock); | |
207 | dprintk("%s returning %p\n", __func__, rv); | |
208 | return rv; | |
209 | } | |
210 | ||
211 | /* Tracks info needed to ensure extents in layout obey constraints of spec */ | |
212 | struct layout_verification { | |
213 | u32 mode; /* R or RW */ | |
214 | u64 start; /* Expected start of next non-COW extent */ | |
215 | u64 inval; /* Start of INVAL coverage */ | |
216 | u64 cowread; /* End of COW read coverage */ | |
217 | }; | |
218 | ||
219 | /* Verify the extent meets the layout requirements of the pnfs-block draft, | |
220 | * section 2.3.1. | |
221 | */ | |
222 | static int verify_extent(struct pnfs_block_extent *be, | |
223 | struct layout_verification *lv) | |
224 | { | |
225 | if (lv->mode == IOMODE_READ) { | |
226 | if (be->be_state == PNFS_BLOCK_READWRITE_DATA || | |
227 | be->be_state == PNFS_BLOCK_INVALID_DATA) | |
228 | return -EIO; | |
229 | if (be->be_f_offset != lv->start) | |
230 | return -EIO; | |
231 | lv->start += be->be_length; | |
232 | return 0; | |
233 | } | |
234 | /* lv->mode == IOMODE_RW */ | |
235 | if (be->be_state == PNFS_BLOCK_READWRITE_DATA) { | |
236 | if (be->be_f_offset != lv->start) | |
237 | return -EIO; | |
238 | if (lv->cowread > lv->start) | |
239 | return -EIO; | |
240 | lv->start += be->be_length; | |
241 | lv->inval = lv->start; | |
242 | return 0; | |
243 | } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) { | |
244 | if (be->be_f_offset != lv->start) | |
245 | return -EIO; | |
246 | lv->start += be->be_length; | |
247 | return 0; | |
248 | } else if (be->be_state == PNFS_BLOCK_READ_DATA) { | |
249 | if (be->be_f_offset > lv->start) | |
250 | return -EIO; | |
251 | if (be->be_f_offset < lv->inval) | |
252 | return -EIO; | |
253 | if (be->be_f_offset < lv->cowread) | |
254 | return -EIO; | |
255 | /* It looks like you might want to min this with lv->start, | |
256 | * but you really don't. | |
257 | */ | |
258 | lv->inval = lv->inval + be->be_length; | |
259 | lv->cowread = be->be_f_offset + be->be_length; | |
260 | return 0; | |
261 | } else | |
262 | return -EIO; | |
263 | } | |
264 | ||
265 | /* XDR decode pnfs_block_layout4 structure */ | |
a60d2ebd FI |
266 | int |
267 | nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, | |
268 | struct nfs4_layoutget_res *lgr, gfp_t gfp_flags) | |
269 | { | |
e9437cce FI |
270 | struct pnfs_block_layout *bl = BLK_LO2EXT(lo); |
271 | int i, status = -EIO; | |
272 | uint32_t count; | |
273 | struct pnfs_block_extent *be = NULL, *save; | |
274 | struct xdr_stream stream; | |
275 | struct xdr_buf buf; | |
276 | struct page *scratch; | |
277 | __be32 *p; | |
278 | struct layout_verification lv = { | |
279 | .mode = lgr->range.iomode, | |
280 | .start = lgr->range.offset >> SECTOR_SHIFT, | |
281 | .inval = lgr->range.offset >> SECTOR_SHIFT, | |
282 | .cowread = lgr->range.offset >> SECTOR_SHIFT, | |
283 | }; | |
284 | LIST_HEAD(extents); | |
285 | ||
286 | dprintk("---> %s\n", __func__); | |
287 | ||
288 | scratch = alloc_page(gfp_flags); | |
289 | if (!scratch) | |
290 | return -ENOMEM; | |
291 | ||
292 | xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); | |
293 | xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); | |
294 | ||
295 | p = xdr_inline_decode(&stream, 4); | |
296 | if (unlikely(!p)) | |
297 | goto out_err; | |
298 | ||
299 | count = be32_to_cpup(p++); | |
300 | ||
301 | dprintk("%s enter, number of extents %i\n", __func__, count); | |
302 | p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count); | |
303 | if (unlikely(!p)) | |
304 | goto out_err; | |
305 | ||
306 | /* Decode individual extents, putting them in temporary | |
307 | * staging area until whole layout is decoded to make error | |
308 | * recovery easier. | |
309 | */ | |
310 | for (i = 0; i < count; i++) { | |
311 | be = bl_alloc_extent(); | |
312 | if (!be) { | |
313 | status = -ENOMEM; | |
314 | goto out_err; | |
315 | } | |
316 | memcpy(&be->be_devid, p, NFS4_DEVICEID4_SIZE); | |
317 | p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); | |
318 | be->be_mdev = translate_devid(lo, &be->be_devid); | |
319 | if (!be->be_mdev) | |
320 | goto out_err; | |
321 | ||
322 | /* The next three values are read in as bytes, | |
323 | * but stored as 512-byte sector lengths | |
324 | */ | |
325 | if (decode_sector_number(&p, &be->be_f_offset) < 0) | |
326 | goto out_err; | |
327 | if (decode_sector_number(&p, &be->be_length) < 0) | |
328 | goto out_err; | |
329 | if (decode_sector_number(&p, &be->be_v_offset) < 0) | |
330 | goto out_err; | |
331 | be->be_state = be32_to_cpup(p++); | |
332 | if (be->be_state == PNFS_BLOCK_INVALID_DATA) | |
333 | be->be_inval = &bl->bl_inval; | |
334 | if (verify_extent(be, &lv)) { | |
335 | dprintk("%s verify failed\n", __func__); | |
336 | goto out_err; | |
337 | } | |
338 | list_add_tail(&be->be_node, &extents); | |
339 | } | |
340 | if (lgr->range.offset + lgr->range.length != | |
341 | lv.start << SECTOR_SHIFT) { | |
342 | dprintk("%s Final length mismatch\n", __func__); | |
343 | be = NULL; | |
344 | goto out_err; | |
345 | } | |
346 | if (lv.start < lv.cowread) { | |
347 | dprintk("%s Final uncovered COW extent\n", __func__); | |
348 | be = NULL; | |
349 | goto out_err; | |
350 | } | |
351 | /* Extents decoded properly, now try to merge them in to | |
352 | * existing layout extents. | |
353 | */ | |
354 | spin_lock(&bl->bl_ext_lock); | |
355 | list_for_each_entry_safe(be, save, &extents, be_node) { | |
356 | list_del(&be->be_node); | |
357 | status = bl_add_merge_extent(bl, be); | |
358 | if (status) { | |
359 | spin_unlock(&bl->bl_ext_lock); | |
360 | /* This is a fairly catastrophic error, as the | |
361 | * entire layout extent lists are now corrupted. | |
362 | * We should have some way to distinguish this. | |
363 | */ | |
364 | be = NULL; | |
365 | goto out_err; | |
366 | } | |
367 | } | |
368 | spin_unlock(&bl->bl_ext_lock); | |
369 | status = 0; | |
370 | out: | |
371 | __free_page(scratch); | |
372 | dprintk("%s returns %i\n", __func__, status); | |
373 | return status; | |
374 | ||
375 | out_err: | |
376 | bl_put_extent(be); | |
377 | while (!list_empty(&extents)) { | |
378 | be = list_first_entry(&extents, struct pnfs_block_extent, | |
379 | be_node); | |
380 | list_del(&be->be_node); | |
381 | bl_put_extent(be); | |
382 | } | |
383 | goto out; | |
a60d2ebd | 384 | } |