/*
 * Compressed RAM based swap device
 *
 * Copyright (C) 2008, 2009  Nitin Gupta
 *
 * This code is released using a dual license strategy: BSD/GPL
 * You can choose the license that better fits your requirements.
 *
 * Released under the terms of 3-clause BSD License
 * Released under the terms of GNU General Public License Version 2.0
 *
 * Project home: http://compcache.googlecode.com
 */

#define KMSG_COMPONENT "ramzswap"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/device.h>
#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/lzo.h>
#include <linux/string.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/vmalloc.h>

#include "ramzswap_drv.h"

/* Globals */
static int ramzswap_major;
static struct ramzswap *devices;

/*
 * Pages that compress to a size larger than this are forwarded to the
 * backing swap, if present, or stored uncompressed in memory otherwise.
 */
static unsigned int max_zpage_size;

/* Module params (documentation at end) */
static unsigned int num_devices;

static int rzs_test_flag(struct ramzswap *rzs, u32 index,
                        enum rzs_pageflags flag)
{
        return rzs->table[index].flags & BIT(flag);
}

static void rzs_set_flag(struct ramzswap *rzs, u32 index,
                        enum rzs_pageflags flag)
{
        rzs->table[index].flags |= BIT(flag);
}

static void rzs_clear_flag(struct ramzswap *rzs, u32 index,
                        enum rzs_pageflags flag)
{
        rzs->table[index].flags &= ~BIT(flag);
}

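/*
 * Scan a page word-by-word; zero-filled pages are detected here so
 * they can be stored without allocating any compressed-data memory.
 */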
static int page_zero_filled(void *ptr)
{
        unsigned int pos;
        unsigned long *page;

        page = (unsigned long *)ptr;

        for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) {
                if (page[pos])
                        return 0;
        }

        return 1;
}

/*
 * memlimit cannot be greater than backing disk size.
 */
static void ramzswap_set_memlimit(struct ramzswap *rzs, size_t totalram_bytes)
{
        int memlimit_valid = 1;

        if (!rzs->memlimit) {
                pr_info("Memory limit not set.\n");
                memlimit_valid = 0;
        }

        if (rzs->memlimit > rzs->disksize) {
                pr_info("Memory limit cannot be greater than "
                        "disksize: limit=%zu, disksize=%zu\n",
                        rzs->memlimit, rzs->disksize);
                memlimit_valid = 0;
        }

        if (!memlimit_valid) {
                size_t mempart, disksize;
                pr_info("Using default: smaller of (%u%% of RAM) and "
                        "(backing disk size).\n",
                        default_memlimit_perc_ram);
                mempart = default_memlimit_perc_ram * (totalram_bytes / 100);
                disksize = rzs->disksize;
                rzs->memlimit = mempart > disksize ? disksize : mempart;
        }

        if (rzs->memlimit > totalram_bytes / 2) {
                pr_info(
                "It is not advisable to set the limit to more than half the "
                "size of memory since we expect a 2:1 compression ratio. "
                "The limit represents the amount of *compressed* data we can "
                "keep in memory!\n"
                "\tMemory Size: %zu kB\n"
                "\tLimit you selected: %zu kB\n"
                "Continuing anyway ...\n",
                totalram_bytes >> 10, rzs->memlimit >> 10
                );
        }

        rzs->memlimit &= PAGE_MASK;
        BUG_ON(!rzs->memlimit);
}

static void ramzswap_set_disksize(struct ramzswap *rzs, size_t totalram_bytes)
{
        if (!rzs->disksize) {
                pr_info(
                "disk size not provided. You can use disksize_kb module "
                "param to specify size.\nUsing default: (%u%% of RAM).\n",
                default_disksize_perc_ram
                );
                rzs->disksize = default_disksize_perc_ram *
                                        (totalram_bytes / 100);
        }

        if (rzs->disksize > 2 * (totalram_bytes)) {
                pr_info(
                "There is little point creating a ramzswap of greater than "
                "twice the size of memory since we expect a 2:1 compression "
                "ratio. Note that ramzswap uses about 0.1%% of the size of "
                "the swap device when not in use so a huge ramzswap is "
                "wasteful.\n"
                "\tMemory Size: %zu kB\n"
                "\tSize you selected: %zu kB\n"
                "Continuing anyway ...\n",
                totalram_bytes >> 10, rzs->disksize >> 10
                );
        }

        rzs->disksize &= PAGE_MASK;
}

/*
 * Swap header (1st page of swap device) contains information
 * to identify it as a swap partition. Prepare such a header
 * for ramzswap device (ramzswap0) so that swapon can identify
 * it as a swap partition. In case a backing swap device is
 * provided, copy its swap header.
 */
static int setup_swap_header(struct ramzswap *rzs, union swap_header *s)
{
        int ret = 0;
        struct page *page;
        struct address_space *mapping;
        union swap_header *backing_swap_header;

        /*
         * There is no backing swap device. Create a swap header
         * that is acceptable by swapon.
         */
        if (!rzs->backing_swap) {
                s->info.version = 1;
                s->info.last_page = (rzs->disksize >> PAGE_SHIFT) - 1;
                s->info.nr_badpages = 0;
                memcpy(s->magic.magic, "SWAPSPACE2", 10);
                return 0;
        }

        /*
         * We have a backing swap device. Copy its swap header
         * to ramzswap device header. If this header contains
         * invalid information (backing device not a swap
         * partition, etc.), swapon will fail for ramzswap
         * which is correct behavior - we don't want to swap
         * over a filesystem partition!
         */

        /* Read the backing swap header (code from sys_swapon) */
        mapping = rzs->swap_file->f_mapping;
        if (!mapping->a_ops->readpage) {
                ret = -EINVAL;
                goto out;
        }

        page = read_mapping_page(mapping, 0, rzs->swap_file);
        if (IS_ERR(page)) {
                ret = PTR_ERR(page);
                goto out;
        }

        backing_swap_header = kmap(page);
        memcpy(s, backing_swap_header, sizeof(*s));
        if (s->info.nr_badpages) {
                pr_info("Cannot use backing swap with bad pages (%u)\n",
                        s->info.nr_badpages);
                ret = -EINVAL;
        }
        /*
         * ramzswap disksize equals number of usable pages in backing
         * swap. Set last_page in swap header to match this disksize
         * ('last_page' means 0-based index of last usable swap page).
         */
        s->info.last_page = (rzs->disksize >> PAGE_SHIFT) - 1;
        kunmap(page);

out:
        return ret;
}

void ramzswap_ioctl_get_stats(struct ramzswap *rzs,
                        struct ramzswap_ioctl_stats *s)
{
        strncpy(s->backing_swap_name, rzs->backing_swap_name,
                MAX_SWAP_NAME_LEN - 1);
        s->backing_swap_name[MAX_SWAP_NAME_LEN - 1] = '\0';

        s->disksize = rzs->disksize;
        s->memlimit = rzs->memlimit;

#if defined(CONFIG_RAMZSWAP_STATS)
        {
        struct ramzswap_stats *rs = &rzs->stats;
        size_t succ_writes, mem_used;
        unsigned int good_compress_perc = 0, no_compress_perc = 0;

        mem_used = xv_get_total_size_bytes(rzs->mem_pool)
                        + (rs->pages_expand << PAGE_SHIFT);
        succ_writes = rzs_stat64_read(rzs, &rs->num_writes) -
                        rzs_stat64_read(rzs, &rs->failed_writes);

        if (succ_writes && rs->pages_stored) {
                good_compress_perc = rs->good_compress * 100
                                        / rs->pages_stored;
                no_compress_perc = rs->pages_expand * 100
                                        / rs->pages_stored;
        }

        s->num_reads = rzs_stat64_read(rzs, &rs->num_reads);
        s->num_writes = rzs_stat64_read(rzs, &rs->num_writes);
        s->failed_reads = rzs_stat64_read(rzs, &rs->failed_reads);
        s->failed_writes = rzs_stat64_read(rzs, &rs->failed_writes);
        s->invalid_io = rzs_stat64_read(rzs, &rs->invalid_io);
        s->notify_free = rzs_stat64_read(rzs, &rs->notify_free);
        s->pages_zero = rs->pages_zero;

        s->good_compress_pct = good_compress_perc;
        s->pages_expand_pct = no_compress_perc;

        s->pages_stored = rs->pages_stored;
        s->pages_used = mem_used >> PAGE_SHIFT;
        s->orig_data_size = rs->pages_stored << PAGE_SHIFT;
        s->compr_data_size = rs->compr_size;
        s->mem_used_total = mem_used;

        s->bdev_num_reads = rzs_stat64_read(rzs, &rs->bdev_num_reads);
        s->bdev_num_writes = rzs_stat64_read(rzs, &rs->bdev_num_writes);
        }
#endif /* CONFIG_RAMZSWAP_STATS */
}

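/*
 * Record one run of 'num_pages' physically contiguous backing-swap
 * pages starting at 'phy_pagenum'. Extents are packed into pages
 * chained through page->lru off backing_swap_extent_list;
 * rzs->curr_extent points at the next free slot and a fresh page is
 * allocated once every 'extents_per_page' extents.
 */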
static int add_backing_swap_extent(struct ramzswap *rzs,
                                pgoff_t phy_pagenum,
                                pgoff_t num_pages)
{
        unsigned int idx;
        struct list_head *head;
        struct page *curr_page, *new_page;
        unsigned int extents_per_page = PAGE_SIZE /
                                sizeof(struct ramzswap_backing_extent);

        idx = rzs->num_extents % extents_per_page;
        if (!idx) {
                new_page = alloc_page(__GFP_ZERO);
                if (!new_page)
                        return -ENOMEM;

                if (rzs->num_extents) {
                        curr_page = virt_to_page(rzs->curr_extent);
                        head = &curr_page->lru;
                } else {
                        head = &rzs->backing_swap_extent_list;
                }

                list_add(&new_page->lru, head);
                rzs->curr_extent = page_address(new_page);
        }

        rzs->curr_extent->phy_pagenum = phy_pagenum;
        rzs->curr_extent->num_pages = num_pages;

        pr_debug("add_extent: idx=%u, phy_pgnum=%lu, num_pgs=%lu, "
                "pg_last=%lu, curr_ext=%p\n", idx, phy_pagenum, num_pages,
                phy_pagenum + num_pages - 1, rzs->curr_extent);

        if (idx != extents_per_page - 1)
                rzs->curr_extent++;

        return 0;
}

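/*
 * Walk the backing swap file with bmap() and record every PAGE_SIZE
 * aligned, physically contiguous run of blocks as an extent (the same
 * approach as setup_swap_extents() in sys_swapon). Any hole in the
 * file aborts the setup.
 */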
static int setup_backing_swap_extents(struct ramzswap *rzs,
                        struct inode *inode, unsigned long *num_pages)
{
        int ret = 0;
        unsigned blkbits;
        unsigned blocks_per_page;
        pgoff_t contig_pages = 0, total_pages = 0;
        pgoff_t pagenum = 0, prev_pagenum = 0;
        sector_t probe_block = 0;
        sector_t last_block;

        blkbits = inode->i_blkbits;
        blocks_per_page = PAGE_SIZE >> blkbits;

        last_block = i_size_read(inode) >> blkbits;
        while (probe_block + blocks_per_page <= last_block) {
                unsigned block_in_page;
                sector_t first_block;

                first_block = bmap(inode, probe_block);
                if (first_block == 0)
                        goto bad_bmap;

                /* It must be PAGE_SIZE aligned on-disk */
                if (first_block & (blocks_per_page - 1)) {
                        probe_block++;
                        goto probe_next;
                }

                /* All blocks within this page must be contiguous on disk */
                for (block_in_page = 1; block_in_page < blocks_per_page;
                                        block_in_page++) {
                        sector_t block;

                        block = bmap(inode, probe_block + block_in_page);
                        if (block == 0)
                                goto bad_bmap;
                        if (block != first_block + block_in_page) {
                                /* Discontiguity */
                                probe_block++;
                                goto probe_next;
                        }
                }

                /*
                 * We found a PAGE_SIZE length, PAGE_SIZE aligned
                 * run of blocks.
                 */
                pagenum = first_block >> (PAGE_SHIFT - blkbits);

                if (total_pages && (pagenum != prev_pagenum + 1)) {
                        ret = add_backing_swap_extent(rzs, prev_pagenum -
                                        (contig_pages - 1), contig_pages);
                        if (ret < 0)
                                goto out;
                        rzs->num_extents++;
                        contig_pages = 0;
                }
                total_pages++;
                contig_pages++;
                prev_pagenum = pagenum;
                probe_block += blocks_per_page;

probe_next:
                continue;
        }

        if (contig_pages) {
                pr_debug("adding last extent: pagenum=%lu, "
                        "contig_pages=%lu\n", pagenum, contig_pages);
                ret = add_backing_swap_extent(rzs,
                        prev_pagenum - (contig_pages - 1), contig_pages);
                if (ret < 0)
                        goto out;
                rzs->num_extents++;
        }
        if (!rzs->num_extents) {
                pr_err("No swap extents found!\n");
                ret = -EINVAL;
        }

        if (!ret) {
                *num_pages = total_pages;
                pr_info("Found %lu extents containing %luk\n",
                        rzs->num_extents, *num_pages << (PAGE_SHIFT - 10));
        }
        goto out;

bad_bmap:
        pr_err("Backing swapfile has holes\n");
        ret = -EINVAL;
out:
        while (ret && !list_empty(&rzs->backing_swap_extent_list)) {
                struct page *page;
                struct list_head *entry = rzs->backing_swap_extent_list.next;
                page = list_entry(entry, struct page, lru);
                list_del(entry);
                __free_page(page);
        }
        return ret;
}

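/*
 * For each vmalloc'ed page of rzs->table, cache in page->mapping the
 * extent covering the page's first table entry and in page->private
 * the offset of that entry within the extent. This lets
 * map_backing_swap_page() start its extent walk from a nearby cached
 * position instead of from the head of the extent list.
 */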
static void map_backing_swap_extents(struct ramzswap *rzs)
{
        struct ramzswap_backing_extent *se;
        struct page *table_page, *se_page;
        unsigned long num_pages, num_table_pages, entry;
        unsigned long se_idx, span;
        unsigned entries_per_page = PAGE_SIZE / sizeof(*rzs->table);
        unsigned extents_per_page = PAGE_SIZE / sizeof(*se);

        /* True for block device */
        if (!rzs->num_extents)
                return;

        se_page = list_entry(rzs->backing_swap_extent_list.next,
                        struct page, lru);
        se = page_address(se_page);
        span = se->num_pages;
        num_pages = rzs->disksize >> PAGE_SHIFT;
        num_table_pages = DIV_ROUND_UP(num_pages * sizeof(*rzs->table),
                                                        PAGE_SIZE);

        entry = 0;
        se_idx = 0;
        while (num_table_pages--) {
                table_page = vmalloc_to_page(&rzs->table[entry]);
                while (span <= entry) {
                        se_idx++;
                        if (se_idx == rzs->num_extents)
                                BUG();

                        if (!(se_idx % extents_per_page)) {
                                se_page = list_entry(se_page->lru.next,
                                                struct page, lru);
                                se = page_address(se_page);
                        } else
                                se++;

                        span += se->num_pages;
                }
                table_page->mapping = (struct address_space *)se;
                table_page->private = se->num_pages - (span - entry);
                pr_debug("map_table: entry=%lu, span=%lu, map=%p, priv=%lu\n",
                        entry, span, table_page->mapping, table_page->private);
                entry += entries_per_page;
        }
}

/*
 * Check if value of backing_swap module param is sane.
 * Claim this device and set ramzswap size equal to
 * size of this block device.
 */
static int setup_backing_swap(struct ramzswap *rzs)
{
        int ret = 0;
        size_t disksize;
        unsigned long num_pages = 0;
        struct inode *inode;
        struct file *swap_file;
        struct address_space *mapping;
        struct block_device *bdev = NULL;

        if (!rzs->backing_swap_name[0]) {
                pr_debug("backing_swap param not given\n");
                goto out;
        }

        pr_info("Using backing swap device: %s\n", rzs->backing_swap_name);

        swap_file = filp_open(rzs->backing_swap_name,
                                O_RDWR | O_LARGEFILE, 0);
        if (IS_ERR(swap_file)) {
                pr_err("Error opening backing device: %s\n",
                        rzs->backing_swap_name);
                ret = -EINVAL;
                goto out;
        }

        mapping = swap_file->f_mapping;
        inode = mapping->host;

        if (S_ISBLK(inode->i_mode)) {
                bdev = I_BDEV(inode);
                ret = bd_claim(bdev, setup_backing_swap);
                if (ret < 0) {
                        bdev = NULL;
                        goto bad_param;
                }
                disksize = i_size_read(inode);
        } else if (S_ISREG(inode->i_mode)) {
                bdev = inode->i_sb->s_bdev;
                if (IS_SWAPFILE(inode)) {
                        ret = -EBUSY;
                        goto bad_param;
                }
                ret = setup_backing_swap_extents(rzs, inode, &num_pages);
                if (ret < 0)
                        goto bad_param;
                disksize = num_pages << PAGE_SHIFT;
        } else {
                goto bad_param;
        }

        rzs->swap_file = swap_file;
        rzs->backing_swap = bdev;
        rzs->disksize = disksize;
        BUG_ON(!rzs->disksize);

        return 0;

bad_param:
        if (bdev)
                bd_release(bdev);
        filp_close(swap_file, NULL);

out:
        rzs->backing_swap = NULL;
        return ret;
}

/*
 * Map logical page number 'pagenum' to physical page number
 * on backing swap device. For block device, this is a nop.
 */
u32 map_backing_swap_page(struct ramzswap *rzs, u32 pagenum)
{
        u32 skip_pages, entries_per_page;
        size_t delta, se_offset, skipped;
        struct page *table_page, *se_page;
        struct ramzswap_backing_extent *se;

        if (!rzs->num_extents)
                return pagenum;

        entries_per_page = PAGE_SIZE / sizeof(*rzs->table);

        table_page = vmalloc_to_page(&rzs->table[pagenum]);
        se = (struct ramzswap_backing_extent *)table_page->mapping;
        se_page = virt_to_page(se);

        skip_pages = pagenum - (pagenum / entries_per_page * entries_per_page);
        se_offset = table_page->private + skip_pages;

        if (se_offset < se->num_pages)
                return se->phy_pagenum + se_offset;

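        /*
         * The wanted page lies beyond the cached extent: walk the
         * extent list forward, accumulating page counts, until the
         * extent containing 'pagenum' is reached.
         */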
        skipped = se->num_pages - table_page->private;
        do {
                struct ramzswap_backing_extent *se_base;
                u32 se_entries_per_page = PAGE_SIZE / sizeof(*se);

                /* Get next swap extent */
                se_base = (struct ramzswap_backing_extent *)
                                page_address(se_page);
                if (se - se_base == se_entries_per_page - 1) {
                        se_page = list_entry(se_page->lru.next,
                                        struct page, lru);
                        se = page_address(se_page);
                } else {
                        se++;
                }

                skipped += se->num_pages;
        } while (skipped < skip_pages);

        delta = skipped - skip_pages;
        se_offset = se->num_pages - delta;

        return se->phy_pagenum + se_offset;
}

static void ramzswap_free_page(struct ramzswap *rzs, size_t index)
{
        u32 clen;
        void *obj;

        struct page *page = rzs->table[index].page;
        u32 offset = rzs->table[index].offset;

        if (unlikely(!page)) {
                if (rzs_test_flag(rzs, index, RZS_ZERO)) {
                        rzs_clear_flag(rzs, index, RZS_ZERO);
                        rzs_stat_dec(&rzs->stats.pages_zero);
                }
                return;
        }

        if (unlikely(rzs_test_flag(rzs, index, RZS_UNCOMPRESSED))) {
                clen = PAGE_SIZE;
                __free_page(page);
                rzs_clear_flag(rzs, index, RZS_UNCOMPRESSED);
                rzs_stat_dec(&rzs->stats.pages_expand);
                goto out;
        }

        obj = kmap_atomic(page, KM_USER0) + offset;
        clen = xv_get_object_size(obj) - sizeof(struct zobj_header);
        kunmap_atomic(obj, KM_USER0);

        xv_free(rzs->mem_pool, page, offset);
        if (clen <= PAGE_SIZE / 2)
                rzs_stat_dec(&rzs->stats.good_compress);

out:
        rzs->stats.compr_size -= clen;
        rzs_stat_dec(&rzs->stats.pages_stored);

        rzs->table[index].page = NULL;
        rzs->table[index].offset = 0;
}

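/* Satisfy a read of a zero-filled page by clearing the bio's page */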
static int handle_zero_page(struct bio *bio)
{
        void *user_mem;
        struct page *page = bio->bi_io_vec[0].bv_page;

        user_mem = kmap_atomic(page, KM_USER0);
        memset(user_mem, 0, PAGE_SIZE);
        kunmap_atomic(user_mem, KM_USER0);

        flush_dcache_page(page);

        set_bit(BIO_UPTODATE, &bio->bi_flags);
        bio_endio(bio, 0);
        return 0;
}

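/* Copy a page stored uncompressed (RZS_UNCOMPRESSED) straight into the bio page */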
static int handle_uncompressed_page(struct ramzswap *rzs, struct bio *bio)
{
        u32 index;
        struct page *page;
        unsigned char *user_mem, *cmem;

        page = bio->bi_io_vec[0].bv_page;
        index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;

        user_mem = kmap_atomic(page, KM_USER0);
        cmem = kmap_atomic(rzs->table[index].page, KM_USER1) +
                rzs->table[index].offset;

        memcpy(user_mem, cmem, PAGE_SIZE);
        kunmap_atomic(user_mem, KM_USER0);
        kunmap_atomic(cmem, KM_USER1);

        flush_dcache_page(page);

        set_bit(BIO_UPTODATE, &bio->bi_flags);
        bio_endio(bio, 0);
        return 0;
}

/*
 * Called when the requested page is not present in ramzswap.
 * It is either in the backing swap device (if present) or this is
 * an attempt to read before any previous write to this location -
 * this happens due to readahead when the swap device is read from
 * user-space (e.g. during swapon).
 */
static int handle_ramzswap_fault(struct ramzswap *rzs, struct bio *bio)
{
        /*
         * Always forward such requests to backing swap
         * device (if present)
         */
        if (rzs->backing_swap) {
                u32 pagenum;
                rzs_stat64_dec(rzs, &rzs->stats.num_reads);
                rzs_stat64_inc(rzs, &rzs->stats.bdev_num_reads);
                bio->bi_bdev = rzs->backing_swap;

                /*
                 * In case backing swap is a file, find the right offset
                 * within the file corresponding to logical position
                 * 'index'. For block device, this is a nop.
                 */
                pagenum = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
                bio->bi_sector = map_backing_swap_page(rzs, pagenum)
                                        << SECTORS_PER_PAGE_SHIFT;
                return 1;
        }

        /*
         * This is an unlikely event in case the backing dev
         * is not present.
         */
        pr_debug("Read before write on swap device: "
                "sector=%lu, size=%u, offset=%u\n",
                (ulong)(bio->bi_sector), bio->bi_size,
                bio->bi_io_vec[0].bv_offset);

        /* Do nothing. Just return success */
        set_bit(BIO_UPTODATE, &bio->bi_flags);
        bio_endio(bio, 0);
        return 0;
}

static int ramzswap_read(struct ramzswap *rzs, struct bio *bio)
{
        int ret;
        u32 index;
        size_t clen;
        struct page *page;
        struct zobj_header *zheader;
        unsigned char *user_mem, *cmem;

        rzs_stat64_inc(rzs, &rzs->stats.num_reads);

        page = bio->bi_io_vec[0].bv_page;
        index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;

        if (rzs_test_flag(rzs, index, RZS_ZERO))
                return handle_zero_page(bio);

        /* Requested page is not present in compressed area */
        if (!rzs->table[index].page)
                return handle_ramzswap_fault(rzs, bio);

        /* Page is stored uncompressed since it's incompressible */
        if (unlikely(rzs_test_flag(rzs, index, RZS_UNCOMPRESSED)))
                return handle_uncompressed_page(rzs, bio);

        user_mem = kmap_atomic(page, KM_USER0);
        clen = PAGE_SIZE;

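        /*
         * 'clen' is an in/out parameter for LZO: it is passed in as the
         * destination buffer size and returns the decompressed length
         * (PAGE_SIZE on success).
         */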
        cmem = kmap_atomic(rzs->table[index].page, KM_USER1) +
                rzs->table[index].offset;

        ret = lzo1x_decompress_safe(
                cmem + sizeof(*zheader),
                xv_get_object_size(cmem) - sizeof(*zheader),
                user_mem, &clen);

        kunmap_atomic(user_mem, KM_USER0);
        kunmap_atomic(cmem, KM_USER1);

        /* should NEVER happen */
        if (unlikely(ret != LZO_E_OK)) {
                pr_err("Decompression failed! err=%d, page=%u\n",
                        ret, index);
                rzs_stat64_inc(rzs, &rzs->stats.failed_reads);
                goto out;
        }

        flush_dcache_page(page);

        set_bit(BIO_UPTODATE, &bio->bi_flags);
        bio_endio(bio, 0);
        return 0;

out:
        bio_io_error(bio);
        return 0;
}

static int ramzswap_write(struct ramzswap *rzs, struct bio *bio)
{
        int ret, fwd_write_request = 0;
        u32 offset, index;
        size_t clen;
        struct zobj_header *zheader;
        struct page *page, *page_store;
        unsigned char *user_mem, *cmem, *src;

        rzs_stat64_inc(rzs, &rzs->stats.num_writes);

        page = bio->bi_io_vec[0].bv_page;
        index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;

        src = rzs->compress_buffer;

        /*
         * System swaps to the same sector again when the stored page
         * is no longer referenced by any process. So, it is now safe
         * to free the memory that was allocated for this page.
         */
        if (rzs->table[index].page)
                ramzswap_free_page(rzs, index);

        /*
         * No memory is allocated for zero filled pages.
         * Simply clear zero page flag.
         */
        if (rzs_test_flag(rzs, index, RZS_ZERO)) {
                rzs_stat_dec(&rzs->stats.pages_zero);
                rzs_clear_flag(rzs, index, RZS_ZERO);
        }

        mutex_lock(&rzs->lock);

        user_mem = kmap_atomic(page, KM_USER0);
        if (page_zero_filled(user_mem)) {
                kunmap_atomic(user_mem, KM_USER0);
                mutex_unlock(&rzs->lock);
                rzs_stat_inc(&rzs->stats.pages_zero);
                rzs_set_flag(rzs, index, RZS_ZERO);

                set_bit(BIO_UPTODATE, &bio->bi_flags);
                bio_endio(bio, 0);
                return 0;
        }

        if (rzs->backing_swap &&
                (rzs->stats.compr_size > rzs->memlimit - PAGE_SIZE)) {
                kunmap_atomic(user_mem, KM_USER0);
                mutex_unlock(&rzs->lock);
                fwd_write_request = 1;
                goto out;
        }

        ret = lzo1x_1_compress(user_mem, PAGE_SIZE, src, &clen,
                                rzs->compress_workmem);

        kunmap_atomic(user_mem, KM_USER0);

        if (unlikely(ret != LZO_E_OK)) {
                mutex_unlock(&rzs->lock);
                pr_err("Compression failed! err=%d\n", ret);
                rzs_stat64_inc(rzs, &rzs->stats.failed_writes);
                goto out;
        }

        /*
         * Page is incompressible. Forward it to backing swap
         * if present. Otherwise, store it as-is (uncompressed)
         * since we do not want to return too many swap write
         * errors, which have the side effect of hanging the system.
         */
        if (unlikely(clen > max_zpage_size)) {
                if (rzs->backing_swap) {
                        mutex_unlock(&rzs->lock);
                        fwd_write_request = 1;
                        goto out;
                }

                clen = PAGE_SIZE;
                page_store = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
                if (unlikely(!page_store)) {
                        mutex_unlock(&rzs->lock);
                        pr_info("Error allocating memory for incompressible "
                                "page: %u\n", index);
                        rzs_stat64_inc(rzs, &rzs->stats.failed_writes);
                        goto out;
                }

                offset = 0;
                rzs_set_flag(rzs, index, RZS_UNCOMPRESSED);
                rzs_stat_inc(&rzs->stats.pages_expand);
                rzs->table[index].page = page_store;
                src = kmap_atomic(page, KM_USER0);
                goto memstore;
        }

        if (xv_malloc(rzs->mem_pool, clen + sizeof(*zheader),
                        &rzs->table[index].page, &offset,
                        GFP_NOIO | __GFP_HIGHMEM)) {
                mutex_unlock(&rzs->lock);
                pr_info("Error allocating memory for compressed "
                        "page: %u, size=%zu\n", index, clen);
                rzs_stat64_inc(rzs, &rzs->stats.failed_writes);
                if (rzs->backing_swap)
                        fwd_write_request = 1;
                goto out;
        }

memstore:
        rzs->table[index].offset = offset;

        cmem = kmap_atomic(rzs->table[index].page, KM_USER1) +
                rzs->table[index].offset;

#if 0
        /* Back-reference needed for memory defragmentation */
        if (!rzs_test_flag(rzs, index, RZS_UNCOMPRESSED)) {
                zheader = (struct zobj_header *)cmem;
                zheader->table_idx = index;
                cmem += sizeof(*zheader);
        }
#endif

        memcpy(cmem, src, clen);

        kunmap_atomic(cmem, KM_USER1);
        if (unlikely(rzs_test_flag(rzs, index, RZS_UNCOMPRESSED)))
                kunmap_atomic(src, KM_USER0);

        /* Update stats */
        rzs->stats.compr_size += clen;
        rzs_stat_inc(&rzs->stats.pages_stored);
        if (clen <= PAGE_SIZE / 2)
                rzs_stat_inc(&rzs->stats.good_compress);

        mutex_unlock(&rzs->lock);

        set_bit(BIO_UPTODATE, &bio->bi_flags);
        bio_endio(bio, 0);
        return 0;

out:
        if (fwd_write_request) {
                rzs_stat64_inc(rzs, &rzs->stats.bdev_num_writes);
                bio->bi_bdev = rzs->backing_swap;
#if 0
                /*
                 * TODO: We currently have a linear mapping of ramzswap and
                 * backing swap sectors. This is not desired since we want
                 * to optimize writes to backing swap to minimize disk seeks
                 * or have effective wear leveling (for SSDs). Also, a
                 * non-linear mapping is required to implement compressed
                 * on-disk swapping.
                 */
                bio->bi_sector = get_backing_swap_page()
                                        << SECTORS_PER_PAGE_SHIFT;
#endif
                /*
                 * In case backing swap is a file, find the right offset
                 * within the file corresponding to logical position
                 * 'index'. For block device, this is a nop.
                 */
                bio->bi_sector = map_backing_swap_page(rzs, index)
                                        << SECTORS_PER_PAGE_SHIFT;
                return 1;
        }

        bio_io_error(bio);
        return 0;
}

/*
 * Check if request is within bounds and page aligned.
 */
static inline int valid_swap_request(struct ramzswap *rzs, struct bio *bio)
{
        if (unlikely(
                (bio->bi_sector >= (rzs->disksize >> SECTOR_SHIFT)) ||
                (bio->bi_sector & (SECTORS_PER_PAGE - 1)) ||
                (bio->bi_vcnt != 1) ||
                (bio->bi_size != PAGE_SIZE) ||
                (bio->bi_io_vec[0].bv_offset != 0))) {

                return 0;
        }

        /* swap request is valid */
        return 1;
}

/*
 * Handler function for all ramzswap I/O requests.
 */
static int ramzswap_make_request(struct request_queue *queue, struct bio *bio)
{
        int ret = 0;
        struct ramzswap *rzs = queue->queuedata;

        if (unlikely(!rzs->init_done)) {
                bio_io_error(bio);
                return 0;
        }

        if (!valid_swap_request(rzs, bio)) {
                rzs_stat64_inc(rzs, &rzs->stats.invalid_io);
                bio_io_error(bio);
                return 0;
        }

        switch (bio_data_dir(bio)) {
        case READ:
                ret = ramzswap_read(rzs, bio);
                break;

        case WRITE:
                ret = ramzswap_write(rzs, bio);
                break;
        }

        return ret;
}

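/*
 * Free all per-device resources and return the device to its
 * uninitialized state. Clearing init_done first makes
 * ramzswap_make_request() reject any new I/O; the caller is expected
 * to have flushed pending I/O already.
 */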
static void reset_device(struct ramzswap *rzs)
{
        int is_backing_blkdev = 0;
        size_t index, num_pages;
        unsigned entries_per_page;
        unsigned long num_table_pages, entry = 0;

        /* Do not accept any new I/O request */
        rzs->init_done = 0;

        if (rzs->backing_swap && !rzs->num_extents)
                is_backing_blkdev = 1;

        num_pages = rzs->disksize >> PAGE_SHIFT;

        /* Free various per-device buffers */
        kfree(rzs->compress_workmem);
        free_pages((unsigned long)rzs->compress_buffer, 1);

        rzs->compress_workmem = NULL;
        rzs->compress_buffer = NULL;

        /* Free all pages that are still in this ramzswap device */
        for (index = 0; index < num_pages; index++) {
                struct page *page;
                u16 offset;

                page = rzs->table[index].page;
                offset = rzs->table[index].offset;

                if (!page)
                        continue;

                if (unlikely(rzs_test_flag(rzs, index, RZS_UNCOMPRESSED)))
                        __free_page(page);
                else
                        xv_free(rzs->mem_pool, page, offset);
        }

        entries_per_page = PAGE_SIZE / sizeof(*rzs->table);
        num_table_pages = DIV_ROUND_UP(num_pages * sizeof(*rzs->table),
                                        PAGE_SIZE);
        /*
         * Set page->mapping to NULL for every table page.
         * Otherwise, we will hit bad_page() during free.
         */
        while (rzs->num_extents && num_table_pages--) {
                struct page *page;
                page = vmalloc_to_page(&rzs->table[entry]);
                page->mapping = NULL;
                entry += entries_per_page;
        }
        vfree(rzs->table);
        rzs->table = NULL;

        xv_destroy_pool(rzs->mem_pool);
        rzs->mem_pool = NULL;

        /* Free all swap extent pages */
        while (!list_empty(&rzs->backing_swap_extent_list)) {
                struct page *page;
                struct list_head *entry;
                entry = rzs->backing_swap_extent_list.next;
                page = list_entry(entry, struct page, lru);
                list_del(entry);
                __free_page(page);
        }
        INIT_LIST_HEAD(&rzs->backing_swap_extent_list);
        rzs->num_extents = 0;

        /* Close backing swap device, if present */
        if (rzs->backing_swap) {
                if (is_backing_blkdev)
                        bd_release(rzs->backing_swap);
                filp_close(rzs->swap_file, NULL);
                rzs->backing_swap = NULL;
        }

        /* Reset stats */
        memset(&rzs->stats, 0, sizeof(rzs->stats));

        rzs->disksize = 0;
        rzs->memlimit = 0;
}

static int ramzswap_ioctl_init_device(struct ramzswap *rzs)
{
        int ret;
        size_t num_pages;
        struct page *page;
        union swap_header *swap_header;

        if (rzs->init_done) {
                pr_info("Device already initialized!\n");
                return -EBUSY;
        }

        ret = setup_backing_swap(rzs);
        if (ret)
                goto fail;

        if (rzs->backing_swap)
                ramzswap_set_memlimit(rzs, totalram_pages << PAGE_SHIFT);
        else
                ramzswap_set_disksize(rzs, totalram_pages << PAGE_SHIFT);

        rzs->compress_workmem = kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
        if (!rzs->compress_workmem) {
                pr_err("Error allocating compressor working memory!\n");
                ret = -ENOMEM;
                goto fail;
        }

        rzs->compress_buffer = (void *)__get_free_pages(__GFP_ZERO, 1);
        if (!rzs->compress_buffer) {
                pr_err("Error allocating compressor buffer space\n");
                ret = -ENOMEM;
                goto fail;
        }

        num_pages = rzs->disksize >> PAGE_SHIFT;
        rzs->table = vmalloc(num_pages * sizeof(*rzs->table));
        if (!rzs->table) {
                pr_err("Error allocating ramzswap address table\n");
                /* To prevent accessing table entries during cleanup */
                rzs->disksize = 0;
                ret = -ENOMEM;
                goto fail;
        }
        memset(rzs->table, 0, num_pages * sizeof(*rzs->table));

        map_backing_swap_extents(rzs);

        page = alloc_page(__GFP_ZERO);
        if (!page) {
                pr_err("Error allocating swap header page\n");
                ret = -ENOMEM;
                goto fail;
        }
        rzs->table[0].page = page;
        rzs_set_flag(rzs, 0, RZS_UNCOMPRESSED);

        swap_header = kmap(page);
        ret = setup_swap_header(rzs, swap_header);
        kunmap(page);
        if (ret) {
                pr_err("Error setting swap header\n");
                goto fail;
        }

        set_capacity(rzs->disk, rzs->disksize >> SECTOR_SHIFT);

        /*
         * We have an identity mapping of sectors for ramzswap and
         * the backing swap device. So, this queue flag should be
         * set according to the backing dev.
         */
        if (!rzs->backing_swap ||
                        blk_queue_nonrot(rzs->backing_swap->bd_disk->queue))
                queue_flag_set_unlocked(QUEUE_FLAG_NONROT, rzs->disk->queue);

        rzs->mem_pool = xv_create_pool();
        if (!rzs->mem_pool) {
                pr_err("Error creating memory pool\n");
                ret = -ENOMEM;
                goto fail;
        }

        /*
         * Pages that compress to a size greater than this are forwarded
         * to the physical swap disk (if a backing dev is provided).
         * TODO: make this configurable
         */
        if (rzs->backing_swap)
                max_zpage_size = max_zpage_size_bdev;
        else
                max_zpage_size = max_zpage_size_nobdev;
        pr_debug("Max compressed page size: %u bytes\n", max_zpage_size);

        rzs->init_done = 1;

        pr_debug("Initialization done!\n");
        return 0;

fail:
        reset_device(rzs);

        pr_err("Initialization failed: err=%d\n", ret);
        return ret;
}

static int ramzswap_ioctl_reset_device(struct ramzswap *rzs)
{
        if (rzs->init_done)
                reset_device(rzs);

        return 0;
}

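/*
 * ioctl interface used by the userspace control utility (rzscontrol)
 * to configure a device (disk size, memory limit, backing swap)
 * before init, and to query stats, initialize, or reset it.
 */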
static int ramzswap_ioctl(struct block_device *bdev, fmode_t mode,
                        unsigned int cmd, unsigned long arg)
{
        int ret = 0;
        size_t disksize_kb, memlimit_kb;

        struct ramzswap *rzs = bdev->bd_disk->private_data;

        switch (cmd) {
        case RZSIO_SET_DISKSIZE_KB:
                if (rzs->init_done) {
                        ret = -EBUSY;
                        goto out;
                }
                if (copy_from_user(&disksize_kb, (void *)arg,
                                                _IOC_SIZE(cmd))) {
                        ret = -EFAULT;
                        goto out;
                }
                rzs->disksize = disksize_kb << 10;
                pr_info("Disk size set to %zu kB\n", disksize_kb);
                break;

        case RZSIO_SET_MEMLIMIT_KB:
                if (rzs->init_done) {
                        /* TODO: allow changing memlimit */
                        ret = -EBUSY;
                        goto out;
                }
                if (copy_from_user(&memlimit_kb, (void *)arg,
                                                _IOC_SIZE(cmd))) {
                        ret = -EFAULT;
                        goto out;
                }
                rzs->memlimit = memlimit_kb << 10;
                pr_info("Memory limit set to %zu kB\n", memlimit_kb);
                break;

        case RZSIO_SET_BACKING_SWAP:
                if (rzs->init_done) {
                        ret = -EBUSY;
                        goto out;
                }

                if (copy_from_user(&rzs->backing_swap_name, (void *)arg,
                                                _IOC_SIZE(cmd))) {
                        ret = -EFAULT;
                        goto out;
                }
                rzs->backing_swap_name[MAX_SWAP_NAME_LEN - 1] = '\0';
                pr_info("Backing swap set to %s\n", rzs->backing_swap_name);
                break;

        case RZSIO_GET_STATS:
        {
                struct ramzswap_ioctl_stats *stats;
                if (!rzs->init_done) {
                        ret = -ENOTTY;
                        goto out;
                }
                stats = kzalloc(sizeof(*stats), GFP_KERNEL);
                if (!stats) {
                        ret = -ENOMEM;
                        goto out;
                }
                ramzswap_ioctl_get_stats(rzs, stats);
                if (copy_to_user((void *)arg, stats, sizeof(*stats))) {
                        kfree(stats);
                        ret = -EFAULT;
                        goto out;
                }
                kfree(stats);
                break;
        }
        case RZSIO_INIT:
                ret = ramzswap_ioctl_init_device(rzs);
                break;

        case RZSIO_RESET:
                /* Do not reset an active device! */
                if (bdev->bd_holders) {
                        ret = -EBUSY;
                        goto out;
                }

                /* Make sure all pending I/O is finished */
                if (bdev)
                        fsync_bdev(bdev);

                ret = ramzswap_ioctl_reset_device(rzs);
                break;

        default:
                pr_info("Invalid ioctl %u\n", cmd);
                ret = -ENOTTY;
        }

out:
        return ret;
}

static struct block_device_operations ramzswap_devops = {
        .ioctl = ramzswap_ioctl,
        .owner = THIS_MODULE,
};

static int create_device(struct ramzswap *rzs, int device_id)
{
        mutex_init(&rzs->lock);
        spin_lock_init(&rzs->stat64_lock);
        INIT_LIST_HEAD(&rzs->backing_swap_extent_list);

        rzs->queue = blk_alloc_queue(GFP_KERNEL);
        if (!rzs->queue) {
                pr_err("Error allocating disk queue for device %d\n",
                        device_id);
                return 0;
        }

        blk_queue_make_request(rzs->queue, ramzswap_make_request);
        rzs->queue->queuedata = rzs;

        /* gendisk structure */
        rzs->disk = alloc_disk(1);
        if (!rzs->disk) {
                blk_cleanup_queue(rzs->queue);
                pr_warning("Error allocating disk structure for device %d\n",
                        device_id);
                return 0;
        }

        rzs->disk->major = ramzswap_major;
        rzs->disk->first_minor = device_id;
        rzs->disk->fops = &ramzswap_devops;
        rzs->disk->queue = rzs->queue;
        rzs->disk->private_data = rzs;
        snprintf(rzs->disk->disk_name, 16, "ramzswap%d", device_id);

        /*
         * Actual capacity set using RZSIO_SET_DISKSIZE_KB ioctl
         * or set equal to backing swap device (if provided)
         */
        set_capacity(rzs->disk, 0);
        add_disk(rzs->disk);

        rzs->init_done = 0;
        return 1;
}

static void destroy_device(struct ramzswap *rzs)
{
        if (rzs->disk) {
                del_gendisk(rzs->disk);
                put_disk(rzs->disk);
        }

        if (rzs->queue)
                blk_cleanup_queue(rzs->queue);
}

static int __init ramzswap_init(void)
{
        int i, ret;

        if (num_devices > max_num_devices) {
                pr_warning("Invalid value for num_devices: %u\n",
                        num_devices);
                return -EINVAL;
        }

        ramzswap_major = register_blkdev(0, "ramzswap");
        if (ramzswap_major <= 0) {
                pr_warning("Unable to get major number\n");
                return -EBUSY;
        }

        if (!num_devices) {
                pr_info("num_devices not specified. Using default: 1\n");
                num_devices = 1;
        }

        /* Allocate the device array and initialize each one */
        pr_info("Creating %u devices ...\n", num_devices);
        devices = kzalloc(num_devices * sizeof(struct ramzswap), GFP_KERNEL);
        if (!devices)
                goto out;

        for (i = 0; i < num_devices; i++)
                if (!create_device(&devices[i], i)) {
                        ret = i;
                        goto free_devices;
                }

        return 0;

free_devices:
        for (i = 0; i < ret; i++)
                destroy_device(&devices[i]);
out:
        ret = -ENOMEM;
        kfree(devices);	/* kfree(NULL) is a no-op; avoids leaking the array on partial init */
        unregister_blkdev(ramzswap_major, "ramzswap");
        return ret;
}

static void __exit ramzswap_exit(void)
{
        int i;
        struct ramzswap *rzs;

        for (i = 0; i < num_devices; i++) {
                rzs = &devices[i];

                destroy_device(rzs);
                if (rzs->init_done)
                        reset_device(rzs);
        }

        unregister_blkdev(ramzswap_major, "ramzswap");

        kfree(devices);
        pr_debug("Cleanup done!\n");
}

module_param(num_devices, uint, 0);
MODULE_PARM_DESC(num_devices, "Number of ramzswap devices");

module_init(ramzswap_init);
module_exit(ramzswap_exit);

MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
MODULE_DESCRIPTION("Compressed RAM Based Swap Device");