]>
Commit | Line | Data |
---|---|---|
306b0c95 NG |
1 | /* |
2 | * Compressed RAM based swap device | |
3 | * | |
1130ebba | 4 | * Copyright (C) 2008, 2009, 2010 Nitin Gupta |
306b0c95 NG |
5 | * |
6 | * This code is released using a dual license strategy: BSD/GPL | |
7 | * You can choose the licence that better fits your requirements. | |
8 | * | |
9 | * Released under the terms of 3-clause BSD License | |
10 | * Released under the terms of GNU General Public License Version 2.0 | |
11 | * | |
12 | * Project home: http://compcache.googlecode.com | |
13 | */ | |
14 | ||
15 | #define KMSG_COMPONENT "ramzswap" | |
16 | #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt | |
17 | ||
18 | #include <linux/module.h> | |
19 | #include <linux/kernel.h> | |
20 | #include <linux/bitops.h> | |
21 | #include <linux/blkdev.h> | |
22 | #include <linux/buffer_head.h> | |
23 | #include <linux/device.h> | |
24 | #include <linux/genhd.h> | |
25 | #include <linux/highmem.h> | |
5a0e3ad6 | 26 | #include <linux/slab.h> |
306b0c95 | 27 | #include <linux/lzo.h> |
306b0c95 NG |
28 | #include <linux/string.h> |
29 | #include <linux/swap.h> | |
30 | #include <linux/swapops.h> | |
31 | #include <linux/vmalloc.h> | |
306b0c95 NG |
32 | |
33 | #include "ramzswap_drv.h" | |
34 | ||
35 | /* Globals */ | |
36 | static int ramzswap_major; | |
37 | static struct ramzswap *devices; | |
38 | ||
39 | /* | |
40 | * Pages that compress to larger than this size are | |
41 | * forwarded to backing swap, if present or stored | |
42 | * uncompressed in memory otherwise. | |
43 | */ | |
44 | static unsigned int max_zpage_size; | |
45 | ||
46 | /* Module params (documentation at end) */ | |
47 | static unsigned int num_devices; | |
48 | ||
49 | static int rzs_test_flag(struct ramzswap *rzs, u32 index, | |
50 | enum rzs_pageflags flag) | |
51 | { | |
52 | return rzs->table[index].flags & BIT(flag); | |
53 | } | |
54 | ||
55 | static void rzs_set_flag(struct ramzswap *rzs, u32 index, | |
56 | enum rzs_pageflags flag) | |
57 | { | |
58 | rzs->table[index].flags |= BIT(flag); | |
59 | } | |
60 | ||
61 | static void rzs_clear_flag(struct ramzswap *rzs, u32 index, | |
62 | enum rzs_pageflags flag) | |
63 | { | |
64 | rzs->table[index].flags &= ~BIT(flag); | |
65 | } | |
66 | ||
67 | static int page_zero_filled(void *ptr) | |
68 | { | |
69 | unsigned int pos; | |
70 | unsigned long *page; | |
71 | ||
72 | page = (unsigned long *)ptr; | |
73 | ||
74 | for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) { | |
75 | if (page[pos]) | |
76 | return 0; | |
77 | } | |
78 | ||
79 | return 1; | |
80 | } | |
81 | ||
/*
 * Sanitize rzs->memlimit (bytes of *compressed* data we may hold in RAM)
 * against the backing disk size and total system RAM, falling back to a
 * default when the user-supplied value is missing or too large.
 *
 * memlimit cannot be greater than backing disk size.
 */
static void ramzswap_set_memlimit(struct ramzswap *rzs, size_t totalram_bytes)
{
	int memlimit_valid = 1;

	/* No limit given by the user: use the default below. */
	if (!rzs->memlimit) {
		pr_info("Memory limit not set.\n");
		memlimit_valid = 0;
	}

	/* A limit above the backing disk size is meaningless: reject it. */
	if (rzs->memlimit > rzs->disksize) {
		pr_info("Memory limit cannot be greater than "
			"disksize: limit=%zu, disksize=%zu\n",
			rzs->memlimit, rzs->disksize);
		memlimit_valid = 0;
	}

	/* Default: min(default_memlimit_perc_ram% of RAM, disk size). */
	if (!memlimit_valid) {
		size_t mempart, disksize;
		pr_info("Using default: smaller of (%u%% of RAM) and "
			"(backing disk size).\n",
			default_memlimit_perc_ram);
		mempart = default_memlimit_perc_ram * (totalram_bytes / 100);
		disksize = rzs->disksize;
		rzs->memlimit = mempart > disksize ? disksize : mempart;
	}

	/*
	 * Warn (but allow) limits above half of RAM: with the expected 2:1
	 * compression ratio, a larger limit rarely makes sense.
	 */
	if (rzs->memlimit > totalram_bytes / 2) {
		pr_info(
		"Its not advisable setting limit more than half of "
		"size of memory since we expect a 2:1 compression ratio. "
		"Limit represents amount of *compressed* data we can keep "
		"in memory!\n"
		"\tMemory Size: %zu kB\n"
		"\tLimit you selected: %zu kB\n"
		"Continuing anyway ...\n",
		totalram_bytes >> 10, rzs->memlimit >> 10
		);
	}

	/* Round down to a whole number of pages; must remain non-zero. */
	rzs->memlimit &= PAGE_MASK;
	BUG_ON(!rzs->memlimit);
}
127 | ||
128 | static void ramzswap_set_disksize(struct ramzswap *rzs, size_t totalram_bytes) | |
129 | { | |
130 | if (!rzs->disksize) { | |
131 | pr_info( | |
132 | "disk size not provided. You can use disksize_kb module " | |
133 | "param to specify size.\nUsing default: (%u%% of RAM).\n", | |
134 | default_disksize_perc_ram | |
135 | ); | |
136 | rzs->disksize = default_disksize_perc_ram * | |
137 | (totalram_bytes / 100); | |
138 | } | |
139 | ||
140 | if (rzs->disksize > 2 * (totalram_bytes)) { | |
141 | pr_info( | |
142 | "There is little point creating a ramzswap of greater than " | |
143 | "twice the size of memory since we expect a 2:1 compression " | |
144 | "ratio. Note that ramzswap uses about 0.1%% of the size of " | |
145 | "the swap device when not in use so a huge ramzswap is " | |
146 | "wasteful.\n" | |
147 | "\tMemory Size: %zu kB\n" | |
148 | "\tSize you selected: %zu kB\n" | |
149 | "Continuing anyway ...\n", | |
150 | totalram_bytes >> 10, rzs->disksize | |
151 | ); | |
152 | } | |
153 | ||
154 | rzs->disksize &= PAGE_MASK; | |
155 | } | |
156 | ||
/*
 * Swap header (1st page of swap device) contains information
 * to indentify it as a swap partition. Prepare such a header
 * for ramzswap device (ramzswap0) so that swapon can identify
 * it as swap partition. In case backing swap device is provided,
 * copy its swap header.
 *
 * Returns 0 on success, negative errno on failure.
 */
static int setup_swap_header(struct ramzswap *rzs, union swap_header *s)
{
	int ret = 0;
	struct page *page;
	struct address_space *mapping;
	union swap_header *backing_swap_header;

	/*
	 * There is no backing swap device. Create a swap header
	 * that is acceptable by swapon.
	 */
	if (!rzs->backing_swap) {
		s->info.version = 1;
		/* 'last_page' is the 0-based index of the last usable page */
		s->info.last_page = (rzs->disksize >> PAGE_SHIFT) - 1;
		s->info.nr_badpages = 0;
		memcpy(s->magic.magic, "SWAPSPACE2", 10);
		return 0;
	}

	/*
	 * We have a backing swap device. Copy its swap header
	 * to ramzswap device header. If this header contains
	 * invalid information (backing device not a swap
	 * partition, etc.), swapon will fail for ramzswap
	 * which is correct behavior - we don't want to swap
	 * over filesystem partition!
	 */

	/* Read the backing swap header (code from sys_swapon) */
	mapping = rzs->swap_file->f_mapping;
	if (!mapping->a_ops->readpage) {
		ret = -EINVAL;
		goto out;
	}

	page = read_mapping_page(mapping, 0, rzs->swap_file);
	if (IS_ERR(page)) {
		ret = PTR_ERR(page);
		goto out;
	}

	/*
	 * NOTE(review): the page reference obtained from read_mapping_page
	 * appears to be kunmap'ed but never released (no page_cache_release)
	 * — possible page reference leak; confirm against sys_swapon.
	 */
	backing_swap_header = kmap(page);
	memcpy(s, backing_swap_header, sizeof(*s));
	/* Bad pages in the backing swap are not handled: refuse it. */
	if (s->info.nr_badpages) {
		pr_info("Cannot use backing swap with bad pages (%u)\n",
			s->info.nr_badpages);
		ret = -EINVAL;
	}
	/*
	 * ramzswap disksize equals number of usable pages in backing
	 * swap. Set last_page in swap header to match this disksize
	 * ('last_page' means 0-based index of last usable swap page).
	 */
	s->info.last_page = (rzs->disksize >> PAGE_SHIFT) - 1;
	kunmap(page);

out:
	return ret;
}
223 | ||
/*
 * Fill the userspace stats structure @s for device @rzs.
 * Detailed counters are only populated when CONFIG_RAMZSWAP_STATS is set;
 * otherwise only name/disksize/memlimit are reported.
 */
static void ramzswap_ioctl_get_stats(struct ramzswap *rzs,
			struct ramzswap_ioctl_stats *s)
{
	/* Bounded copy with explicit NUL termination. */
	strncpy(s->backing_swap_name, rzs->backing_swap_name,
		MAX_SWAP_NAME_LEN - 1);
	s->backing_swap_name[MAX_SWAP_NAME_LEN - 1] = '\0';

	s->disksize = rzs->disksize;
	s->memlimit = rzs->memlimit;

#if defined(CONFIG_RAMZSWAP_STATS)
	{
	struct ramzswap_stats *rs = &rzs->stats;
	size_t succ_writes, mem_used;
	unsigned int good_compress_perc = 0, no_compress_perc = 0;

	/* Total memory in use: pool memory + pages stored uncompressed. */
	mem_used = xv_get_total_size_bytes(rzs->mem_pool)
			+ (rs->pages_expand << PAGE_SHIFT);
	succ_writes = rzs_stat64_read(rzs, &rs->num_writes) -
			rzs_stat64_read(rzs, &rs->failed_writes);

	/* Percentages are only meaningful once something was stored. */
	if (succ_writes && rs->pages_stored) {
		good_compress_perc = rs->good_compress * 100
					/ rs->pages_stored;
		no_compress_perc = rs->pages_expand * 100
					/ rs->pages_stored;
	}

	s->num_reads = rzs_stat64_read(rzs, &rs->num_reads);
	s->num_writes = rzs_stat64_read(rzs, &rs->num_writes);
	s->failed_reads = rzs_stat64_read(rzs, &rs->failed_reads);
	s->failed_writes = rzs_stat64_read(rzs, &rs->failed_writes);
	s->invalid_io = rzs_stat64_read(rzs, &rs->invalid_io);
	s->notify_free = rzs_stat64_read(rzs, &rs->notify_free);
	s->pages_zero = rs->pages_zero;

	s->good_compress_pct = good_compress_perc;
	s->pages_expand_pct = no_compress_perc;

	s->pages_stored = rs->pages_stored;
	s->pages_used = mem_used >> PAGE_SHIFT;
	s->orig_data_size = rs->pages_stored << PAGE_SHIFT;
	s->compr_data_size = rs->compr_size;
	s->mem_used_total = mem_used;

	s->bdev_num_reads = rzs_stat64_read(rzs, &rs->bdev_num_reads);
	s->bdev_num_writes = rzs_stat64_read(rzs, &rs->bdev_num_writes);
	}
#endif /* CONFIG_RAMZSWAP_STATS */
}
274 | ||
275 | static int add_backing_swap_extent(struct ramzswap *rzs, | |
276 | pgoff_t phy_pagenum, | |
277 | pgoff_t num_pages) | |
278 | { | |
279 | unsigned int idx; | |
280 | struct list_head *head; | |
281 | struct page *curr_page, *new_page; | |
282 | unsigned int extents_per_page = PAGE_SIZE / | |
283 | sizeof(struct ramzswap_backing_extent); | |
284 | ||
285 | idx = rzs->num_extents % extents_per_page; | |
286 | if (!idx) { | |
287 | new_page = alloc_page(__GFP_ZERO); | |
288 | if (!new_page) | |
289 | return -ENOMEM; | |
290 | ||
291 | if (rzs->num_extents) { | |
292 | curr_page = virt_to_page(rzs->curr_extent); | |
293 | head = &curr_page->lru; | |
294 | } else { | |
295 | head = &rzs->backing_swap_extent_list; | |
296 | } | |
297 | ||
298 | list_add(&new_page->lru, head); | |
299 | rzs->curr_extent = page_address(new_page); | |
300 | } | |
301 | ||
302 | rzs->curr_extent->phy_pagenum = phy_pagenum; | |
303 | rzs->curr_extent->num_pages = num_pages; | |
304 | ||
305 | pr_debug("add_extent: idx=%u, phy_pgnum=%lu, num_pgs=%lu, " | |
306 | "pg_last=%lu, curr_ext=%p\n", idx, phy_pagenum, num_pages, | |
307 | phy_pagenum + num_pages - 1, rzs->curr_extent); | |
308 | ||
309 | if (idx != extents_per_page - 1) | |
310 | rzs->curr_extent++; | |
311 | ||
312 | return 0; | |
313 | } | |
314 | ||
/*
 * Build the extent list for a regular-file backing swap by probing the
 * file's on-disk block layout with bmap(). Only runs of blocks that are
 * PAGE_SIZE-long, PAGE_SIZE-aligned and contiguous on disk are usable;
 * anything else (holes, misalignment) fails the setup.
 *
 * On success *num_pages is set to the number of usable pages and 0 is
 * returned; on failure a negative errno is returned and any partially
 * built extent list is freed.
 */
static int setup_backing_swap_extents(struct ramzswap *rzs,
				struct inode *inode, unsigned long *num_pages)
{
	int ret = 0;
	unsigned blkbits;
	unsigned blocks_per_page;
	pgoff_t contig_pages = 0, total_pages = 0;
	pgoff_t pagenum = 0, prev_pagenum = 0;
	sector_t probe_block = 0;
	sector_t last_block;

	blkbits = inode->i_blkbits;
	blocks_per_page = PAGE_SIZE >> blkbits;

	last_block = i_size_read(inode) >> blkbits;
	while (probe_block + blocks_per_page <= last_block) {
		unsigned block_in_page;
		sector_t first_block;

		/* bmap() returning 0 means a hole in the file. */
		first_block = bmap(inode, probe_block);
		if (first_block == 0)
			goto bad_bmap;

		/* It must be PAGE_SIZE aligned on-disk */
		if (first_block & (blocks_per_page - 1)) {
			probe_block++;
			goto probe_next;
		}

		/* All blocks within this page must be contiguous on disk */
		for (block_in_page = 1; block_in_page < blocks_per_page;
					block_in_page++) {
			sector_t block;

			block = bmap(inode, probe_block + block_in_page);
			if (block == 0)
				goto bad_bmap;
			if (block != first_block + block_in_page) {
				/* Discontiguity */
				probe_block++;
				goto probe_next;
			}
		}

		/*
		 * We found a PAGE_SIZE length, PAGE_SIZE aligned
		 * run of blocks.
		 */
		pagenum = first_block >> (PAGE_SHIFT - blkbits);

		/*
		 * Physical discontinuity from the previous page: flush the
		 * accumulated contiguous run as one extent and start over.
		 */
		if (total_pages && (pagenum != prev_pagenum + 1)) {
			ret = add_backing_swap_extent(rzs, prev_pagenum -
					(contig_pages - 1), contig_pages);
			if (ret < 0)
				goto out;
			rzs->num_extents++;
			contig_pages = 0;
		}
		total_pages++;
		contig_pages++;
		prev_pagenum = pagenum;
		probe_block += blocks_per_page;

probe_next:
		continue;
	}

	/* Flush the final run, if any. */
	if (contig_pages) {
		pr_debug("adding last extent: pagenum=%lu, "
			"contig_pages=%lu\n", pagenum, contig_pages);
		ret = add_backing_swap_extent(rzs,
			prev_pagenum - (contig_pages - 1), contig_pages);
		if (ret < 0)
			goto out;
		rzs->num_extents++;
	}
	if (!rzs->num_extents) {
		pr_err("No swap extents found!\n");
		ret = -EINVAL;
	}

	if (!ret) {
		*num_pages = total_pages;
		pr_info("Found %lu extents containing %luk\n",
			rzs->num_extents, *num_pages << (PAGE_SHIFT - 10));
	}
	goto out;

bad_bmap:
	pr_err("Backing swapfile has holes\n");
	ret = -EINVAL;
out:
	/* On error, free every extent page allocated so far. */
	while (ret && !list_empty(&rzs->backing_swap_extent_list)) {
		struct page *page;
		struct list_head *entry = rzs->backing_swap_extent_list.next;
		page = list_entry(entry, struct page, lru);
		list_del(entry);
		__free_page(page);
	}
	return ret;
}
416 | ||
/*
 * Precompute, for every page of the rzs->table array, which backing-swap
 * extent covers its first entry. The extent pointer is stashed in the
 * table page's ->mapping and the offset of the first entry within that
 * extent in ->private, so map_backing_swap_page() can start its walk
 * from the right extent instead of from the list head.
 *
 * No-op for a block-device backing swap (num_extents == 0).
 */
static void map_backing_swap_extents(struct ramzswap *rzs)
{
	struct ramzswap_backing_extent *se;
	struct page *table_page, *se_page;
	unsigned long num_pages, num_table_pages, entry;
	unsigned long se_idx, span;
	unsigned entries_per_page = PAGE_SIZE / sizeof(*rzs->table);
	unsigned extents_per_page = PAGE_SIZE / sizeof(*se);

	/* True for block device */
	if (!rzs->num_extents)
		return;

	se_page = list_entry(rzs->backing_swap_extent_list.next,
			struct page, lru);
	se = page_address(se_page);
	/* 'span' = total pages covered by extents walked so far. */
	span = se->num_pages;
	num_pages = rzs->disksize >> PAGE_SHIFT;
	num_table_pages = DIV_ROUND_UP(num_pages * sizeof(*rzs->table),
					PAGE_SIZE);

	entry = 0;
	se_idx = 0;
	while (num_table_pages--) {
		table_page = vmalloc_to_page(&rzs->table[entry]);
		/* Advance extents until one covers logical page 'entry'. */
		while (span <= entry) {
			se_idx++;
			/* Running past the last extent means corrupt state. */
			if (se_idx == rzs->num_extents)
				BUG();

			/* Extents may span multiple list-linked pages. */
			if (!(se_idx % extents_per_page)) {
				se_page = list_entry(se_page->lru.next,
						struct page, lru);
				se = page_address(se_page);
			} else
				se++;

			span += se->num_pages;
		}
		/* Cache extent + offset-within-extent for this table page. */
		table_page->mapping = (struct address_space *)se;
		table_page->private = se->num_pages - (span - entry);
		pr_debug("map_table: entry=%lu, span=%lu, map=%p, priv=%lu\n",
			entry, span, table_page->mapping, table_page->private);
		entry += entries_per_page;
	}
}
463 | ||
464 | /* | |
465 | * Check if value of backing_swap module param is sane. | |
466 | * Claim this device and set ramzswap size equal to | |
467 | * size of this block device. | |
468 | */ | |
469 | static int setup_backing_swap(struct ramzswap *rzs) | |
470 | { | |
471 | int ret = 0; | |
472 | size_t disksize; | |
473 | unsigned long num_pages = 0; | |
474 | struct inode *inode; | |
475 | struct file *swap_file; | |
476 | struct address_space *mapping; | |
477 | struct block_device *bdev = NULL; | |
478 | ||
479 | if (!rzs->backing_swap_name[0]) { | |
480 | pr_debug("backing_swap param not given\n"); | |
481 | goto out; | |
482 | } | |
483 | ||
484 | pr_info("Using backing swap device: %s\n", rzs->backing_swap_name); | |
485 | ||
486 | swap_file = filp_open(rzs->backing_swap_name, | |
487 | O_RDWR | O_LARGEFILE, 0); | |
488 | if (IS_ERR(swap_file)) { | |
489 | pr_err("Error opening backing device: %s\n", | |
490 | rzs->backing_swap_name); | |
491 | ret = -EINVAL; | |
492 | goto out; | |
493 | } | |
494 | ||
495 | mapping = swap_file->f_mapping; | |
496 | inode = mapping->host; | |
497 | ||
498 | if (S_ISBLK(inode->i_mode)) { | |
499 | bdev = I_BDEV(inode); | |
500 | ret = bd_claim(bdev, setup_backing_swap); | |
501 | if (ret < 0) { | |
502 | bdev = NULL; | |
503 | goto bad_param; | |
504 | } | |
505 | disksize = i_size_read(inode); | |
c25d75a2 NG |
506 | /* |
507 | * Can happen if user gives an extended partition as | |
508 | * backing swap or simply a bad disk. | |
509 | */ | |
510 | if (!disksize) { | |
511 | pr_err("Error reading backing swap size.\n"); | |
512 | goto bad_param; | |
513 | } | |
306b0c95 NG |
514 | } else if (S_ISREG(inode->i_mode)) { |
515 | bdev = inode->i_sb->s_bdev; | |
516 | if (IS_SWAPFILE(inode)) { | |
517 | ret = -EBUSY; | |
518 | goto bad_param; | |
519 | } | |
520 | ret = setup_backing_swap_extents(rzs, inode, &num_pages); | |
521 | if (ret < 0) | |
522 | goto bad_param; | |
523 | disksize = num_pages << PAGE_SHIFT; | |
524 | } else { | |
525 | goto bad_param; | |
526 | } | |
527 | ||
528 | rzs->swap_file = swap_file; | |
529 | rzs->backing_swap = bdev; | |
530 | rzs->disksize = disksize; | |
306b0c95 NG |
531 | |
532 | return 0; | |
533 | ||
534 | bad_param: | |
535 | if (bdev) | |
536 | bd_release(bdev); | |
537 | filp_close(swap_file, NULL); | |
538 | ||
539 | out: | |
540 | rzs->backing_swap = NULL; | |
541 | return ret; | |
542 | } | |
543 | ||
/*
 * Map logical page number 'pagenum' to physical page number
 * on backing swap device. For block device, this is a nop.
 *
 * For a file-backed swap, starts from the extent cached on the table
 * page that holds rzs->table[pagenum] (set up by map_backing_swap_extents)
 * and walks forward through the extent list until the extent covering
 * 'pagenum' is found.
 */
static u32 map_backing_swap_page(struct ramzswap *rzs, u32 pagenum)
{
	u32 skip_pages, entries_per_page;
	size_t delta, se_offset, skipped;
	struct page *table_page, *se_page;
	struct ramzswap_backing_extent *se;

	/* Block device: identity mapping. */
	if (!rzs->num_extents)
		return pagenum;

	entries_per_page = PAGE_SIZE / sizeof(*rzs->table);

	/* Extent covering the FIRST entry of this table page (cached). */
	table_page = vmalloc_to_page(&rzs->table[pagenum]);
	se = (struct ramzswap_backing_extent *)table_page->mapping;
	se_page = virt_to_page(se);

	/* Offset of 'pagenum' within its table page. */
	skip_pages = pagenum - (pagenum / entries_per_page * entries_per_page);
	/* table_page->private = offset of first entry within extent 'se'. */
	se_offset = table_page->private + skip_pages;

	/* Fast path: target still falls inside the cached extent. */
	if (se_offset < se->num_pages)
		return se->phy_pagenum + se_offset;

	/* Pages of the cached extent that lie at/after the first entry. */
	skipped = se->num_pages - table_page->private;
	do {
		struct ramzswap_backing_extent *se_base;
		u32 se_entries_per_page = PAGE_SIZE / sizeof(*se);

		/* Get next swap extent */
		se_base = (struct ramzswap_backing_extent *)
				page_address(se_page);
		/* Last extent in this page: follow the lru link. */
		if (se - se_base == se_entries_per_page - 1) {
			se_page = list_entry(se_page->lru.next,
					struct page, lru);
			se = page_address(se_page);
		} else {
			se++;
		}

		skipped += se->num_pages;
	} while (skipped < skip_pages);

	/* 'se' now covers 'pagenum'; compute offset from its start. */
	delta = skipped - skip_pages;
	se_offset = se->num_pages - delta;

	return se->phy_pagenum + se_offset;
}
594 | ||
/*
 * Free whatever is stored for table slot @index (zero-flag, uncompressed
 * page, or compressed object) and update statistics accordingly.
 */
static void ramzswap_free_page(struct ramzswap *rzs, size_t index)
{
	u32 clen;
	void *obj;

	struct page *page = rzs->table[index].page;
	u32 offset = rzs->table[index].offset;

	if (unlikely(!page)) {
		/*
		 * No memory is allocated for zero filled pages.
		 * Simply clear zero page flag.
		 */
		if (rzs_test_flag(rzs, index, RZS_ZERO)) {
			rzs_clear_flag(rzs, index, RZS_ZERO);
			rzs_stat_dec(&rzs->stats.pages_zero);
		}
		return;
	}

	/* Uncompressed pages occupy a whole page of their own. */
	if (unlikely(rzs_test_flag(rzs, index, RZS_UNCOMPRESSED))) {
		clen = PAGE_SIZE;
		__free_page(page);
		rzs_clear_flag(rzs, index, RZS_UNCOMPRESSED);
		rzs_stat_dec(&rzs->stats.pages_expand);
		goto out;
	}

	/* Compressed object: query its size, then return it to the pool. */
	obj = kmap_atomic(page, KM_USER0) + offset;
	clen = xv_get_object_size(obj) - sizeof(struct zobj_header);
	kunmap_atomic(obj, KM_USER0);

	xv_free(rzs->mem_pool, page, offset);
	/* Pages compressing to <= half a page count as "good compress". */
	if (clen <= PAGE_SIZE / 2)
		rzs_stat_dec(&rzs->stats.good_compress);

out:
	rzs->stats.compr_size -= clen;
	rzs_stat_dec(&rzs->stats.pages_stored);

	/* Mark the slot empty. */
	rzs->table[index].page = NULL;
	rzs->table[index].offset = 0;
}
638 | ||
639 | static int handle_zero_page(struct bio *bio) | |
640 | { | |
641 | void *user_mem; | |
642 | struct page *page = bio->bi_io_vec[0].bv_page; | |
643 | ||
644 | user_mem = kmap_atomic(page, KM_USER0); | |
645 | memset(user_mem, 0, PAGE_SIZE); | |
646 | kunmap_atomic(user_mem, KM_USER0); | |
647 | ||
30fb8a71 | 648 | flush_dcache_page(page); |
306b0c95 NG |
649 | |
650 | set_bit(BIO_UPTODATE, &bio->bi_flags); | |
651 | bio_endio(bio, 0); | |
652 | return 0; | |
653 | } | |
654 | ||
655 | static int handle_uncompressed_page(struct ramzswap *rzs, struct bio *bio) | |
656 | { | |
657 | u32 index; | |
658 | struct page *page; | |
659 | unsigned char *user_mem, *cmem; | |
660 | ||
661 | page = bio->bi_io_vec[0].bv_page; | |
662 | index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT; | |
663 | ||
664 | user_mem = kmap_atomic(page, KM_USER0); | |
665 | cmem = kmap_atomic(rzs->table[index].page, KM_USER1) + | |
666 | rzs->table[index].offset; | |
667 | ||
668 | memcpy(user_mem, cmem, PAGE_SIZE); | |
669 | kunmap_atomic(user_mem, KM_USER0); | |
670 | kunmap_atomic(cmem, KM_USER1); | |
671 | ||
30fb8a71 | 672 | flush_dcache_page(page); |
306b0c95 NG |
673 | |
674 | set_bit(BIO_UPTODATE, &bio->bi_flags); | |
675 | bio_endio(bio, 0); | |
676 | return 0; | |
677 | } | |
678 | ||
/*
 * Called when request page is not present in ramzswap.
 * Its either in backing swap device (if present) or
 * this is an attempt to read before any previous write
 * to this location - this happens due to readahead when
 * swap device is read from user-space (e.g. during swapon)
 *
 * Returns 1 when the bio was redirected to the backing device (caller
 * must resubmit it), 0 when the bio was completed here.
 */
static int handle_ramzswap_fault(struct ramzswap *rzs, struct bio *bio)
{
	/*
	 * Always forward such requests to backing swap
	 * device (if present)
	 */
	if (rzs->backing_swap) {
		u32 pagenum;
		/* Re-attribute the read from ramzswap to the backing dev. */
		rzs_stat64_dec(rzs, &rzs->stats.num_reads);
		rzs_stat64_inc(rzs, &rzs->stats.bdev_num_reads);
		bio->bi_bdev = rzs->backing_swap;

		/*
		 * In case backing swap is a file, find the right offset within
		 * the file corresponding to logical position 'index'. For block
		 * device, this is a nop.
		 */
		pagenum = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
		bio->bi_sector = map_backing_swap_page(rzs, pagenum)
					<< SECTORS_PER_PAGE_SHIFT;
		return 1;
	}

	/*
	 * Its unlikely event in case backing dev is
	 * not present
	 */
	pr_debug("Read before write on swap device: "
		"sector=%lu, size=%u, offset=%u\n",
		(ulong)(bio->bi_sector), bio->bi_size,
		bio->bi_io_vec[0].bv_offset);

	/* Do nothing. Just return success */
	set_bit(BIO_UPTODATE, &bio->bi_flags);
	bio_endio(bio, 0);
	return 0;
}
723 | ||
/*
 * Service a read bio: hand zero pages, uncompressed pages and faults to
 * their dedicated helpers, otherwise LZO-decompress the stored object
 * into the caller's page.
 *
 * Returns 0 when the bio was completed here, 1 when it was redirected
 * to the backing device (via handle_ramzswap_fault).
 */
static int ramzswap_read(struct ramzswap *rzs, struct bio *bio)
{
	int ret;
	u32 index;
	size_t clen;
	struct page *page;
	struct zobj_header *zheader;
	unsigned char *user_mem, *cmem;

	rzs_stat64_inc(rzs, &rzs->stats.num_reads);

	page = bio->bi_io_vec[0].bv_page;
	index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;

	if (rzs_test_flag(rzs, index, RZS_ZERO))
		return handle_zero_page(bio);

	/* Requested page is not present in compressed area */
	if (!rzs->table[index].page)
		return handle_ramzswap_fault(rzs, bio);

	/* Page is stored uncompressed since it's incompressible */
	if (unlikely(rzs_test_flag(rzs, index, RZS_UNCOMPRESSED)))
		return handle_uncompressed_page(rzs, bio);

	user_mem = kmap_atomic(page, KM_USER0);
	/* clen is in/out: pass capacity, get decompressed length back. */
	clen = PAGE_SIZE;

	cmem = kmap_atomic(rzs->table[index].page, KM_USER1) +
		rzs->table[index].offset;

	/* Stored object = zobj_header followed by compressed data. */
	ret = lzo1x_decompress_safe(
		cmem + sizeof(*zheader),
		xv_get_object_size(cmem) - sizeof(*zheader),
		user_mem, &clen);

	kunmap_atomic(user_mem, KM_USER0);
	kunmap_atomic(cmem, KM_USER1);

	/* should NEVER happen */
	if (unlikely(ret != LZO_E_OK)) {
		pr_err("Decompression failed! err=%d, page=%u\n",
			ret, index);
		rzs_stat64_inc(rzs, &rzs->stats.failed_reads);
		goto out;
	}

	flush_dcache_page(page);

	set_bit(BIO_UPTODATE, &bio->bi_flags);
	bio_endio(bio, 0);
	return 0;

out:
	bio_io_error(bio);
	return 0;
}
781 | ||
/*
 * Service a write bio: free any previous data at this slot, then store
 * the page as a zero-flag, a compressed object, or (for incompressible
 * pages without backing swap) an uncompressed copy. Writes that cannot
 * be held in memory are forwarded to the backing swap device.
 *
 * Returns 0 when the bio was completed (or errored) here, 1 when it was
 * redirected to the backing device and must be resubmitted by the caller.
 */
static int ramzswap_write(struct ramzswap *rzs, struct bio *bio)
{
	int ret, fwd_write_request = 0;
	u32 offset, index;
	size_t clen;
	struct zobj_header *zheader;
	struct page *page, *page_store;
	unsigned char *user_mem, *cmem, *src;

	rzs_stat64_inc(rzs, &rzs->stats.num_writes);

	page = bio->bi_io_vec[0].bv_page;
	index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;

	/* Compression output goes to the per-device scratch buffer. */
	src = rzs->compress_buffer;

	/*
	 * System swaps to same sector again when the stored page
	 * is no longer referenced by any process. So, its now safe
	 * to free the memory that was allocated for this page.
	 */
	if (rzs->table[index].page || rzs_test_flag(rzs, index, RZS_ZERO))
		ramzswap_free_page(rzs, index);

	/* rzs->lock serializes use of compress_buffer and the pool. */
	mutex_lock(&rzs->lock);

	user_mem = kmap_atomic(page, KM_USER0);
	/* Zero pages need no storage: just set the flag. */
	if (page_zero_filled(user_mem)) {
		kunmap_atomic(user_mem, KM_USER0);
		mutex_unlock(&rzs->lock);
		rzs_stat_inc(&rzs->stats.pages_zero);
		rzs_set_flag(rzs, index, RZS_ZERO);

		set_bit(BIO_UPTODATE, &bio->bi_flags);
		bio_endio(bio, 0);
		return 0;
	}

	/* Memory limit reached: forward this write to backing swap. */
	if (rzs->backing_swap &&
		(rzs->stats.compr_size > rzs->memlimit - PAGE_SIZE)) {
		kunmap_atomic(user_mem, KM_USER0);
		mutex_unlock(&rzs->lock);
		fwd_write_request = 1;
		goto out;
	}

	ret = lzo1x_1_compress(user_mem, PAGE_SIZE, src, &clen,
				rzs->compress_workmem);

	kunmap_atomic(user_mem, KM_USER0);

	if (unlikely(ret != LZO_E_OK)) {
		mutex_unlock(&rzs->lock);
		pr_err("Compression failed! err=%d\n", ret);
		rzs_stat64_inc(rzs, &rzs->stats.failed_writes);
		goto out;
	}

	/*
	 * Page is incompressible. Forward it to backing swap
	 * if present. Otherwise, store it as-is (uncompressed)
	 * since we do not want to return too many swap write
	 * errors which has side effect of hanging the system.
	 */
	if (unlikely(clen > max_zpage_size)) {
		if (rzs->backing_swap) {
			mutex_unlock(&rzs->lock);
			fwd_write_request = 1;
			goto out;
		}

		clen = PAGE_SIZE;
		page_store = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
		if (unlikely(!page_store)) {
			mutex_unlock(&rzs->lock);
			pr_info("Error allocating memory for incompressible "
				"page: %u\n", index);
			rzs_stat64_inc(rzs, &rzs->stats.failed_writes);
			goto out;
		}

		offset = 0;
		rzs_set_flag(rzs, index, RZS_UNCOMPRESSED);
		rzs_stat_inc(&rzs->stats.pages_expand);
		rzs->table[index].page = page_store;
		/* Store straight from the caller's page, not the buffer. */
		src = kmap_atomic(page, KM_USER0);
		goto memstore;
	}

	/* Allocate room for header + compressed data from the pool. */
	if (xv_malloc(rzs->mem_pool, clen + sizeof(*zheader),
			&rzs->table[index].page, &offset,
			GFP_NOIO | __GFP_HIGHMEM)) {
		mutex_unlock(&rzs->lock);
		pr_info("Error allocating memory for compressed "
			"page: %u, size=%zu\n", index, clen);
		rzs_stat64_inc(rzs, &rzs->stats.failed_writes);
		if (rzs->backing_swap)
			fwd_write_request = 1;
		goto out;
	}

memstore:
	rzs->table[index].offset = offset;

	cmem = kmap_atomic(rzs->table[index].page, KM_USER1) +
		rzs->table[index].offset;

#if 0
	/* Back-reference needed for memory defragmentation */
	if (!rzs_test_flag(rzs, index, RZS_UNCOMPRESSED)) {
		zheader = (struct zobj_header *)cmem;
		zheader->table_idx = index;
		cmem += sizeof(*zheader);
	}
#endif

	memcpy(cmem, src, clen);

	kunmap_atomic(cmem, KM_USER1);
	/* Only the uncompressed path re-mapped the source page above. */
	if (unlikely(rzs_test_flag(rzs, index, RZS_UNCOMPRESSED)))
		kunmap_atomic(src, KM_USER0);

	/* Update stats */
	rzs->stats.compr_size += clen;
	rzs_stat_inc(&rzs->stats.pages_stored);
	if (clen <= PAGE_SIZE / 2)
		rzs_stat_inc(&rzs->stats.good_compress);

	mutex_unlock(&rzs->lock);

	set_bit(BIO_UPTODATE, &bio->bi_flags);
	bio_endio(bio, 0);
	return 0;

out:
	if (fwd_write_request) {
		rzs_stat64_inc(rzs, &rzs->stats.bdev_num_writes);
		bio->bi_bdev = rzs->backing_swap;
#if 0
		/*
		 * TODO: We currently have linear mapping of ramzswap and
		 * backing swap sectors. This is not desired since we want
		 * to optimize writes to backing swap to minimize disk seeks
		 * or have effective wear leveling (for SSDs). Also, a
		 * non-linear mapping is required to implement compressed
		 * on-disk swapping.
		 */
		bio->bi_sector = get_backing_swap_page()
				<< SECTORS_PER_PAGE_SHIFT;
#endif
		/*
		 * In case backing swap is a file, find the right offset within
		 * the file corresponding to logical position 'index'. For block
		 * device, this is a nop.
		 */
		bio->bi_sector = map_backing_swap_page(rzs, index)
					<< SECTORS_PER_PAGE_SHIFT;
		return 1;
	}

	bio_io_error(bio);
	return 0;
}
945 | ||
306b0c95 NG |
946 | /* |
947 | * Check if request is within bounds and page aligned. | |
948 | */ | |
949 | static inline int valid_swap_request(struct ramzswap *rzs, struct bio *bio) | |
950 | { | |
951 | if (unlikely( | |
952 | (bio->bi_sector >= (rzs->disksize >> SECTOR_SHIFT)) || | |
953 | (bio->bi_sector & (SECTORS_PER_PAGE - 1)) || | |
954 | (bio->bi_vcnt != 1) || | |
955 | (bio->bi_size != PAGE_SIZE) || | |
956 | (bio->bi_io_vec[0].bv_offset != 0))) { | |
957 | ||
958 | return 0; | |
959 | } | |
960 | ||
961 | /* swap request is valid */ | |
962 | return 1; | |
963 | } | |
964 | ||
965 | /* | |
966 | * Handler function for all ramzswap I/O requests. | |
967 | */ | |
968 | static int ramzswap_make_request(struct request_queue *queue, struct bio *bio) | |
969 | { | |
970 | int ret = 0; | |
971 | struct ramzswap *rzs = queue->queuedata; | |
972 | ||
973 | if (unlikely(!rzs->init_done)) { | |
974 | bio_io_error(bio); | |
975 | return 0; | |
976 | } | |
977 | ||
978 | if (!valid_swap_request(rzs, bio)) { | |
6a907728 | 979 | rzs_stat64_inc(rzs, &rzs->stats.invalid_io); |
306b0c95 NG |
980 | bio_io_error(bio); |
981 | return 0; | |
982 | } | |
983 | ||
984 | switch (bio_data_dir(bio)) { | |
985 | case READ: | |
986 | ret = ramzswap_read(rzs, bio); | |
987 | break; | |
988 | ||
989 | case WRITE: | |
990 | ret = ramzswap_write(rzs, bio); | |
991 | break; | |
992 | } | |
993 | ||
994 | return ret; | |
995 | } | |
996 | ||
/*
 * Release every per-device resource and return the device to its
 * pristine, uninitialized state. Used by RZSIO_RESET and by the
 * init-failure unwind path, so it must tolerate partially-built state
 * (NULL table, NULL pool, absent backing swap).
 */
static void reset_device(struct ramzswap *rzs)
{
	int is_backing_blkdev = 0;
	size_t index, num_pages;
	unsigned entries_per_page;
	unsigned long num_table_pages, entry = 0;

	/* Do not accept any new I/O request */
	rzs->init_done = 0;

	/*
	 * A backing swap with no extents is a raw block device (extents
	 * only exist for file-backed swap); remember this before the
	 * state below is torn down.
	 */
	if (rzs->backing_swap && !rzs->num_extents)
		is_backing_blkdev = 1;

	num_pages = rzs->disksize >> PAGE_SHIFT;

	/* Free various per-device buffers */
	kfree(rzs->compress_workmem);
	/* compress_buffer was an order-1 allocation (see init). */
	free_pages((unsigned long)rzs->compress_buffer, 1);

	rzs->compress_workmem = NULL;
	rzs->compress_buffer = NULL;

	/* Free all pages that are still in this ramzswap device */
	for (index = 0; index < num_pages; index++) {
		struct page *page;
		u16 offset;

		page = rzs->table[index].page;
		offset = rzs->table[index].offset;

		if (!page)
			continue;

		/*
		 * Uncompressed entries own a whole page; compressed
		 * entries live inside the xv memory pool.
		 */
		if (unlikely(rzs_test_flag(rzs, index, RZS_UNCOMPRESSED)))
			__free_page(page);
		else
			xv_free(rzs->mem_pool, page, offset);
	}

	entries_per_page = PAGE_SIZE / sizeof(*rzs->table);
	num_table_pages = DIV_ROUND_UP(num_pages * sizeof(*rzs->table),
					PAGE_SIZE);
	/*
	 * Set page->mapping to NULL for every table page.
	 * Otherwise, we will hit bad_page() during free.
	 */
	while (rzs->num_extents && num_table_pages--) {
		struct page *page;
		page = vmalloc_to_page(&rzs->table[entry]);
		page->mapping = NULL;
		entry += entries_per_page;
	}
	vfree(rzs->table);
	rzs->table = NULL;

	xv_destroy_pool(rzs->mem_pool);
	rzs->mem_pool = NULL;

	/* Free all swap extent pages (file-backed swap bookkeeping) */
	while (!list_empty(&rzs->backing_swap_extent_list)) {
		struct page *page;
		struct list_head *entry;
		entry = rzs->backing_swap_extent_list.next;
		page = list_entry(entry, struct page, lru);
		list_del(entry);
		__free_page(page);
	}
	INIT_LIST_HEAD(&rzs->backing_swap_extent_list);
	rzs->num_extents = 0;

	/* Close backing swap device, if present */
	if (rzs->backing_swap) {
		/* Raw block devices were claimed with bd_claim at setup. */
		if (is_backing_blkdev)
			bd_release(rzs->backing_swap);
		filp_close(rzs->swap_file, NULL);
		rzs->backing_swap = NULL;
		memset(rzs->backing_swap_name, 0, MAX_SWAP_NAME_LEN);
	}

	/* Reset stats */
	memset(&rzs->stats, 0, sizeof(rzs->stats));

	rzs->disksize = 0;
	rzs->memlimit = 0;
}
1082 | ||
/*
 * RZSIO_INIT handler: allocate all per-device resources (compressor
 * working memory and buffer, page table, xv memory pool), write the
 * swap header into page 0 and mark the device ready for I/O.
 *
 * Any failure unwinds through reset_device(), which tolerates the
 * partially-built state. Returns 0 on success or a negative errno.
 */
static int ramzswap_ioctl_init_device(struct ramzswap *rzs)
{
	int ret;
	size_t num_pages;
	struct page *page;
	union swap_header *swap_header;

	if (rzs->init_done) {
		pr_info("Device already initialized!\n");
		return -EBUSY;
	}

	ret = setup_backing_swap(rzs);
	if (ret)
		goto fail;

	/* Apply totalram-based defaults for limits not set via ioctl. */
	if (rzs->backing_swap)
		ramzswap_set_memlimit(rzs, totalram_pages << PAGE_SHIFT);
	else
		ramzswap_set_disksize(rzs, totalram_pages << PAGE_SHIFT);

	rzs->compress_workmem = kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
	if (!rzs->compress_workmem) {
		pr_err("Error allocating compressor working memory!\n");
		ret = -ENOMEM;
		goto fail;
	}

	/*
	 * Order-1 (two pages) -- presumably to hold compressed output
	 * that can exceed PAGE_SIZE in the worst case; TODO confirm.
	 * reset_device() frees it with the matching order.
	 */
	rzs->compress_buffer = (void *)__get_free_pages(__GFP_ZERO, 1);
	if (!rzs->compress_buffer) {
		pr_err("Error allocating compressor buffer space\n");
		ret = -ENOMEM;
		goto fail;
	}

	/* One table entry per swap page of the device. */
	num_pages = rzs->disksize >> PAGE_SHIFT;
	rzs->table = vmalloc(num_pages * sizeof(*rzs->table));
	if (!rzs->table) {
		pr_err("Error allocating ramzswap address table\n");
		/* To prevent accessing table entries during cleanup */
		rzs->disksize = 0;
		ret = -ENOMEM;
		goto fail;
	}
	memset(rzs->table, 0, num_pages * sizeof(*rzs->table));

	map_backing_swap_extents(rzs);

	/* Page 0 holds the swap header and is stored uncompressed. */
	page = alloc_page(__GFP_ZERO);
	if (!page) {
		pr_err("Error allocating swap header page\n");
		ret = -ENOMEM;
		goto fail;
	}
	rzs->table[0].page = page;
	rzs_set_flag(rzs, 0, RZS_UNCOMPRESSED);

	swap_header = kmap(page);
	ret = setup_swap_header(rzs, swap_header);
	kunmap(page);
	if (ret) {
		pr_err("Error setting swap header\n");
		goto fail;
	}

	set_capacity(rzs->disk, rzs->disksize >> SECTOR_SHIFT);

	/*
	 * We have ident mapping of sectors for ramzswap and
	 * and the backing swap device. So, this queue flag
	 * should be according to backing dev.
	 */
	if (!rzs->backing_swap ||
			blk_queue_nonrot(rzs->backing_swap->bd_disk->queue))
		queue_flag_set_unlocked(QUEUE_FLAG_NONROT, rzs->disk->queue);

	rzs->mem_pool = xv_create_pool();
	if (!rzs->mem_pool) {
		pr_err("Error creating memory pool\n");
		ret = -ENOMEM;
		goto fail;
	}

	/*
	 * Pages that compress to size greater than this are forwarded
	 * to physical swap disk (if backing dev is provided)
	 * TODO: make this configurable
	 */
	if (rzs->backing_swap)
		max_zpage_size = max_zpage_size_bdev;
	else
		max_zpage_size = max_zpage_size_nobdev;
	pr_debug("Max compressed page size: %u bytes\n", max_zpage_size);

	/* From this point on, ramzswap_make_request accepts I/O. */
	rzs->init_done = 1;

	pr_debug("Initialization done!\n");
	return 0;

fail:
	reset_device(rzs);

	pr_err("Initialization failed: err=%d\n", ret);
	return ret;
}
1188 | ||
/*
 * RZSIO_RESET handler: tear the device down if it was ever
 * initialized; a no-op otherwise. Always returns 0.
 */
static int ramzswap_ioctl_reset_device(struct ramzswap *rzs)
{
	if (rzs->init_done)
		reset_device(rzs);

	return 0;
}
1196 | ||
1197 | static int ramzswap_ioctl(struct block_device *bdev, fmode_t mode, | |
1198 | unsigned int cmd, unsigned long arg) | |
1199 | { | |
1200 | int ret = 0; | |
1201 | size_t disksize_kb, memlimit_kb; | |
1202 | ||
1203 | struct ramzswap *rzs = bdev->bd_disk->private_data; | |
1204 | ||
1205 | switch (cmd) { | |
1206 | case RZSIO_SET_DISKSIZE_KB: | |
1207 | if (rzs->init_done) { | |
1208 | ret = -EBUSY; | |
1209 | goto out; | |
1210 | } | |
1211 | if (copy_from_user(&disksize_kb, (void *)arg, | |
1212 | _IOC_SIZE(cmd))) { | |
1213 | ret = -EFAULT; | |
1214 | goto out; | |
1215 | } | |
1216 | rzs->disksize = disksize_kb << 10; | |
1217 | pr_info("Disk size set to %zu kB\n", disksize_kb); | |
1218 | break; | |
1219 | ||
1220 | case RZSIO_SET_MEMLIMIT_KB: | |
1221 | if (rzs->init_done) { | |
1222 | /* TODO: allow changing memlimit */ | |
1223 | ret = -EBUSY; | |
1224 | goto out; | |
1225 | } | |
1226 | if (copy_from_user(&memlimit_kb, (void *)arg, | |
1227 | _IOC_SIZE(cmd))) { | |
1228 | ret = -EFAULT; | |
1229 | goto out; | |
1230 | } | |
1231 | rzs->memlimit = memlimit_kb << 10; | |
1232 | pr_info("Memory limit set to %zu kB\n", memlimit_kb); | |
1233 | break; | |
1234 | ||
1235 | case RZSIO_SET_BACKING_SWAP: | |
1236 | if (rzs->init_done) { | |
1237 | ret = -EBUSY; | |
1238 | goto out; | |
1239 | } | |
1240 | ||
1241 | if (copy_from_user(&rzs->backing_swap_name, (void *)arg, | |
1242 | _IOC_SIZE(cmd))) { | |
1243 | ret = -EFAULT; | |
1244 | goto out; | |
1245 | } | |
1246 | rzs->backing_swap_name[MAX_SWAP_NAME_LEN - 1] = '\0'; | |
1247 | pr_info("Backing swap set to %s\n", rzs->backing_swap_name); | |
1248 | break; | |
1249 | ||
1250 | case RZSIO_GET_STATS: | |
1251 | { | |
1252 | struct ramzswap_ioctl_stats *stats; | |
1253 | if (!rzs->init_done) { | |
1254 | ret = -ENOTTY; | |
1255 | goto out; | |
1256 | } | |
1257 | stats = kzalloc(sizeof(*stats), GFP_KERNEL); | |
1258 | if (!stats) { | |
1259 | ret = -ENOMEM; | |
1260 | goto out; | |
1261 | } | |
1262 | ramzswap_ioctl_get_stats(rzs, stats); | |
1263 | if (copy_to_user((void *)arg, stats, sizeof(*stats))) { | |
1264 | kfree(stats); | |
1265 | ret = -EFAULT; | |
1266 | goto out; | |
1267 | } | |
1268 | kfree(stats); | |
1269 | break; | |
1270 | } | |
1271 | case RZSIO_INIT: | |
1272 | ret = ramzswap_ioctl_init_device(rzs); | |
1273 | break; | |
1274 | ||
1275 | case RZSIO_RESET: | |
1276 | /* Do not reset an active device! */ | |
1277 | if (bdev->bd_holders) { | |
1278 | ret = -EBUSY; | |
1279 | goto out; | |
1280 | } | |
7eef7533 NG |
1281 | |
1282 | /* Make sure all pending I/O is finished */ | |
1283 | if (bdev) | |
1284 | fsync_bdev(bdev); | |
1285 | ||
306b0c95 NG |
1286 | ret = ramzswap_ioctl_reset_device(rzs); |
1287 | break; | |
1288 | ||
1289 | default: | |
1290 | pr_info("Invalid ioctl %u\n", cmd); | |
1291 | ret = -ENOTTY; | |
1292 | } | |
1293 | ||
1294 | out: | |
1295 | return ret; | |
1296 | } | |
1297 | ||
/*
 * Block device operations: only ioctl is implemented; no open/release
 * hooks are needed for this device.
 */
static struct block_device_operations ramzswap_devops = {
	.ioctl = ramzswap_ioctl,
	.owner = THIS_MODULE,
};
1302 | ||
/*
 * Allocate and register the request queue and gendisk for a single
 * ramzswap device identified by device_id (also used as the minor
 * number and disk-name suffix). Returns 0 on success, -ENOMEM on
 * allocation failure (with any queue already allocated cleaned up).
 */
static int create_device(struct ramzswap *rzs, int device_id)
{
	int ret = 0;

	mutex_init(&rzs->lock);
	spin_lock_init(&rzs->stat64_lock);
	INIT_LIST_HEAD(&rzs->backing_swap_extent_list);

	rzs->queue = blk_alloc_queue(GFP_KERNEL);
	if (!rzs->queue) {
		pr_err("Error allocating disk queue for device %d\n",
			device_id);
		ret = -ENOMEM;
		goto out;
	}

	/* All bios go straight to ramzswap_make_request; no elevator. */
	blk_queue_make_request(rzs->queue, ramzswap_make_request);
	rzs->queue->queuedata = rzs;

	/* gendisk structure */
	rzs->disk = alloc_disk(1);
	if (!rzs->disk) {
		blk_cleanup_queue(rzs->queue);
		pr_warning("Error allocating disk structure for device %d\n",
			device_id);
		ret = -ENOMEM;
		goto out;
	}

	rzs->disk->major = ramzswap_major;
	rzs->disk->first_minor = device_id;
	rzs->disk->fops = &ramzswap_devops;
	rzs->disk->queue = rzs->queue;
	rzs->disk->private_data = rzs;
	snprintf(rzs->disk->disk_name, 16, "ramzswap%d", device_id);

	/*
	 * Actual capacity set using RZSIO_SET_DISKSIZE_KB ioctl
	 * or set equal to backing swap device (if provided)
	 */
	set_capacity(rzs->disk, 0);

	/* ramzswap only ever handles whole, page-aligned pages. */
	blk_queue_physical_block_size(rzs->disk->queue, PAGE_SIZE);
	blk_queue_logical_block_size(rzs->disk->queue, PAGE_SIZE);

	/* Disk becomes visible to userspace from this point. */
	add_disk(rzs->disk);

	rzs->init_done = 0;

out:
	return ret;
}
1355 | ||
1356 | static void destroy_device(struct ramzswap *rzs) | |
1357 | { | |
1358 | if (rzs->disk) { | |
1359 | del_gendisk(rzs->disk); | |
1360 | put_disk(rzs->disk); | |
1361 | } | |
1362 | ||
1363 | if (rzs->queue) | |
1364 | blk_cleanup_queue(rzs->queue); | |
1365 | } | |
1366 | ||
1367 | static int __init ramzswap_init(void) | |
1368 | { | |
de1a21a0 | 1369 | int ret, dev_id; |
306b0c95 NG |
1370 | |
1371 | if (num_devices > max_num_devices) { | |
1372 | pr_warning("Invalid value for num_devices: %u\n", | |
1373 | num_devices); | |
de1a21a0 NG |
1374 | ret = -EINVAL; |
1375 | goto out; | |
306b0c95 NG |
1376 | } |
1377 | ||
1378 | ramzswap_major = register_blkdev(0, "ramzswap"); | |
1379 | if (ramzswap_major <= 0) { | |
1380 | pr_warning("Unable to get major number\n"); | |
de1a21a0 NG |
1381 | ret = -EBUSY; |
1382 | goto out; | |
306b0c95 NG |
1383 | } |
1384 | ||
1385 | if (!num_devices) { | |
1386 | pr_info("num_devices not specified. Using default: 1\n"); | |
1387 | num_devices = 1; | |
1388 | } | |
1389 | ||
1390 | /* Allocate the device array and initialize each one */ | |
1391 | pr_info("Creating %u devices ...\n", num_devices); | |
1392 | devices = kzalloc(num_devices * sizeof(struct ramzswap), GFP_KERNEL); | |
de1a21a0 NG |
1393 | if (!devices) { |
1394 | ret = -ENOMEM; | |
1395 | goto unregister; | |
1396 | } | |
306b0c95 | 1397 | |
de1a21a0 NG |
1398 | for (dev_id = 0; dev_id < num_devices; dev_id++) { |
1399 | ret = create_device(&devices[dev_id], dev_id); | |
1400 | if (ret) | |
3bf040c7 | 1401 | goto free_devices; |
de1a21a0 NG |
1402 | } |
1403 | ||
306b0c95 | 1404 | return 0; |
de1a21a0 | 1405 | |
3bf040c7 | 1406 | free_devices: |
de1a21a0 NG |
1407 | while (dev_id) |
1408 | destroy_device(&devices[--dev_id]); | |
1409 | unregister: | |
306b0c95 | 1410 | unregister_blkdev(ramzswap_major, "ramzswap"); |
de1a21a0 | 1411 | out: |
306b0c95 NG |
1412 | return ret; |
1413 | } | |
1414 | ||
1415 | static void __exit ramzswap_exit(void) | |
1416 | { | |
1417 | int i; | |
1418 | struct ramzswap *rzs; | |
1419 | ||
1420 | for (i = 0; i < num_devices; i++) { | |
1421 | rzs = &devices[i]; | |
1422 | ||
1423 | destroy_device(rzs); | |
1424 | if (rzs->init_done) | |
1425 | reset_device(rzs); | |
1426 | } | |
1427 | ||
1428 | unregister_blkdev(ramzswap_major, "ramzswap"); | |
1429 | ||
1430 | kfree(devices); | |
1431 | pr_debug("Cleanup done!\n"); | |
1432 | } | |
1433 | ||
/* Number of ramzswap block devices to create (0 means default of 1). */
module_param(num_devices, uint, 0);
MODULE_PARM_DESC(num_devices, "Number of ramzswap devices");

module_init(ramzswap_init);
module_exit(ramzswap_exit);

MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
MODULE_DESCRIPTION("Compressed RAM Based Swap Device");