2 * Compressed RAM based swap device
4 * Copyright (C) 2008, 2009, 2010 Nitin Gupta
6 * This code is released using a dual license strategy: BSD/GPL
7 * You can choose the licence that better fits your requirements.
9 * Released under the terms of 3-clause BSD License
10 * Released under the terms of GNU General Public License Version 2.0
12 * Project home: http://compcache.googlecode.com
15 #define KMSG_COMPONENT "ramzswap"
16 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
18 #include <linux/module.h>
19 #include <linux/kernel.h>
20 #include <linux/bitops.h>
21 #include <linux/blkdev.h>
22 #include <linux/buffer_head.h>
23 #include <linux/device.h>
24 #include <linux/genhd.h>
25 #include <linux/highmem.h>
26 #include <linux/slab.h>
27 #include <linux/lzo.h>
28 #include <linux/string.h>
29 #include <linux/swap.h>
30 #include <linux/swapops.h>
31 #include <linux/vmalloc.h>
33 #include "ramzswap_drv.h"
/* Major number handed out by register_blkdev() at module init */
static int ramzswap_major;
/* Per-device state array, sized num_devices, allocated at module init */
static struct ramzswap *devices;

/* Module params (documentation at end) */
static unsigned int num_devices;
42 static int rzs_test_flag(struct ramzswap
*rzs
, u32 index
,
43 enum rzs_pageflags flag
)
45 return rzs
->table
[index
].flags
& BIT(flag
);
48 static void rzs_set_flag(struct ramzswap
*rzs
, u32 index
,
49 enum rzs_pageflags flag
)
51 rzs
->table
[index
].flags
|= BIT(flag
);
54 static void rzs_clear_flag(struct ramzswap
*rzs
, u32 index
,
55 enum rzs_pageflags flag
)
57 rzs
->table
[index
].flags
&= ~BIT(flag
);
60 static int page_zero_filled(void *ptr
)
65 page
= (unsigned long *)ptr
;
67 for (pos
= 0; pos
!= PAGE_SIZE
/ sizeof(*page
); pos
++) {
75 static void ramzswap_set_disksize(struct ramzswap
*rzs
, size_t totalram_bytes
)
79 "disk size not provided. You can use disksize_kb module "
80 "param to specify size.\nUsing default: (%u%% of RAM).\n",
81 default_disksize_perc_ram
83 rzs
->disksize
= default_disksize_perc_ram
*
84 (totalram_bytes
/ 100);
87 if (rzs
->disksize
> 2 * (totalram_bytes
)) {
89 "There is little point creating a ramzswap of greater than "
90 "twice the size of memory since we expect a 2:1 compression "
91 "ratio. Note that ramzswap uses about 0.1%% of the size of "
92 "the swap device when not in use so a huge ramzswap is "
94 "\tMemory Size: %zu kB\n"
95 "\tSize you selected: %zu kB\n"
96 "Continuing anyway ...\n",
97 totalram_bytes
>> 10, rzs
->disksize
101 rzs
->disksize
&= PAGE_MASK
;
105 * Swap header (1st page of swap device) contains information
106 * about a swap file/partition. Prepare such a header for the
107 * given ramzswap device so that swapon can identify it as a
110 static void setup_swap_header(struct ramzswap
*rzs
, union swap_header
*s
)
113 s
->info
.last_page
= (rzs
->disksize
>> PAGE_SHIFT
) - 1;
114 s
->info
.nr_badpages
= 0;
115 memcpy(s
->magic
.magic
, "SWAPSPACE2", 10);
118 static void ramzswap_ioctl_get_stats(struct ramzswap
*rzs
,
119 struct ramzswap_ioctl_stats
*s
)
121 s
->disksize
= rzs
->disksize
;
123 #if defined(CONFIG_RAMZSWAP_STATS)
125 struct ramzswap_stats
*rs
= &rzs
->stats
;
126 size_t succ_writes
, mem_used
;
127 unsigned int good_compress_perc
= 0, no_compress_perc
= 0;
129 mem_used
= xv_get_total_size_bytes(rzs
->mem_pool
)
130 + (rs
->pages_expand
<< PAGE_SHIFT
);
131 succ_writes
= rzs_stat64_read(rzs
, &rs
->num_writes
) -
132 rzs_stat64_read(rzs
, &rs
->failed_writes
);
134 if (succ_writes
&& rs
->pages_stored
) {
135 good_compress_perc
= rs
->good_compress
* 100
137 no_compress_perc
= rs
->pages_expand
* 100
141 s
->num_reads
= rzs_stat64_read(rzs
, &rs
->num_reads
);
142 s
->num_writes
= rzs_stat64_read(rzs
, &rs
->num_writes
);
143 s
->failed_reads
= rzs_stat64_read(rzs
, &rs
->failed_reads
);
144 s
->failed_writes
= rzs_stat64_read(rzs
, &rs
->failed_writes
);
145 s
->invalid_io
= rzs_stat64_read(rzs
, &rs
->invalid_io
);
146 s
->notify_free
= rzs_stat64_read(rzs
, &rs
->notify_free
);
147 s
->pages_zero
= rs
->pages_zero
;
149 s
->good_compress_pct
= good_compress_perc
;
150 s
->pages_expand_pct
= no_compress_perc
;
152 s
->pages_stored
= rs
->pages_stored
;
153 s
->pages_used
= mem_used
>> PAGE_SHIFT
;
154 s
->orig_data_size
= rs
->pages_stored
<< PAGE_SHIFT
;
155 s
->compr_data_size
= rs
->compr_size
;
156 s
->mem_used_total
= mem_used
;
158 #endif /* CONFIG_RAMZSWAP_STATS */
161 static void ramzswap_free_page(struct ramzswap
*rzs
, size_t index
)
166 struct page
*page
= rzs
->table
[index
].page
;
167 u32 offset
= rzs
->table
[index
].offset
;
169 if (unlikely(!page
)) {
171 * No memory is allocated for zero filled pages.
172 * Simply clear zero page flag.
174 if (rzs_test_flag(rzs
, index
, RZS_ZERO
)) {
175 rzs_clear_flag(rzs
, index
, RZS_ZERO
);
176 rzs_stat_dec(&rzs
->stats
.pages_zero
);
181 if (unlikely(rzs_test_flag(rzs
, index
, RZS_UNCOMPRESSED
))) {
184 rzs_clear_flag(rzs
, index
, RZS_UNCOMPRESSED
);
185 rzs_stat_dec(&rzs
->stats
.pages_expand
);
189 obj
= kmap_atomic(page
, KM_USER0
) + offset
;
190 clen
= xv_get_object_size(obj
) - sizeof(struct zobj_header
);
191 kunmap_atomic(obj
, KM_USER0
);
193 xv_free(rzs
->mem_pool
, page
, offset
);
194 if (clen
<= PAGE_SIZE
/ 2)
195 rzs_stat_dec(&rzs
->stats
.good_compress
);
198 rzs
->stats
.compr_size
-= clen
;
199 rzs_stat_dec(&rzs
->stats
.pages_stored
);
201 rzs
->table
[index
].page
= NULL
;
202 rzs
->table
[index
].offset
= 0;
205 static int handle_zero_page(struct bio
*bio
)
208 struct page
*page
= bio
->bi_io_vec
[0].bv_page
;
210 user_mem
= kmap_atomic(page
, KM_USER0
);
211 memset(user_mem
, 0, PAGE_SIZE
);
212 kunmap_atomic(user_mem
, KM_USER0
);
214 flush_dcache_page(page
);
216 set_bit(BIO_UPTODATE
, &bio
->bi_flags
);
221 static int handle_uncompressed_page(struct ramzswap
*rzs
, struct bio
*bio
)
225 unsigned char *user_mem
, *cmem
;
227 page
= bio
->bi_io_vec
[0].bv_page
;
228 index
= bio
->bi_sector
>> SECTORS_PER_PAGE_SHIFT
;
230 user_mem
= kmap_atomic(page
, KM_USER0
);
231 cmem
= kmap_atomic(rzs
->table
[index
].page
, KM_USER1
) +
232 rzs
->table
[index
].offset
;
234 memcpy(user_mem
, cmem
, PAGE_SIZE
);
235 kunmap_atomic(user_mem
, KM_USER0
);
236 kunmap_atomic(cmem
, KM_USER1
);
238 flush_dcache_page(page
);
240 set_bit(BIO_UPTODATE
, &bio
->bi_flags
);
246 * Called when request page is not present in ramzswap.
247 * This is an attempt to read before any previous write
248 * to this location - this happens due to readahead when
249 * swap device is read from user-space (e.g. during swapon)
251 static int handle_ramzswap_fault(struct ramzswap
*rzs
, struct bio
*bio
)
253 pr_debug("Read before write on swap device: "
254 "sector=%lu, size=%u, offset=%u\n",
255 (ulong
)(bio
->bi_sector
), bio
->bi_size
,
256 bio
->bi_io_vec
[0].bv_offset
);
258 /* Do nothing. Just return success */
259 set_bit(BIO_UPTODATE
, &bio
->bi_flags
);
264 static int ramzswap_read(struct ramzswap
*rzs
, struct bio
*bio
)
270 struct zobj_header
*zheader
;
271 unsigned char *user_mem
, *cmem
;
273 rzs_stat64_inc(rzs
, &rzs
->stats
.num_reads
);
275 page
= bio
->bi_io_vec
[0].bv_page
;
276 index
= bio
->bi_sector
>> SECTORS_PER_PAGE_SHIFT
;
278 if (rzs_test_flag(rzs
, index
, RZS_ZERO
))
279 return handle_zero_page(bio
);
281 /* Requested page is not present in compressed area */
282 if (!rzs
->table
[index
].page
)
283 return handle_ramzswap_fault(rzs
, bio
);
285 /* Page is stored uncompressed since it's incompressible */
286 if (unlikely(rzs_test_flag(rzs
, index
, RZS_UNCOMPRESSED
)))
287 return handle_uncompressed_page(rzs
, bio
);
289 user_mem
= kmap_atomic(page
, KM_USER0
);
292 cmem
= kmap_atomic(rzs
->table
[index
].page
, KM_USER1
) +
293 rzs
->table
[index
].offset
;
295 ret
= lzo1x_decompress_safe(
296 cmem
+ sizeof(*zheader
),
297 xv_get_object_size(cmem
) - sizeof(*zheader
),
300 kunmap_atomic(user_mem
, KM_USER0
);
301 kunmap_atomic(cmem
, KM_USER1
);
303 /* should NEVER happen */
304 if (unlikely(ret
!= LZO_E_OK
)) {
305 pr_err("Decompression failed! err=%d, page=%u\n",
307 rzs_stat64_inc(rzs
, &rzs
->stats
.failed_reads
);
311 flush_dcache_page(page
);
313 set_bit(BIO_UPTODATE
, &bio
->bi_flags
);
322 static int ramzswap_write(struct ramzswap
*rzs
, struct bio
*bio
)
327 struct zobj_header
*zheader
;
328 struct page
*page
, *page_store
;
329 unsigned char *user_mem
, *cmem
, *src
;
331 rzs_stat64_inc(rzs
, &rzs
->stats
.num_writes
);
333 page
= bio
->bi_io_vec
[0].bv_page
;
334 index
= bio
->bi_sector
>> SECTORS_PER_PAGE_SHIFT
;
336 src
= rzs
->compress_buffer
;
339 * System swaps to same sector again when the stored page
340 * is no longer referenced by any process. So, its now safe
341 * to free the memory that was allocated for this page.
343 if (rzs
->table
[index
].page
|| rzs_test_flag(rzs
, index
, RZS_ZERO
))
344 ramzswap_free_page(rzs
, index
);
346 mutex_lock(&rzs
->lock
);
348 user_mem
= kmap_atomic(page
, KM_USER0
);
349 if (page_zero_filled(user_mem
)) {
350 kunmap_atomic(user_mem
, KM_USER0
);
351 mutex_unlock(&rzs
->lock
);
352 rzs_stat_inc(&rzs
->stats
.pages_zero
);
353 rzs_set_flag(rzs
, index
, RZS_ZERO
);
355 set_bit(BIO_UPTODATE
, &bio
->bi_flags
);
360 ret
= lzo1x_1_compress(user_mem
, PAGE_SIZE
, src
, &clen
,
361 rzs
->compress_workmem
);
363 kunmap_atomic(user_mem
, KM_USER0
);
365 if (unlikely(ret
!= LZO_E_OK
)) {
366 mutex_unlock(&rzs
->lock
);
367 pr_err("Compression failed! err=%d\n", ret
);
368 rzs_stat64_inc(rzs
, &rzs
->stats
.failed_writes
);
373 * Page is incompressible. Store it as-is (uncompressed)
374 * since we do not want to return too many swap write
375 * errors which has side effect of hanging the system.
377 if (unlikely(clen
> max_zpage_size
)) {
379 page_store
= alloc_page(GFP_NOIO
| __GFP_HIGHMEM
);
380 if (unlikely(!page_store
)) {
381 mutex_unlock(&rzs
->lock
);
382 pr_info("Error allocating memory for incompressible "
383 "page: %u\n", index
);
384 rzs_stat64_inc(rzs
, &rzs
->stats
.failed_writes
);
389 rzs_set_flag(rzs
, index
, RZS_UNCOMPRESSED
);
390 rzs_stat_inc(&rzs
->stats
.pages_expand
);
391 rzs
->table
[index
].page
= page_store
;
392 src
= kmap_atomic(page
, KM_USER0
);
396 if (xv_malloc(rzs
->mem_pool
, clen
+ sizeof(*zheader
),
397 &rzs
->table
[index
].page
, &offset
,
398 GFP_NOIO
| __GFP_HIGHMEM
)) {
399 mutex_unlock(&rzs
->lock
);
400 pr_info("Error allocating memory for compressed "
401 "page: %u, size=%zu\n", index
, clen
);
402 rzs_stat64_inc(rzs
, &rzs
->stats
.failed_writes
);
407 rzs
->table
[index
].offset
= offset
;
409 cmem
= kmap_atomic(rzs
->table
[index
].page
, KM_USER1
) +
410 rzs
->table
[index
].offset
;
413 /* Back-reference needed for memory defragmentation */
414 if (!rzs_test_flag(rzs
, index
, RZS_UNCOMPRESSED
)) {
415 zheader
= (struct zobj_header
*)cmem
;
416 zheader
->table_idx
= index
;
417 cmem
+= sizeof(*zheader
);
421 memcpy(cmem
, src
, clen
);
423 kunmap_atomic(cmem
, KM_USER1
);
424 if (unlikely(rzs_test_flag(rzs
, index
, RZS_UNCOMPRESSED
)))
425 kunmap_atomic(src
, KM_USER0
);
428 rzs
->stats
.compr_size
+= clen
;
429 rzs_stat_inc(&rzs
->stats
.pages_stored
);
430 if (clen
<= PAGE_SIZE
/ 2)
431 rzs_stat_inc(&rzs
->stats
.good_compress
);
433 mutex_unlock(&rzs
->lock
);
435 set_bit(BIO_UPTODATE
, &bio
->bi_flags
);
445 * Check if request is within bounds and page aligned.
447 static inline int valid_swap_request(struct ramzswap
*rzs
, struct bio
*bio
)
450 (bio
->bi_sector
>= (rzs
->disksize
>> SECTOR_SHIFT
)) ||
451 (bio
->bi_sector
& (SECTORS_PER_PAGE
- 1)) ||
452 (bio
->bi_vcnt
!= 1) ||
453 (bio
->bi_size
!= PAGE_SIZE
) ||
454 (bio
->bi_io_vec
[0].bv_offset
!= 0))) {
459 /* swap request is valid */
464 * Handler function for all ramzswap I/O requests.
466 static int ramzswap_make_request(struct request_queue
*queue
, struct bio
*bio
)
469 struct ramzswap
*rzs
= queue
->queuedata
;
471 if (unlikely(!rzs
->init_done
)) {
476 if (!valid_swap_request(rzs
, bio
)) {
477 rzs_stat64_inc(rzs
, &rzs
->stats
.invalid_io
);
482 switch (bio_data_dir(bio
)) {
484 ret
= ramzswap_read(rzs
, bio
);
488 ret
= ramzswap_write(rzs
, bio
);
495 static void reset_device(struct ramzswap
*rzs
)
499 /* Do not accept any new I/O request */
502 /* Free various per-device buffers */
503 kfree(rzs
->compress_workmem
);
504 free_pages((unsigned long)rzs
->compress_buffer
, 1);
506 rzs
->compress_workmem
= NULL
;
507 rzs
->compress_buffer
= NULL
;
509 /* Free all pages that are still in this ramzswap device */
510 for (index
= 0; index
< rzs
->disksize
>> PAGE_SHIFT
; index
++) {
514 page
= rzs
->table
[index
].page
;
515 offset
= rzs
->table
[index
].offset
;
520 if (unlikely(rzs_test_flag(rzs
, index
, RZS_UNCOMPRESSED
)))
523 xv_free(rzs
->mem_pool
, page
, offset
);
529 xv_destroy_pool(rzs
->mem_pool
);
530 rzs
->mem_pool
= NULL
;
533 memset(&rzs
->stats
, 0, sizeof(rzs
->stats
));
538 static int ramzswap_ioctl_init_device(struct ramzswap
*rzs
)
543 union swap_header
*swap_header
;
545 if (rzs
->init_done
) {
546 pr_info("Device already initialized!\n");
550 ramzswap_set_disksize(rzs
, totalram_pages
<< PAGE_SHIFT
);
552 rzs
->compress_workmem
= kzalloc(LZO1X_MEM_COMPRESS
, GFP_KERNEL
);
553 if (!rzs
->compress_workmem
) {
554 pr_err("Error allocating compressor working memory!\n");
559 rzs
->compress_buffer
= (void *)__get_free_pages(__GFP_ZERO
, 1);
560 if (!rzs
->compress_buffer
) {
561 pr_err("Error allocating compressor buffer space\n");
566 num_pages
= rzs
->disksize
>> PAGE_SHIFT
;
567 rzs
->table
= vmalloc(num_pages
* sizeof(*rzs
->table
));
569 pr_err("Error allocating ramzswap address table\n");
570 /* To prevent accessing table entries during cleanup */
575 memset(rzs
->table
, 0, num_pages
* sizeof(*rzs
->table
));
577 page
= alloc_page(__GFP_ZERO
);
579 pr_err("Error allocating swap header page\n");
583 rzs
->table
[0].page
= page
;
584 rzs_set_flag(rzs
, 0, RZS_UNCOMPRESSED
);
586 swap_header
= kmap(page
);
587 setup_swap_header(rzs
, swap_header
);
590 set_capacity(rzs
->disk
, rzs
->disksize
>> SECTOR_SHIFT
);
592 /* ramzswap devices sort of resembles non-rotational disks */
593 queue_flag_set_unlocked(QUEUE_FLAG_NONROT
, rzs
->disk
->queue
);
595 rzs
->mem_pool
= xv_create_pool();
596 if (!rzs
->mem_pool
) {
597 pr_err("Error creating memory pool\n");
604 pr_debug("Initialization done!\n");
610 pr_err("Initialization failed: err=%d\n", ret
);
614 static int ramzswap_ioctl_reset_device(struct ramzswap
*rzs
)
622 static int ramzswap_ioctl(struct block_device
*bdev
, fmode_t mode
,
623 unsigned int cmd
, unsigned long arg
)
628 struct ramzswap
*rzs
= bdev
->bd_disk
->private_data
;
631 case RZSIO_SET_DISKSIZE_KB
:
632 if (rzs
->init_done
) {
636 if (copy_from_user(&disksize_kb
, (void *)arg
,
641 rzs
->disksize
= disksize_kb
<< 10;
642 pr_info("Disk size set to %zu kB\n", disksize_kb
);
645 case RZSIO_GET_STATS
:
647 struct ramzswap_ioctl_stats
*stats
;
648 if (!rzs
->init_done
) {
652 stats
= kzalloc(sizeof(*stats
), GFP_KERNEL
);
657 ramzswap_ioctl_get_stats(rzs
, stats
);
658 if (copy_to_user((void *)arg
, stats
, sizeof(*stats
))) {
667 ret
= ramzswap_ioctl_init_device(rzs
);
671 /* Do not reset an active device! */
672 if (bdev
->bd_holders
) {
677 /* Make sure all pending I/O is finished */
681 ret
= ramzswap_ioctl_reset_device(rzs
);
685 pr_info("Invalid ioctl %u\n", cmd
);
693 static struct block_device_operations ramzswap_devops
= {
694 .ioctl
= ramzswap_ioctl
,
695 .owner
= THIS_MODULE
,
698 static int create_device(struct ramzswap
*rzs
, int device_id
)
702 mutex_init(&rzs
->lock
);
703 spin_lock_init(&rzs
->stat64_lock
);
705 rzs
->queue
= blk_alloc_queue(GFP_KERNEL
);
707 pr_err("Error allocating disk queue for device %d\n",
713 blk_queue_make_request(rzs
->queue
, ramzswap_make_request
);
714 rzs
->queue
->queuedata
= rzs
;
716 /* gendisk structure */
717 rzs
->disk
= alloc_disk(1);
719 blk_cleanup_queue(rzs
->queue
);
720 pr_warning("Error allocating disk structure for device %d\n",
726 rzs
->disk
->major
= ramzswap_major
;
727 rzs
->disk
->first_minor
= device_id
;
728 rzs
->disk
->fops
= &ramzswap_devops
;
729 rzs
->disk
->queue
= rzs
->queue
;
730 rzs
->disk
->private_data
= rzs
;
731 snprintf(rzs
->disk
->disk_name
, 16, "ramzswap%d", device_id
);
733 /* Actual capacity set using RZSIO_SET_DISKSIZE_KB ioctl */
734 set_capacity(rzs
->disk
, 0);
736 blk_queue_physical_block_size(rzs
->disk
->queue
, PAGE_SIZE
);
737 blk_queue_logical_block_size(rzs
->disk
->queue
, PAGE_SIZE
);
747 static void destroy_device(struct ramzswap
*rzs
)
750 del_gendisk(rzs
->disk
);
755 blk_cleanup_queue(rzs
->queue
);
758 static int __init
ramzswap_init(void)
762 if (num_devices
> max_num_devices
) {
763 pr_warning("Invalid value for num_devices: %u\n",
769 ramzswap_major
= register_blkdev(0, "ramzswap");
770 if (ramzswap_major
<= 0) {
771 pr_warning("Unable to get major number\n");
777 pr_info("num_devices not specified. Using default: 1\n");
781 /* Allocate the device array and initialize each one */
782 pr_info("Creating %u devices ...\n", num_devices
);
783 devices
= kzalloc(num_devices
* sizeof(struct ramzswap
), GFP_KERNEL
);
789 for (dev_id
= 0; dev_id
< num_devices
; dev_id
++) {
790 ret
= create_device(&devices
[dev_id
], dev_id
);
799 destroy_device(&devices
[--dev_id
]);
801 unregister_blkdev(ramzswap_major
, "ramzswap");
806 static void __exit
ramzswap_exit(void)
809 struct ramzswap
*rzs
;
811 for (i
= 0; i
< num_devices
; i
++) {
819 unregister_blkdev(ramzswap_major
, "ramzswap");
822 pr_debug("Cleanup done!\n");
825 module_param(num_devices
, uint
, 0);
826 MODULE_PARM_DESC(num_devices
, "Number of ramzswap devices");
828 module_init(ramzswap_init
);
829 module_exit(ramzswap_exit
);
831 MODULE_LICENSE("Dual BSD/GPL");
832 MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
833 MODULE_DESCRIPTION("Compressed RAM Based Swap Device");