]> git.proxmox.com Git - mirror_ubuntu-artful-kernel.git/commitdiff
Merge branch 'akpm' (patches from Andrew)
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 17 Feb 2015 16:38:30 +0000 (08:38 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 17 Feb 2015 16:38:30 +0000 (08:38 -0800)
Merge fifth set of updates from Andrew Morton:

 - A few things which were awaiting merges from linux-next:
     - rtc
     - ocfs2
     - misc others

 - Willy's "dax" feature: direct fs access to memory (mainly NV-DIMMs)
   which isn't backed by pageframes.

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (37 commits)
  rtc: add driver for DS1685 family of real time clocks
  MAINTAINERS: add entry for Maxim PMICs on Samsung boards
  lib/Kconfig: use bool instead of boolean
  powerpc: drop _PAGE_FILE and pte_file()-related helpers
  ocfs2: set append dio as a ro compat feature
  ocfs2: wait for orphan recovery first once append O_DIRECT write crash
  ocfs2: complete the rest request through buffer io
  ocfs2: do not fallback to buffer I/O write if appending
  ocfs2: allocate blocks in ocfs2_direct_IO_get_blocks
  ocfs2: implement ocfs2_direct_IO_write
  ocfs2: add orphan recovery types in ocfs2_recover_orphans
  ocfs2: add functions to add and remove inode in orphan dir
  ocfs2: prepare some interfaces used in append direct io
  MAINTAINERS: fix spelling mistake & remove trailing WS
  dax: does not work correctly with virtual aliasing caches
  brd: rename XIP to DAX
  ext4: add DAX functionality
  dax: add dax_zero_page_range
  ext2: get rid of most mentions of XIP in ext2
  ext2: remove ext2_aops_xip
  ...

72 files changed:
Documentation/filesystems/00-INDEX
Documentation/filesystems/Locking
Documentation/filesystems/dax.txt [new file with mode: 0644]
Documentation/filesystems/ext2.txt
Documentation/filesystems/ext4.txt
Documentation/filesystems/vfs.txt
Documentation/filesystems/xip.txt [deleted file]
MAINTAINERS
arch/arm/boot/dts/zynq-parallella.dts
arch/powerpc/include/asm/pgtable-ppc32.h
arch/powerpc/include/asm/pgtable-ppc64.h
arch/powerpc/include/asm/pgtable.h
arch/powerpc/include/asm/pte-40x.h
arch/powerpc/include/asm/pte-44x.h
arch/powerpc/include/asm/pte-8xx.h
arch/powerpc/include/asm/pte-book3e.h
arch/powerpc/include/asm/pte-fsl-booke.h
arch/powerpc/include/asm/pte-hash32.h
arch/powerpc/include/asm/pte-hash64.h
arch/powerpc/mm/pgtable_64.c
drivers/block/Kconfig
drivers/block/brd.c
drivers/rtc/Kconfig
drivers/rtc/Makefile
drivers/rtc/rtc-ds1685.c [new file with mode: 0644]
drivers/rtc/rtc-isl12022.c
drivers/rtc/rtc-isl12057.c
drivers/staging/iio/light/isl29028.c
fs/Kconfig
fs/Makefile
fs/dax.c [new file with mode: 0644]
fs/exofs/inode.c
fs/ext2/Kconfig
fs/ext2/Makefile
fs/ext2/ext2.h
fs/ext2/file.c
fs/ext2/inode.c
fs/ext2/namei.c
fs/ext2/super.c
fs/ext2/xip.c [deleted file]
fs/ext2/xip.h [deleted file]
fs/ext4/ext4.h
fs/ext4/file.c
fs/ext4/indirect.c
fs/ext4/inode.c
fs/ext4/namei.c
fs/ext4/super.c
fs/ocfs2/aops.c
fs/ocfs2/file.c
fs/ocfs2/file.h
fs/ocfs2/inode.c
fs/ocfs2/inode.h
fs/ocfs2/journal.c
fs/ocfs2/journal.h
fs/ocfs2/namei.c
fs/ocfs2/namei.h
fs/ocfs2/ocfs2.h
fs/ocfs2/ocfs2_fs.h
fs/ocfs2/super.c
fs/open.c
include/linux/fs.h
include/linux/mm.h
include/linux/rmap.h
include/linux/rtc/ds1685.h [new file with mode: 0644]
lib/Kconfig
mm/Makefile
mm/fadvise.c
mm/filemap.c
mm/filemap_xip.c [deleted file]
mm/madvise.c
mm/memory.c
scripts/diffconfig

index ac28149aede4c15704aaee0b1941aebe30f25eb7..9922939e7d99e3dfbcd51d2dad611f745a9a4f94 100644 (file)
@@ -34,6 +34,9 @@ configfs/
        - directory containing configfs documentation and example code.
 cramfs.txt
        - info on the cram filesystem for small storage (ROMs etc).
+dax.txt
+       - info on avoiding the page cache for files stored on CPU-addressable
+         storage devices.
 debugfs.txt
        - info on the debugfs filesystem.
 devpts.txt
@@ -154,5 +157,3 @@ xfs-self-describing-metadata.txt
        - info on XFS Self Describing Metadata.
 xfs.txt
        - info and mount options for the XFS filesystem.
-xip.txt
-       - info on execute-in-place for file mappings.
index b30753cbf4311641ebd0ffe62735de05b452e77a..2ca3d17eee56380cec075d5279c2703fa2b1ed9e 100644 (file)
@@ -199,8 +199,6 @@ prototypes:
        int (*releasepage) (struct page *, int);
        void (*freepage)(struct page *);
        int (*direct_IO)(int, struct kiocb *, struct iov_iter *iter, loff_t offset);
-       int (*get_xip_mem)(struct address_space *, pgoff_t, int, void **,
-                               unsigned long *);
        int (*migratepage)(struct address_space *, struct page *, struct page *);
        int (*launder_page)(struct page *);
        int (*is_partially_uptodate)(struct page *, unsigned long, unsigned long);
@@ -225,7 +223,6 @@ invalidatepage:             yes
 releasepage:           yes
 freepage:              yes
 direct_IO:
-get_xip_mem:                                   maybe
 migratepage:           yes (both)
 launder_page:          yes
 is_partially_uptodate: yes
diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt
new file mode 100644 (file)
index 0000000..baf4111
--- /dev/null
@@ -0,0 +1,94 @@
+Direct Access for files
+-----------------------
+
+Motivation
+----------
+
+The page cache is usually used to buffer reads and writes to files.
+It is also used to provide the pages which are mapped into userspace
+by a call to mmap.
+
+For block devices that are memory-like, the page cache pages would be
+unnecessary copies of the original storage.  The DAX code removes the
+extra copy by performing reads and writes directly to the storage device.
+For file mappings, the storage device is mapped directly into userspace.
+
+
+Usage
+-----
+
+If you have a block device which supports DAX, you can make a filesystem
+on it as usual.  When mounting it, use the -o dax option manually
+or add 'dax' to the options in /etc/fstab.
+
+
+Implementation Tips for Block Driver Writers
+--------------------------------------------
+
+To support DAX in your block driver, implement the 'direct_access'
+block device operation.  It is used to translate the sector number
+(expressed in units of 512-byte sectors) to a page frame number (pfn)
+that identifies the physical page for the memory.  It also returns a
+kernel virtual address that can be used to access the memory.
+
+The direct_access method takes a 'size' parameter that indicates the
+number of bytes being requested.  The function should return the number
+of bytes that can be contiguously accessed at that offset.  It may also
+return a negative errno if an error occurs.
+
+In order to support this method, the storage must be byte-accessible by
+the CPU at all times.  If your device uses paging techniques to expose
+a large amount of memory through a smaller window, then you cannot
+implement direct_access.  Equally, if your device can occasionally
+stall the CPU for an extended period, you should also not attempt to
+implement direct_access.
+
+These block devices may be used for inspiration:
+- axonram: Axon DDR2 device driver
+- brd: RAM backed block device driver
+- dcssblk: s390 dcss block device driver
+
+
+Implementation Tips for Filesystem Writers
+------------------------------------------
+
+Filesystem support consists of
+- adding support to mark inodes as being DAX by setting the S_DAX flag in
+  i_flags
+- implementing the direct_IO address space operation, and calling
+  dax_do_io() instead of blockdev_direct_IO() if S_DAX is set
+- implementing an mmap file operation for DAX files which sets the
+  VM_MIXEDMAP flag on the VMA, and setting the vm_ops to include handlers
+  for fault and page_mkwrite (which should probably call dax_fault() and
+  dax_mkwrite(), passing the appropriate get_block() callback)
+- calling dax_truncate_page() instead of block_truncate_page() for DAX files
+- calling dax_zero_page_range() instead of zero_user() for DAX files
+- ensuring that there is sufficient locking between reads, writes,
+  truncates and page faults
+
+The get_block() callback passed to the DAX functions may return
+uninitialised extents.  If it does, it must ensure that simultaneous
+calls to get_block() (for example by a page-fault racing with a read()
+or a write()) work correctly.
+
+These filesystems may be used for inspiration:
+- ext2: the second extended filesystem, see Documentation/filesystems/ext2.txt
+- ext4: the fourth extended filesystem, see Documentation/filesystems/ext4.txt
+
+
+Shortcomings
+------------
+
+Even if the kernel or its modules are stored on a filesystem that supports
+DAX on a block device that supports DAX, they will still be copied into RAM.
+
+The DAX code does not work correctly on architectures which have virtually
+mapped caches such as ARM, MIPS and SPARC.
+
+Calling get_user_pages() on a range of user memory that has been mmaped
+from a DAX file will fail as there is no 'struct page' to describe
+those pages.  This problem is being worked on.  That means that O_DIRECT
+reads/writes to those memory ranges from a non-DAX file will fail (note
+that O_DIRECT reads/writes _of a DAX file_ do work, it is the memory
+that is being accessed that is key here).  Other things that will not
+work include RDMA, sendfile() and splice().
index 67639f905f10dfca60b950b1317dbc0533b89a5e..b9714569e472b7469911f22ecefd48eaefd84a90 100644 (file)
@@ -20,6 +20,9 @@ minixdf                               Makes `df' act like Minix.
 check=none, nocheck    (*)     Don't do extra checking of bitmaps on mount
                                (check=normal and check=strict options removed)
 
+dax                            Use direct access (no page cache).  See
+                               Documentation/filesystems/dax.txt.
+
 debug                          Extra debugging information is sent to the
                                kernel syslog.  Useful for developers.
 
@@ -56,8 +59,6 @@ noacl                         Don't support POSIX ACLs.
 
 nobh                           Do not attach buffer_heads to file pagecache.
 
-xip                            Use execute in place (no caching) if possible
-
 grpquota,noquota,quota,usrquota        Quota options are silently ignored by ext2.
 
 
index 919a3293aaa4249f81f0d907d9867d6d80f9a135..6c0108eb01372be3c879a43bd99f87305efb3277 100644 (file)
@@ -386,6 +386,10 @@ max_dir_size_kb=n  This limits the size of directories so that any
 i_version              Enable 64-bit inode version support. This option is
                        off by default.
 
+dax                    Use direct access (no page cache).  See
+                       Documentation/filesystems/dax.txt.  Note that
+                       this option is incompatible with data=journal.
+
 Data Mode
 =========
 There are 3 different data modes:
index 43ce0507ee25bca1799e6d29b294047626d00c17..966b22829f3b605b92f19ce43665225ab17a8a7e 100644 (file)
@@ -591,8 +591,6 @@ struct address_space_operations {
        int (*releasepage) (struct page *, int);
        void (*freepage)(struct page *);
        ssize_t (*direct_IO)(int, struct kiocb *, struct iov_iter *iter, loff_t offset);
-       struct page* (*get_xip_page)(struct address_space *, sector_t,
-                       int);
        /* migrate the contents of a page to the specified target */
        int (*migratepage) (struct page *, struct page *);
        int (*launder_page) (struct page *);
@@ -748,11 +746,6 @@ struct address_space_operations {
         and transfer data directly between the storage and the
         application's address space.
 
-  get_xip_page: called by the VM to translate a block number to a page.
-       The page is valid until the corresponding filesystem is unmounted.
-       Filesystems that want to use execute-in-place (XIP) need to implement
-       it.  An example implementation can be found in fs/ext2/xip.c.
-
   migrate_page:  This is used to compact the physical memory usage.
         If the VM wants to relocate a page (maybe off a memory card
         that is signalling imminent failure) it will pass a new page
diff --git a/Documentation/filesystems/xip.txt b/Documentation/filesystems/xip.txt
deleted file mode 100644 (file)
index b774729..0000000
+++ /dev/null
@@ -1,71 +0,0 @@
-Execute-in-place for file mappings
-----------------------------------
-
-Motivation
-----------
-File mappings are performed by mapping page cache pages to userspace. In
-addition, read&write type file operations also transfer data from/to the page
-cache.
-
-For memory backed storage devices that use the block device interface, the page
-cache pages are in fact copies of the original storage. Various approaches
-exist to work around the need for an extra copy. The ramdisk driver for example
-does read the data into the page cache, keeps a reference, and discards the
-original data behind later on.
-
-Execute-in-place solves this issue the other way around: instead of keeping
-data in the page cache, the need to have a page cache copy is eliminated
-completely. With execute-in-place, read&write type operations are performed
-directly from/to the memory backed storage device. For file mappings, the
-storage device itself is mapped directly into userspace.
-
-This implementation was initially written for shared memory segments between
-different virtual machines on s390 hardware to allow multiple machines to
-share the same binaries and libraries.
-
-Implementation
---------------
-Execute-in-place is implemented in three steps: block device operation,
-address space operation, and file operations.
-
-A block device operation named direct_access is used to translate the
-block device sector number to a page frame number (pfn) that identifies
-the physical page for the memory.  It also returns a kernel virtual
-address that can be used to access the memory.
-
-The direct_access method takes a 'size' parameter that indicates the
-number of bytes being requested.  The function should return the number
-of bytes that can be contiguously accessed at that offset.  It may also
-return a negative errno if an error occurs.
-
-The block device operation is optional, these block devices support it as of
-today:
-- dcssblk: s390 dcss block device driver
-
-An address space operation named get_xip_mem is used to retrieve references
-to a page frame number and a kernel address. To obtain these values a reference
-to an address_space is provided. This function assigns values to the kmem and
-pfn parameters. The third argument indicates whether the function should allocate
-blocks if needed.
-
-This address space operation is mutually exclusive with readpage&writepage that
-do page cache read/write operations.
-The following filesystems support it as of today:
-- ext2: the second extended filesystem, see Documentation/filesystems/ext2.txt
-
-A set of file operations that do utilize get_xip_page can be found in
-mm/filemap_xip.c . The following file operation implementations are provided:
-- aio_read/aio_write
-- readv/writev
-- sendfile
-
-The generic file operations do_sync_read/do_sync_write can be used to implement
-classic synchronous IO calls.
-
-Shortcomings
-------------
-This implementation is limited to storage devices that are cpu addressable at
-all times (no highmem or such). It works well on rom/ram, but enhancements are
-needed to make it work with flash in read+write mode.
-Putting the Linux kernel and/or its modules on a xip filesystem does not mean
-they are not copied.
index 0beaaac20a83e89058d6ad90488038a60e43aa15..e75c2184081567245e646c0cc55ff469f3d805bb 100644 (file)
@@ -34,7 +34,7 @@ trivial patch so apply some common sense.
        generalized kernel feature ready for next time.
 
        PLEASE check your patch with the automated style checker
-       (scripts/checkpatch.pl) to catch trival style violations.
+       (scripts/checkpatch.pl) to catch trivial style violations.
        See Documentation/CodingStyle for guidance here.
 
        PLEASE CC: the maintainers and mailing lists that are generated
@@ -2965,6 +2965,12 @@ S:       Supported
 F:     drivers/input/touchscreen/cyttsp*
 F:     include/linux/input/cyttsp.h
 
+DALLAS/MAXIM DS1685-FAMILY REAL TIME CLOCK
+M:     Joshua Kinard <kumba@gentoo.org>
+S:     Maintained
+F:     drivers/rtc/rtc-ds1685.c
+F:     include/linux/rtc/ds1685.h
+
 DAMA SLAVE for AX.25
 M:     Joerg Reuter <jreuter@yaina.de>
 W:     http://yaina.de/jreuter/
@@ -3153,6 +3159,12 @@ L:       linux-i2c@vger.kernel.org
 S:     Maintained
 F:     drivers/i2c/busses/i2c-diolan-u2c.c
 
+DIRECT ACCESS (DAX)
+M:     Matthew Wilcox <willy@linux.intel.com>
+L:     linux-fsdevel@vger.kernel.org
+S:     Supported
+F:     fs/dax.c
+
 DIRECTORY NOTIFICATION (DNOTIFY)
 M:     Eric Paris <eparis@parisplace.org>
 S:     Maintained
@@ -6212,6 +6224,26 @@ S:       Supported
 F:     drivers/power/max14577_charger.c
 F:     drivers/power/max77693_charger.c
 
+MAXIM PMIC AND MUIC DRIVERS FOR EXYNOS BASED BOARDS
+M:     Chanwoo Choi <cw00.choi@samsung.com>
+M:     Krzysztof Kozlowski <k.kozlowski@samsung.com>
+L:     linux-kernel@vger.kernel.org
+S:     Supported
+F:     drivers/*/max14577.c
+F:     drivers/*/max77686.c
+F:     drivers/*/max77693.c
+F:     drivers/extcon/extcon-max14577.c
+F:     drivers/extcon/extcon-max77693.c
+F:     drivers/rtc/rtc-max77686.c
+F:     drivers/clk/clk-max77686.c
+F:     Documentation/devicetree/bindings/mfd/max14577.txt
+F:     Documentation/devicetree/bindings/mfd/max77686.txt
+F:     Documentation/devicetree/bindings/mfd/max77693.txt
+F:     Documentation/devicetree/bindings/clock/maxim,max77686.txt
+F:     include/linux/mfd/max14577*.h
+F:     include/linux/mfd/max77686*.h
+F:     include/linux/mfd/max77693*.h
+
 MAXIRADIO FM RADIO RECEIVER DRIVER
 M:     Hans Verkuil <hverkuil@xs4all.nl>
 L:     linux-media@vger.kernel.org
index ab1dc0a56cddc0209518842436d37403cb927278..174571232ea5e1bdad51daf3a068187de054c331 100644 (file)
@@ -58,7 +58,7 @@
        status = "okay";
 
        isl9305: isl9305@68 {
-               compatible = "isl,isl9305";
+               compatible = "isil,isl9305";
                reg = <0x68>;
 
                regulators {
index 14bdcbd3167021e1f797bd27813ea59044093836..64b52b1cf5425dcaad0a9cc06ea77037675aa88f 100644 (file)
@@ -333,8 +333,8 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
 /*
  * Encode and decode a swap entry.
  * Note that the bits we use in a PTE for representing a swap entry
- * must not include the _PAGE_PRESENT bit, the _PAGE_FILE bit, or the
- *_PAGE_HASHPTE bit (if used).  -- paulus
+ * must not include the _PAGE_PRESENT bit or the _PAGE_HASHPTE bit (if used).
+ *   -- paulus
  */
 #define __swp_type(entry)              ((entry).val & 0x1f)
 #define __swp_offset(entry)            ((entry).val >> 5)
@@ -342,11 +342,6 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
 #define __pte_to_swp_entry(pte)                ((swp_entry_t) { pte_val(pte) >> 3 })
 #define __swp_entry_to_pte(x)          ((pte_t) { (x).val << 3 })
 
-/* Encode and decode a nonlinear file mapping entry */
-#define PTE_FILE_MAX_BITS      29
-#define pte_to_pgoff(pte)      (pte_val(pte) >> 3)
-#define pgoff_to_pte(off)      ((pte_t) { ((off) << 3) | _PAGE_FILE })
-
 #ifndef CONFIG_PPC_4K_PAGES
 void pgtable_cache_init(void);
 #else
index d46532ccc386a380341fffa16ef40a77e0988502..43e6ad424c7fc30503db061360fbd1565811b17d 100644 (file)
@@ -352,9 +352,6 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
 #define __swp_entry(type, offset) ((swp_entry_t){((type)<< 1)|((offset)<<8)})
 #define __pte_to_swp_entry(pte)        ((swp_entry_t){pte_val(pte) >> PTE_RPN_SHIFT})
 #define __swp_entry_to_pte(x)  ((pte_t) { (x).val << PTE_RPN_SHIFT })
-#define pte_to_pgoff(pte)      (pte_val(pte) >> PTE_RPN_SHIFT)
-#define pgoff_to_pte(off)      ((pte_t) {((off) << PTE_RPN_SHIFT)|_PAGE_FILE})
-#define PTE_FILE_MAX_BITS      (BITS_PER_LONG - PTE_RPN_SHIFT)
 
 void pgtable_cache_add(unsigned shift, void (*ctor)(void *));
 void pgtable_cache_init(void);
@@ -389,7 +386,7 @@ void pgtable_cache_init(void);
  * The last three bits are intentionally left to zero. This memory location
  * are also used as normal page PTE pointers. So if we have any pointers
  * left around while we collapse a hugepage, we need to make sure
- * _PAGE_PRESENT and _PAGE_FILE bits of that are zero when we look at them
+ * _PAGE_PRESENT bit of that is zero when we look at them
  */
 static inline unsigned int hpte_valid(unsigned char *hpte_slot_array, int index)
 {
index 79fee2eb8d56f114e7e72305d0ad5b5c8e41379f..9835ac4173b75c2c3db2ec14e062516a117c27bf 100644 (file)
@@ -34,7 +34,6 @@ static inline int pte_write(pte_t pte)
 {      return (pte_val(pte) & (_PAGE_RW | _PAGE_RO)) != _PAGE_RO; }
 static inline int pte_dirty(pte_t pte)         { return pte_val(pte) & _PAGE_DIRTY; }
 static inline int pte_young(pte_t pte)         { return pte_val(pte) & _PAGE_ACCESSED; }
-static inline int pte_file(pte_t pte)          { return pte_val(pte) & _PAGE_FILE; }
 static inline int pte_special(pte_t pte)       { return pte_val(pte) & _PAGE_SPECIAL; }
 static inline int pte_none(pte_t pte)          { return (pte_val(pte) & ~_PTE_NONE_MASK) == 0; }
 static inline pgprot_t pte_pgprot(pte_t pte)   { return __pgprot(pte_val(pte) & PAGE_PROT_BITS); }
index ec0b0b0d1df9864aa6f7520cb56397e3ef4e7ea6..486b1ef813387975832055c2b2079b3d53bd98d9 100644 (file)
@@ -38,7 +38,6 @@
  */
 
 #define        _PAGE_GUARDED   0x001   /* G: page is guarded from prefetch */
-#define _PAGE_FILE     0x001   /* when !present: nonlinear file mapping */
 #define _PAGE_PRESENT  0x002   /* software: PTE contains a translation */
 #define        _PAGE_NO_CACHE  0x004   /* I: caching is inhibited */
 #define        _PAGE_WRITETHRU 0x008   /* W: caching is write-through */
index 4192b9bad90164b41d928685547fa32ddf0aafd0..36f75fab23f52414d6c1dda923db8d6199b4cc1b 100644 (file)
@@ -44,9 +44,6 @@
  *   - PRESENT *must* be in the bottom three bits because swap cache
  *     entries use the top 29 bits for TLB2.
  *
- *   - FILE *must* be in the bottom three bits because swap cache
- *     entries use the top 29 bits for TLB2.
- *
  *   - CACHE COHERENT bit (M) has no effect on original PPC440 cores,
  *     because it doesn't support SMP. However, some later 460 variants
  *     have -some- form of SMP support and so I keep the bit there for
@@ -68,7 +65,6 @@
  *
  * There are three protection bits available for SWAP entry:
  *     _PAGE_PRESENT
- *     _PAGE_FILE
  *     _PAGE_HASHPTE (if HW has)
  *
  * So those three bits have to be inside of 0-2nd LSB of PTE.
@@ -77,7 +73,6 @@
 
 #define _PAGE_PRESENT  0x00000001              /* S: PTE valid */
 #define _PAGE_RW       0x00000002              /* S: Write permission */
-#define _PAGE_FILE     0x00000004              /* S: nonlinear file mapping */
 #define _PAGE_EXEC     0x00000004              /* H: Execute permission */
 #define _PAGE_ACCESSED 0x00000008              /* S: Page referenced */
 #define _PAGE_DIRTY    0x00000010              /* S: Page dirty */
index eb6edb44f14042a62ba9c4b718441615537761b9..97bae64afdaabd70ee30768f3dc785410cba1aa0 100644 (file)
@@ -29,7 +29,6 @@
 
 /* Definitions for 8xx embedded chips. */
 #define _PAGE_PRESENT  0x0001  /* Page is valid */
-#define _PAGE_FILE     0x0002  /* when !present: nonlinear file mapping */
 #define _PAGE_NO_CACHE 0x0002  /* I: cache inhibit */
 #define _PAGE_SHARED   0x0004  /* No ASID (context) compare */
 #define _PAGE_SPECIAL  0x0008  /* SW entry, forced to 0 by the TLB miss */
index 576ad88104cbb4596a3b389eb64e9eb6f3a31e5e..91a704952ca1a96234b9088c4493b6371bbbefcf 100644 (file)
@@ -10,7 +10,6 @@
 
 /* Architected bits */
 #define _PAGE_PRESENT  0x000001 /* software: pte contains a translation */
-#define _PAGE_FILE     0x000002 /* (!present only) software: pte holds file offset */
 #define _PAGE_SW1      0x000002
 #define _PAGE_BAP_SR   0x000004
 #define _PAGE_BAP_UR   0x000008
index e84dd7ed505eb7394716e3a583533924b1cdfedb..9f5c3d04a1a3ea3f3da542028b002fe304b9002f 100644 (file)
    - PRESENT *must* be in the bottom three bits because swap cache
      entries use the top 29 bits.
 
-   - FILE *must* be in the bottom three bits because swap cache
-     entries use the top 29 bits.
 */
 
 /* Definitions for FSL Book-E Cores */
 #define _PAGE_PRESENT  0x00001 /* S: PTE contains a translation */
 #define _PAGE_USER     0x00002 /* S: User page (maps to UR) */
-#define _PAGE_FILE     0x00002 /* S: when !present: nonlinear file mapping */
 #define _PAGE_RW       0x00004 /* S: Write permission (SW) */
 #define _PAGE_DIRTY    0x00008 /* S: Page dirty */
 #define _PAGE_EXEC     0x00010 /* H: SX permission */
index 4aad4132d0a87fa3e677ebe00b05d0d317fe7292..62cfb0c663bb9e513bfded47604f007dc3222865 100644 (file)
@@ -18,7 +18,6 @@
 
 #define _PAGE_PRESENT  0x001   /* software: pte contains a translation */
 #define _PAGE_HASHPTE  0x002   /* hash_page has made an HPTE for this pte */
-#define _PAGE_FILE     0x004   /* when !present: nonlinear file mapping */
 #define _PAGE_USER     0x004   /* usermode access allowed */
 #define _PAGE_GUARDED  0x008   /* G: prohibit speculative access */
 #define _PAGE_COHERENT 0x010   /* M: enforce memory coherence (SMP systems) */
index 55aea0caf95ebc78affe30e78d221bf5c00006cb..fc852f7e7b3a63f86e94f0cb43b62eb2b11fc3a3 100644 (file)
@@ -16,7 +16,6 @@
  */
 #define _PAGE_PRESENT          0x0001 /* software: pte contains a translation */
 #define _PAGE_USER             0x0002 /* matches one of the PP bits */
-#define _PAGE_FILE             0x0002 /* (!present only) software: pte holds file offset */
 #define _PAGE_EXEC             0x0004 /* No execute on POWER4 and newer (we invert) */
 #define _PAGE_GUARDED          0x0008
 /* We can derive Memory coherence from _PAGE_NO_CACHE */
index 91bb8836825a6f516a0000525b9f44b69799d42b..6957cc1ca0a7f42b2980161b04378f2d56d5b964 100644 (file)
@@ -782,7 +782,7 @@ pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
 {
        pmd_t pmd;
        /*
-        * For a valid pte, we would have _PAGE_PRESENT or _PAGE_FILE always
+        * For a valid pte, we would have _PAGE_PRESENT always
         * set. We use this to check THP page at pmd level.
         * leaf pte for huge page, bottom two bits != 00
         */
index 014a1cfc41c51fe5d69f0d76a31607d1834ba953..1b8094d4d7af70f38e38bd9d3f5736a55f31dd82 100644 (file)
@@ -393,14 +393,15 @@ config BLK_DEV_RAM_SIZE
          The default value is 4096 kilobytes. Only change this if you know
          what you are doing.
 
-config BLK_DEV_XIP
-       bool "Support XIP filesystems on RAM block device"
-       depends on BLK_DEV_RAM
+config BLK_DEV_RAM_DAX
+       bool "Support Direct Access (DAX) to RAM block devices"
+       depends on BLK_DEV_RAM && FS_DAX
        default n
        help
-         Support XIP filesystems (such as ext2 with XIP support on) on
-         top of block ram device. This will slightly enlarge the kernel, and
-         will prevent RAM block device backing store memory from being
+         Support filesystems using DAX to access RAM block devices.  This
+         avoids double-buffering data in the page cache before copying it
+         to the block device.  Answering Y will slightly enlarge the kernel,
+         and will prevent RAM block device backing store memory from being
          allocated from highmem (only a problem for highmem systems).
 
 config CDROM_PKTCDVD
index c01b921b1b4a8a492188dee63c9ace3665aa2498..64ab4951e9d678f737423439bce5fce7b28f72fc 100644 (file)
@@ -97,13 +97,13 @@ static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
         * Must use NOIO because we don't want to recurse back into the
         * block or filesystem layers from page reclaim.
         *
-        * Cannot support XIP and highmem, because our ->direct_access
-        * routine for XIP must return memory that is always addressable.
-        * If XIP was reworked to use pfns and kmap throughout, this
+        * Cannot support DAX and highmem, because our ->direct_access
+        * routine for DAX must return memory that is always addressable.
+        * If DAX was reworked to use pfns and kmap throughout, this
         * restriction might be able to be lifted.
         */
        gfp_flags = GFP_NOIO | __GFP_ZERO;
-#ifndef CONFIG_BLK_DEV_XIP
+#ifndef CONFIG_BLK_DEV_RAM_DAX
        gfp_flags |= __GFP_HIGHMEM;
 #endif
        page = alloc_page(gfp_flags);
@@ -369,7 +369,7 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector,
        return err;
 }
 
-#ifdef CONFIG_BLK_DEV_XIP
+#ifdef CONFIG_BLK_DEV_RAM_DAX
 static long brd_direct_access(struct block_device *bdev, sector_t sector,
                        void **kaddr, unsigned long *pfn, long size)
 {
@@ -390,6 +390,8 @@ static long brd_direct_access(struct block_device *bdev, sector_t sector,
         */
        return PAGE_SIZE;
 }
+#else
+#define brd_direct_access NULL
 #endif
 
 static int brd_ioctl(struct block_device *bdev, fmode_t mode,
@@ -430,9 +432,7 @@ static const struct block_device_operations brd_fops = {
        .owner =                THIS_MODULE,
        .rw_page =              brd_rw_page,
        .ioctl =                brd_ioctl,
-#ifdef CONFIG_BLK_DEV_XIP
        .direct_access =        brd_direct_access,
-#endif
 };
 
 /*
index 3bc9ddbe5cf700e4934515bc8b57e9ef487fff51..0cf2e1d9cb17b7ecd8fc6289aac2711295e9d96b 100644 (file)
@@ -801,6 +801,96 @@ config RTC_DRV_DS1553
          This driver can also be built as a module. If so, the module
          will be called rtc-ds1553.
 
+config RTC_DRV_DS1685_FAMILY
+       tristate "Dallas/Maxim DS1685 Family"
+       help
+         If you say yes here you get support for the Dallas/Maxim DS1685
+         family of real time chips.  This family includes the DS1685/DS1687,
+         DS1689/DS1693, DS17285/DS17287, DS17485/DS17487, and
+         DS17885/DS17887 chips.
+
+         This driver can also be built as a module. If so, the module
+         will be called rtc-ds1685.
+
+choice
+       prompt "Subtype"
+       depends on RTC_DRV_DS1685_FAMILY
+       default RTC_DRV_DS1685
+
+config RTC_DRV_DS1685
+       bool "DS1685/DS1687"
+       help
+         This enables support for the Dallas/Maxim DS1685/DS1687 real time
+         clock chip.
+
+         This chip is commonly found in SGI O2 (IP32) and SGI Octane (IP30)
+         systems, as well as EPPC-405-UC modules by electronic system design
+         GmbH.
+
+config RTC_DRV_DS1689
+       bool "DS1689/DS1693"
+       help
+         This enables support for the Dallas/Maxim DS1689/DS1693 real time
+         clock chip.
+
+         This is an older RTC chip, supplanted by the DS1685/DS1687 above,
+         which supports a few minor features such as Vcc, Vbat, and Power
+         Cycle counters, plus a customer-specific, 8-byte ROM/Serial number.
+
+         It also works for the even older DS1688/DS1691 RTC chips, which are
+         virtually the same and carry the same model number.  Both chips
+         have 114 bytes of user NVRAM.
+
+config RTC_DRV_DS17285
+       bool "DS17285/DS17287"
+       help
+         This enables support for the Dallas/Maxim DS17285/DS17287 real time
+         clock chip.
+
+         This chip features 2kb of extended NV-SRAM.  It may possibly be
+         found in some SGI O2 systems (rare).
+
+config RTC_DRV_DS17485
+       bool "DS17485/DS17487"
+       help
+         This enables support for the Dallas/Maxim DS17485/DS17487 real time
+         clock chip.
+
+         This chip features 4kb of extended NV-SRAM.
+
+config RTC_DRV_DS17885
+       bool "DS17885/DS17887"
+       help
+         This enables support for the Dallas/Maxim DS17885/DS17887 real time
+         clock chip.
+
+         This chip features 8kb of extended NV-SRAM.
+
+endchoice
+
+config RTC_DS1685_PROC_REGS
+       bool "Display register values in /proc"
+       depends on RTC_DRV_DS1685_FAMILY && PROC_FS
+       help
+         Enable this to display a readout of all of the RTC registers in
+         /proc/drivers/rtc.  Keep in mind that this can potentially lead
+         to lost interrupts, as reading Control Register C will clear
+         all pending IRQ flags.
+
+         Unless you are debugging this driver, choose N.
+
+config RTC_DS1685_SYSFS_REGS
+       bool "SysFS access to RTC register bits"
+       depends on RTC_DRV_DS1685_FAMILY && SYSFS
+       help
+         Enable this to provide access to the RTC control register bits
+         in /sys.  Some of the bits are read-write, others are read-only.
+
+         Keep in mind that reading Control C's bits automatically clears
+         all pending IRQ flags - this can cause lost interrupts.
+
+         If you know that you need access to these bits, choose Y; else N.
+
 config RTC_DRV_DS1742
        tristate "Maxim/Dallas DS1742/1743"
        depends on HAS_IOMEM
index 99ded8b75e9523826f79424fac92aedc1a1b1f57..69c87062b098e3e74fa4fad3c7f059b695cd3f8e 100644 (file)
@@ -54,6 +54,7 @@ obj-$(CONFIG_RTC_DRV_DS1390)  += rtc-ds1390.o
 obj-$(CONFIG_RTC_DRV_DS1511)   += rtc-ds1511.o
 obj-$(CONFIG_RTC_DRV_DS1553)   += rtc-ds1553.o
 obj-$(CONFIG_RTC_DRV_DS1672)   += rtc-ds1672.o
+obj-$(CONFIG_RTC_DRV_DS1685_FAMILY)    += rtc-ds1685.o
 obj-$(CONFIG_RTC_DRV_DS1742)   += rtc-ds1742.o
 obj-$(CONFIG_RTC_DRV_DS2404)    += rtc-ds2404.o
 obj-$(CONFIG_RTC_DRV_DS3232)   += rtc-ds3232.o
diff --git a/drivers/rtc/rtc-ds1685.c b/drivers/rtc/rtc-ds1685.c
new file mode 100644 (file)
index 0000000..8c3bfcb
--- /dev/null
@@ -0,0 +1,2252 @@
+/*
+ * An rtc driver for the Dallas/Maxim DS1685/DS1687 and related real-time
+ * chips.
+ *
+ * Copyright (C) 2011-2014 Joshua Kinard <kumba@gentoo.org>.
+ * Copyright (C) 2009 Matthias Fuchs <matthias.fuchs@esd-electronics.com>.
+ *
+ * References:
+ *    DS1685/DS1687 3V/5V Real-Time Clocks, 19-5215, Rev 4/10.
+ *    DS17x85/DS17x87 3V/5V Real-Time Clocks, 19-5222, Rev 4/10.
+ *    DS1689/DS1693 3V/5V Serialized Real-Time Clocks, Rev 112105.
+ *    Application Note 90, Using the Multiplex Bus RTC Extended Features.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/bcd.h>
+#include <linux/delay.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/rtc.h>
+#include <linux/workqueue.h>
+
+#include <linux/rtc/ds1685.h>
+
+#ifdef CONFIG_PROC_FS
+#include <linux/proc_fs.h>
+#endif
+
+#define DRV_VERSION    "0.42.0"
+
+
+/* ----------------------------------------------------------------------- */
+/* Standard read/write functions if platform does not provide overrides */
+
+/**
+ * ds1685_read - default accessor that fetches one byte from an rtc register.
+ * @rtc: pointer to the ds1685 rtc structure.
+ * @reg: the register address to read.
+ *
+ * The byte is read from rtc->regs offset by @reg scaled by rtc->regstep.
+ * Returns the value read.
+ */
+static u8
+ds1685_read(struct ds1685_priv *rtc, int reg)
+{
+       u8 __iomem *addr = (u8 __iomem *)rtc->regs;
+
+       addr += (reg * rtc->regstep);
+
+       return readb(addr);
+}
+
+/**
+ * ds1685_write - default accessor that stores one byte to an rtc register.
+ * @rtc: pointer to the ds1685 rtc structure.
+ * @reg: the register address to write.
+ * @value: value to write to the register.
+ *
+ * The byte is written to rtc->regs offset by @reg scaled by rtc->regstep.
+ */
+static void
+ds1685_write(struct ds1685_priv *rtc, int reg, u8 value)
+{
+       u8 __iomem *addr = (u8 __iomem *)rtc->regs;
+
+       addr += (reg * rtc->regstep);
+
+       writeb(value, addr);
+}
+/* ----------------------------------------------------------------------- */
+
+
+/* ----------------------------------------------------------------------- */
+/* Inlined functions */
+
+/**
+ * ds1685_rtc_bcd2bin - mode-aware wrapper around bcd2bin.
+ * @rtc: pointer to the ds1685 rtc structure.
+ * @val: u8 time value to consider converting.
+ * @bcd_mask: u8 mask applied to the converted value when BCD mode is used.
+ * @bin_mask: u8 mask applied to the unconverted value when BIN mode is used.
+ *
+ * Returns @val masked with @bin_mask when rtc->bcd_mode is false; otherwise
+ * returns bcd2bin(@val) masked with @bcd_mask.
+ */
+static inline u8
+ds1685_rtc_bcd2bin(struct ds1685_priv *rtc, u8 val, u8 bcd_mask, u8 bin_mask)
+{
+       /* Binary mode: the register value is already usable as-is. */
+       if (!rtc->bcd_mode)
+               return (val & bin_mask);
+
+       return (bcd2bin(val) & bcd_mask);
+}
+
+/**
+ * ds1685_rtc_bin2bcd - mode-aware wrapper around bin2bcd.
+ * @rtc: pointer to the ds1685 rtc structure.
+ * @val: u8 time value to consider converting.
+ * @bin_mask: u8 mask applied to the unconverted value when BIN mode is used.
+ * @bcd_mask: u8 mask applied to the converted value when BCD mode is used.
+ *
+ * Returns @val masked with @bin_mask when rtc->bcd_mode is false; otherwise
+ * returns bin2bcd(@val) masked with @bcd_mask.
+ */
+static inline u8
+ds1685_rtc_bin2bcd(struct ds1685_priv *rtc, u8 val, u8 bin_mask, u8 bcd_mask)
+{
+       /* Binary mode: store the value without conversion. */
+       if (!rtc->bcd_mode)
+               return (val & bin_mask);
+
+       return (bin2bcd(val) & bcd_mask);
+}
+
+/**
+ * ds1685_rtc_switch_to_bank0 - select register bank 0 on the rtc.
+ * @rtc: pointer to the ds1685 rtc structure.
+ *
+ * Read-modify-write of Control Register A that clears the DV0 bit and
+ * leaves every other bit as it was read.
+ */
+static inline void
+ds1685_rtc_switch_to_bank0(struct ds1685_priv *rtc)
+{
+       u8 ctrla = rtc->read(rtc, RTC_CTRL_A);
+
+       ctrla &= ~(RTC_CTRL_A_DV0);
+       rtc->write(rtc, RTC_CTRL_A, ctrla);
+}
+
+/**
+ * ds1685_rtc_switch_to_bank1 - select register bank 1 on the rtc.
+ * @rtc: pointer to the ds1685 rtc structure.
+ *
+ * Read-modify-write of Control Register A that sets the DV0 bit and
+ * leaves every other bit as it was read.
+ */
+static inline void
+ds1685_rtc_switch_to_bank1(struct ds1685_priv *rtc)
+{
+       u8 ctrla = rtc->read(rtc, RTC_CTRL_A);
+
+       ctrla |= RTC_CTRL_A_DV0;
+       rtc->write(rtc, RTC_CTRL_A, ctrla);
+}
+
+/**
+ * ds1685_rtc_begin_data_access - prepare the rtc for data access.
+ * @rtc: pointer to the ds1685 rtc structure.
+ *
+ * This takes several steps to prepare the rtc for access to get/set time
+ * and alarm values from the rtc registers:
+ *  - Sets the SET bit in Control Register B.
+ *  - Switches the rtc to bank 1.  This allows access to all relevant
+ *    data for normal rtc operation, as bank 0 contains only the nvram.
+ *    It must happen before the next step, because the extended control
+ *    registers are only reachable while bank 1 is selected.
+ *  - Reads Ext Control Register 4A and checks the INCR bit.  While INCR
+ *    is active, the register is re-read in a cpu_relax() loop until INCR
+ *    is inactive.
+ *
+ * Must be paired with ds1685_rtc_end_data_access.
+ */
+static inline void
+ds1685_rtc_begin_data_access(struct ds1685_priv *rtc)
+{
+       /* Set the SET bit in Ctrl B */
+       rtc->write(rtc, RTC_CTRL_B,
+                  (rtc->read(rtc, RTC_CTRL_B) | RTC_CTRL_B_SET));
+
+       /* Switch to Bank 1 so that Ext Ctrl 4A is actually addressable. */
+       ds1685_rtc_switch_to_bank1(rtc);
+
+       /* Read Ext Ctrl 4A and check the INCR bit to avoid a lockout. */
+       while (rtc->read(rtc, RTC_EXT_CTRL_4A) & RTC_CTRL_4A_INCR)
+               cpu_relax();
+}
+
+/**
+ * ds1685_rtc_end_data_access - end data access on the rtc.
+ * @rtc: pointer to the ds1685 rtc structure.
+ *
+ * This ends what was started by ds1685_rtc_begin_data_access:
+ *  - Switches the rtc back to bank 0.
+ *  - Clears the SET bit in Control Register B.
+ */
+static inline void
+ds1685_rtc_end_data_access(struct ds1685_priv *rtc)
+{
+       /* Switch back to Bank 0 */
+       ds1685_rtc_switch_to_bank0(rtc);
+
+       /* Clear the SET bit in Ctrl B */
+       rtc->write(rtc, RTC_CTRL_B,
+                  (rtc->read(rtc, RTC_CTRL_B) & ~(RTC_CTRL_B_SET)));
+}
+
+/**
+ * ds1685_rtc_begin_ctrl_access - prepare the rtc for ctrl access.
+ * @rtc: pointer to the ds1685 rtc structure.
+ * @flags: irq flags variable for spin_lock_irqsave.
+ *
+ * This takes several steps to prepare the rtc for access to read just the
+ * control registers:
+ *  - Takes rtc->lock with spin_lock_irqsave.
+ *  - Switches the rtc to bank 1.  This allows access to the two extended
+ *    control registers.
+ *
+ * Only use this where you are certain another lock will not be held.
+ *
+ * NOTE(review): @flags is received by value, so whatever state
+ * spin_lock_irqsave stores in it stays in this function's local copy and
+ * is not handed back to the caller.  Confirm that callers do not rely on
+ * the saved state reaching ds1685_rtc_end_ctrl_access; fixing this needs a
+ * signature change (e.g. passing a pointer).
+ */
+static inline void
+ds1685_rtc_begin_ctrl_access(struct ds1685_priv *rtc, unsigned long flags)
+{
+       spin_lock_irqsave(&rtc->lock, flags);
+       ds1685_rtc_switch_to_bank1(rtc);
+}
+
+/**
+ * ds1685_rtc_end_ctrl_access - end ctrl access on the rtc.
+ * @rtc: pointer to the ds1685 rtc structure.
+ * @flags: irq flags variable for spin_unlock_irqrestore.
+ *
+ * This ends what was started by ds1685_rtc_begin_ctrl_access:
+ *  - Switches the rtc back to bank 0.
+ *  - Releases rtc->lock with spin_unlock_irqrestore, using @flags as the
+ *    state to restore.
+ *
+ * The bank switch is done first so that it still happens under the lock.
+ */
+static inline void
+ds1685_rtc_end_ctrl_access(struct ds1685_priv *rtc, unsigned long flags)
+{
+       ds1685_rtc_switch_to_bank0(rtc);
+       spin_unlock_irqrestore(&rtc->lock, flags);
+}
+
+/**
+ * ds1685_rtc_get_ssn - retrieve the silicon serial number.
+ * @rtc: pointer to the ds1685 rtc structure.
+ * @ssn: u8 array (at least 8 entries) that receives the serial number bytes.
+ *
+ * The silicon serial number is 8 bytes long: ssn[0] is the model number,
+ * ssn[1] through ssn[6] are the serial number digits, and ssn[7] is a CRC
+ * check byte.  On the chip it occupies addresses 0x40 through 0x47.
+ *
+ * These values live in bank1, so ds1685_rtc_switch_to_bank1 must be called
+ * before this function, else the bytes come out of the bank0 NVRAM instead.
+ * Be sure to call ds1685_rtc_switch_to_bank0 when done.
+ */
+static inline void
+ds1685_rtc_get_ssn(struct ds1685_priv *rtc, u8 *ssn)
+{
+       /* Registers in the order their bytes are stored into @ssn. */
+       static const int ssn_regs[] = {
+               RTC_BANK1_SSN_MODEL,
+               RTC_BANK1_SSN_BYTE_1,
+               RTC_BANK1_SSN_BYTE_2,
+               RTC_BANK1_SSN_BYTE_3,
+               RTC_BANK1_SSN_BYTE_4,
+               RTC_BANK1_SSN_BYTE_5,
+               RTC_BANK1_SSN_BYTE_6,
+               RTC_BANK1_SSN_CRC,
+       };
+       unsigned int idx;
+
+       for (idx = 0; idx < (sizeof(ssn_regs) / sizeof(ssn_regs[0])); idx++)
+               ssn[idx] = rtc->read(rtc, ssn_regs[idx]);
+}
+/* ----------------------------------------------------------------------- */
+
+
+/* ----------------------------------------------------------------------- */
+/* Read/Set Time & Alarm functions */
+
+/**
+ * ds1685_rtc_read_time - reads the time registers.
+ * @dev: pointer to device structure.
+ * @tm: pointer to rtc_time structure.
+ *
+ * All register reads happen between ds1685_rtc_begin_data_access and
+ * ds1685_rtc_end_data_access; conversion to struct rtc_time is done
+ * afterwards from the local copies.
+ *
+ * Fixups applied while filling @tm:
+ *  - tm_wday and tm_mon are the register values minus one.
+ *  - tm_year is (years + century * 100) - 1900.
+ *  - tm_yday is computed with rtc_year_days.
+ *  - tm_isdst is always 0.
+ *
+ * Control Register B is also read into ctrlb, but the value is not used
+ * anywhere else in this function.
+ *
+ * Returns the result of rtc_valid_tm on the filled-in @tm.
+ */
+static int
+ds1685_rtc_read_time(struct device *dev, struct rtc_time *tm)
+{
+       struct platform_device *pdev = to_platform_device(dev);
+       struct ds1685_priv *rtc = platform_get_drvdata(pdev);
+       u8 ctrlb, century;
+       u8 seconds, minutes, hours, wday, mday, month, years;
+
+       /* Fetch the time info from the RTC registers. */
+       ds1685_rtc_begin_data_access(rtc);
+       seconds = rtc->read(rtc, RTC_SECS);
+       minutes = rtc->read(rtc, RTC_MINS);
+       hours   = rtc->read(rtc, RTC_HRS);
+       wday    = rtc->read(rtc, RTC_WDAY);
+       mday    = rtc->read(rtc, RTC_MDAY);
+       month   = rtc->read(rtc, RTC_MONTH);
+       years   = rtc->read(rtc, RTC_YEAR);
+       century = rtc->read(rtc, RTC_CENTURY);
+       ctrlb   = rtc->read(rtc, RTC_CTRL_B);
+       ds1685_rtc_end_data_access(rtc);
+
+       /* bcd2bin if needed, perform fixups, and store to rtc_time. */
+       years        = ds1685_rtc_bcd2bin(rtc, years, RTC_YEAR_BCD_MASK,
+                                         RTC_YEAR_BIN_MASK);
+       century      = ds1685_rtc_bcd2bin(rtc, century, RTC_CENTURY_MASK,
+                                         RTC_CENTURY_MASK);
+       tm->tm_sec   = ds1685_rtc_bcd2bin(rtc, seconds, RTC_SECS_BCD_MASK,
+                                         RTC_SECS_BIN_MASK);
+       tm->tm_min   = ds1685_rtc_bcd2bin(rtc, minutes, RTC_MINS_BCD_MASK,
+                                         RTC_MINS_BIN_MASK);
+       tm->tm_hour  = ds1685_rtc_bcd2bin(rtc, hours, RTC_HRS_24_BCD_MASK,
+                                         RTC_HRS_24_BIN_MASK);
+       tm->tm_wday  = (ds1685_rtc_bcd2bin(rtc, wday, RTC_WDAY_MASK,
+                                          RTC_WDAY_MASK) - 1);
+       tm->tm_mday  = ds1685_rtc_bcd2bin(rtc, mday, RTC_MDAY_BCD_MASK,
+                                         RTC_MDAY_BIN_MASK);
+       tm->tm_mon   = (ds1685_rtc_bcd2bin(rtc, month, RTC_MONTH_BCD_MASK,
+                                          RTC_MONTH_BIN_MASK) - 1);
+       tm->tm_year  = ((years + (century * 100)) - 1900);
+       tm->tm_yday  = rtc_year_days(tm->tm_mday, tm->tm_mon, tm->tm_year);
+       tm->tm_isdst = 0; /* RTC has hardcoded timezone, so don't use. */
+
+       return rtc_valid_tm(tm);
+}
+
+/**
+ * ds1685_rtc_set_time - sets the time registers.
+ * @dev: pointer to device structure.
+ * @tm: pointer to rtc_time structure.
+ *
+ * The fields of @tm are first converted to register format (tm_wday and
+ * tm_mon are stored plus one; the year is split into tm_year % 100 and a
+ * century of (tm_year + 1900) / 100).  The values are then sanity checked,
+ * and only if every check passes is the hardware touched.
+ *
+ * The register writes happen between ds1685_rtc_begin_data_access and
+ * ds1685_rtc_end_data_access.  Before the time registers are written, the
+ * DM bit in Control Register B is cleared when rtc->bcd_mode is set and
+ * set otherwise.
+ *
+ * Returns 0 on success, or -EDOM if a sanity check fails (no register is
+ * written in that case).
+ */
+static int
+ds1685_rtc_set_time(struct device *dev, struct rtc_time *tm)
+{
+       struct platform_device *pdev = to_platform_device(dev);
+       struct ds1685_priv *rtc = platform_get_drvdata(pdev);
+       u8 ctrlb, seconds, minutes, hours, wday, mday, month, years, century;
+
+       /* Fetch the time info from rtc_time. */
+       seconds = ds1685_rtc_bin2bcd(rtc, tm->tm_sec, RTC_SECS_BIN_MASK,
+                                    RTC_SECS_BCD_MASK);
+       minutes = ds1685_rtc_bin2bcd(rtc, tm->tm_min, RTC_MINS_BIN_MASK,
+                                    RTC_MINS_BCD_MASK);
+       hours   = ds1685_rtc_bin2bcd(rtc, tm->tm_hour, RTC_HRS_24_BIN_MASK,
+                                    RTC_HRS_24_BCD_MASK);
+       wday    = ds1685_rtc_bin2bcd(rtc, (tm->tm_wday + 1), RTC_WDAY_MASK,
+                                    RTC_WDAY_MASK);
+       mday    = ds1685_rtc_bin2bcd(rtc, tm->tm_mday, RTC_MDAY_BIN_MASK,
+                                    RTC_MDAY_BCD_MASK);
+       month   = ds1685_rtc_bin2bcd(rtc, (tm->tm_mon + 1), RTC_MONTH_BIN_MASK,
+                                    RTC_MONTH_BCD_MASK);
+       years   = ds1685_rtc_bin2bcd(rtc, (tm->tm_year % 100),
+                                    RTC_YEAR_BIN_MASK, RTC_YEAR_BCD_MASK);
+       century = ds1685_rtc_bin2bcd(rtc, ((tm->tm_year + 1900) / 100),
+                                    RTC_CENTURY_MASK, RTC_CENTURY_MASK);
+
+       /*
+        * Perform Sanity Checks:
+        *   - Months: !> 12, Month Day != 0.
+        *   - Month Day !> Max days in current month.
+        *   - Hours !>= 24, Mins !>= 60, Secs !>= 60, & Weekday !> 7.
+        *
+        * Note that the month-day-zero and weekday checks look at the
+        * already converted register values, the others at the raw
+        * rtc_time fields.
+        */
+       if ((tm->tm_mon > 11) || (mday == 0))
+               return -EDOM;
+
+       if (tm->tm_mday > rtc_month_days(tm->tm_mon, tm->tm_year))
+               return -EDOM;
+
+       if ((tm->tm_hour >= 24) || (tm->tm_min >= 60) ||
+           (tm->tm_sec >= 60)  || (wday > 7))
+               return -EDOM;
+
+       /*
+        * Set the data mode to use and store the time values in the
+        * RTC registers.
+        */
+       ds1685_rtc_begin_data_access(rtc);
+       ctrlb = rtc->read(rtc, RTC_CTRL_B);
+       if (rtc->bcd_mode)
+               ctrlb &= ~(RTC_CTRL_B_DM);
+       else
+               ctrlb |= RTC_CTRL_B_DM;
+       rtc->write(rtc, RTC_CTRL_B, ctrlb);
+       rtc->write(rtc, RTC_SECS, seconds);
+       rtc->write(rtc, RTC_MINS, minutes);
+       rtc->write(rtc, RTC_HRS, hours);
+       rtc->write(rtc, RTC_WDAY, wday);
+       rtc->write(rtc, RTC_MDAY, mday);
+       rtc->write(rtc, RTC_MONTH, month);
+       rtc->write(rtc, RTC_YEAR, years);
+       rtc->write(rtc, RTC_CENTURY, century);
+       ds1685_rtc_end_data_access(rtc);
+
+       return 0;
+}
+
+/**
+ * ds1685_rtc_read_alarm - reads the alarm registers.
+ * @dev: pointer to device structure.
+ * @alrm: pointer to rtc_wkalrm structure.
+ *
+ * There are three primary alarm registers: seconds, minutes, and hours.
+ * A fourth alarm register for the month date is also available in bank1 for
+ * kickstart/wakeup features.  The DS1685/DS1687 manual states that a
+ * "don't care" value ranging from 0xc0 to 0xff may be written into one or
+ * more of the three alarm bytes to act as a wildcard value.  The fourth
+ * byte doesn't support a "don't care" value.
+ *
+ * Note that Control Register C is read here to report the pending state.
+ *
+ * Returns 0 on success, or -EDOM if the month date register does not hold
+ * a day in the range 1-31.
+ */
+static int
+ds1685_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm)
+{
+       struct platform_device *pdev = to_platform_device(dev);
+       struct ds1685_priv *rtc = platform_get_drvdata(pdev);
+       u8 seconds, minutes, hours, mday, ctrlb, ctrlc;
+
+       /* Fetch the alarm info from the RTC alarm registers. */
+       ds1685_rtc_begin_data_access(rtc);
+       seconds = rtc->read(rtc, RTC_SECS_ALARM);
+       minutes = rtc->read(rtc, RTC_MINS_ALARM);
+       hours   = rtc->read(rtc, RTC_HRS_ALARM);
+       mday    = rtc->read(rtc, RTC_MDAY_ALARM);
+       ctrlb   = rtc->read(rtc, RTC_CTRL_B);
+       ctrlc   = rtc->read(rtc, RTC_CTRL_C);
+       ds1685_rtc_end_data_access(rtc);
+
+       /*
+        * Check month date.  mday is still the raw register value here, so
+        * the valid range depends on the data mode: 0x01-0x31 with a decimal
+        * low nibble in BCD mode, 1-31 in binary mode.
+        */
+       if (rtc->bcd_mode) {
+               if ((mday < 0x01) || (mday > 0x31) || ((mday & 0x0f) > 0x09))
+                       return -EDOM;
+       } else if ((mday < 1) || (mday > 31)) {
+               return -EDOM;
+       }
+
+       /*
+        * Check the three alarm bytes.
+        *
+        * The Linux RTC system doesn't support the "don't care" capability
+        * of this RTC chip.  We check for it anyways in case support is
+        * added in the future.  The values are u8, so anything at or above
+        * 0xc0 is within the 0xc0-0xff "don't care" range.
+        */
+       if (unlikely(seconds >= 0xc0))
+               alrm->time.tm_sec = -1;
+       else
+               alrm->time.tm_sec = ds1685_rtc_bcd2bin(rtc, seconds,
+                                                      RTC_SECS_BCD_MASK,
+                                                      RTC_SECS_BIN_MASK);
+
+       if (unlikely(minutes >= 0xc0))
+               alrm->time.tm_min = -1;
+       else
+               alrm->time.tm_min = ds1685_rtc_bcd2bin(rtc, minutes,
+                                                      RTC_MINS_BCD_MASK,
+                                                      RTC_MINS_BIN_MASK);
+
+       if (unlikely(hours >= 0xc0))
+               alrm->time.tm_hour = -1;
+       else
+               alrm->time.tm_hour = ds1685_rtc_bcd2bin(rtc, hours,
+                                                       RTC_HRS_24_BCD_MASK,
+                                                       RTC_HRS_24_BIN_MASK);
+
+       /* Write the data to rtc_wkalrm. */
+       alrm->time.tm_mday = ds1685_rtc_bcd2bin(rtc, mday, RTC_MDAY_BCD_MASK,
+                                               RTC_MDAY_BIN_MASK);
+       alrm->time.tm_mon = -1;
+       alrm->time.tm_year = -1;
+       alrm->time.tm_wday = -1;
+       alrm->time.tm_yday = -1;
+       alrm->time.tm_isdst = -1;
+       alrm->enabled = !!(ctrlb & RTC_CTRL_B_AIE);
+       alrm->pending = !!(ctrlc & RTC_CTRL_C_AF);
+
+       return 0;
+}
+
+/**
+ * ds1685_rtc_set_alarm - sets the alarm in registers.
+ * @dev: pointer to device structure.
+ * @alrm: pointer to rtc_wkalrm structure.
+ *
+ * Programs the seconds, minutes, hours and month date alarm registers from
+ * @alrm->time.  The alarm interrupt is disabled while the registers are
+ * rewritten and is re-enabled afterwards only if @alrm->enabled is set.
+ *
+ * As a side effect, the tm_mon, tm_year, tm_wday, tm_yday and tm_isdst
+ * fields of @alrm->time are overwritten with -1.
+ *
+ * Returns 0 on success, or -EDOM if @alrm->time.tm_mday is not in the
+ * range 1-31 (nothing is modified in that case).
+ */
+static int
+ds1685_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
+{
+       struct platform_device *pdev = to_platform_device(dev);
+       struct ds1685_priv *rtc = platform_get_drvdata(pdev);
+       u8 ctrlb, seconds, minutes, hours, mday;
+
+       /*
+        * Check the month date for validity.  This is done on the plain
+        * rtc_time value, before any BCD conversion or masking, so that
+        * both 0 and anything above 31 are rejected regardless of the
+        * data mode in use.
+        */
+       if ((alrm->time.tm_mday < 1) || (alrm->time.tm_mday > 31))
+               return -EDOM;
+
+       /* Fetch the alarm info and convert to BCD. */
+       seconds = ds1685_rtc_bin2bcd(rtc, alrm->time.tm_sec,
+                                    RTC_SECS_BIN_MASK,
+                                    RTC_SECS_BCD_MASK);
+       minutes = ds1685_rtc_bin2bcd(rtc, alrm->time.tm_min,
+                                    RTC_MINS_BIN_MASK,
+                                    RTC_MINS_BCD_MASK);
+       hours   = ds1685_rtc_bin2bcd(rtc, alrm->time.tm_hour,
+                                    RTC_HRS_24_BIN_MASK,
+                                    RTC_HRS_24_BCD_MASK);
+       mday    = ds1685_rtc_bin2bcd(rtc, alrm->time.tm_mday,
+                                    RTC_MDAY_BIN_MASK,
+                                    RTC_MDAY_BCD_MASK);
+
+       /*
+        * Check the three alarm bytes.
+        *
+        * The Linux RTC system doesn't support the "don't care" capability
+        * of this RTC chip because rtc_valid_tm tries to validate every
+        * field, and we only support four fields.  We put the support
+        * here anyways for the future.  The values are u8, so anything at
+        * or above 0xc0 is within the 0xc0-0xff "don't care" range.
+        */
+       if (unlikely(seconds >= 0xc0))
+               seconds = 0xff;
+
+       if (unlikely(minutes >= 0xc0))
+               minutes = 0xff;
+
+       if (unlikely(hours >= 0xc0))
+               hours = 0xff;
+
+       alrm->time.tm_mon       = -1;
+       alrm->time.tm_year      = -1;
+       alrm->time.tm_wday      = -1;
+       alrm->time.tm_yday      = -1;
+       alrm->time.tm_isdst     = -1;
+
+       /* Disable the alarm interrupt first. */
+       ds1685_rtc_begin_data_access(rtc);
+       ctrlb = rtc->read(rtc, RTC_CTRL_B);
+       rtc->write(rtc, RTC_CTRL_B, (ctrlb & ~(RTC_CTRL_B_AIE)));
+
+       /* Read ctrlc to clear RTC_CTRL_C_AF. */
+       rtc->read(rtc, RTC_CTRL_C);
+
+       /*
+        * Set the data mode to use and store the time values in the
+        * RTC registers.
+        */
+       ctrlb = rtc->read(rtc, RTC_CTRL_B);
+       if (rtc->bcd_mode)
+               ctrlb &= ~(RTC_CTRL_B_DM);
+       else
+               ctrlb |= RTC_CTRL_B_DM;
+       rtc->write(rtc, RTC_CTRL_B, ctrlb);
+       rtc->write(rtc, RTC_SECS_ALARM, seconds);
+       rtc->write(rtc, RTC_MINS_ALARM, minutes);
+       rtc->write(rtc, RTC_HRS_ALARM, hours);
+       rtc->write(rtc, RTC_MDAY_ALARM, mday);
+
+       /* Re-enable the alarm if needed. */
+       if (alrm->enabled) {
+               ctrlb = rtc->read(rtc, RTC_CTRL_B);
+               ctrlb |= RTC_CTRL_B_AIE;
+               rtc->write(rtc, RTC_CTRL_B, ctrlb);
+       }
+
+       /* Done! */
+       ds1685_rtc_end_data_access(rtc);
+
+       return 0;
+}
+/* ----------------------------------------------------------------------- */
+
+
+/* ----------------------------------------------------------------------- */
+/* /dev/rtcX Interface functions */
+
+#ifdef CONFIG_RTC_INTF_DEV
+/**
+ * ds1685_rtc_alarm_irq_enable - replaces ioctl() RTC_AIE on/off.
+ * @dev: pointer to device structure.
+ * @enabled: non-zero to enable the alarm interrupt, zero to disable it.
+ *
+ * Runs under rtc->lock with interrupts saved/restored.  After the AIE bit
+ * in Control Register B has been updated, Control Register C is read to
+ * clear all the flag bits.
+ *
+ * Always returns 0.
+ */
+static int
+ds1685_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled)
+{
+       struct ds1685_priv *rtc = dev_get_drvdata(dev);
+       unsigned long irqflags = 0;
+       u8 ctrlb;
+
+       spin_lock_irqsave(&rtc->lock, irqflags);
+
+       /* Touch only the Alarm IRQ-Enable bit; keep the rest of Ctrl B. */
+       ctrlb = rtc->read(rtc, RTC_CTRL_B);
+       if (enabled)
+               ctrlb |= RTC_CTRL_B_AIE;
+       else
+               ctrlb &= ~(RTC_CTRL_B_AIE);
+       rtc->write(rtc, RTC_CTRL_B, ctrlb);
+
+       /* Reading Control C clears all the flag bits. */
+       rtc->read(rtc, RTC_CTRL_C);
+
+       spin_unlock_irqrestore(&rtc->lock, irqflags);
+
+       return 0;
+}
+#endif
+/* ----------------------------------------------------------------------- */
+
+
+/* ----------------------------------------------------------------------- */
+/* IRQ handler & workqueue. */
+
+/**
+ * ds1685_rtc_irq_handler - IRQ handler.
+ * @irq: IRQ number.
+ * @dev_id: platform device pointer.
+ *
+ * Reads Control Registers B (interrupt-enable bits) and C (flag bits)
+ * under rtc->lock.  If IRQF is set and one of the standard PF/AF/UF flags
+ * is raised, the enabled-and-flagged events are reported to the RTC core
+ * through rtc_update_irq.  If IRQF is set but none of those flags is, the
+ * interrupt is treated as an "extended" one and deferred to rtc->work.
+ *
+ * Returns IRQ_HANDLED when the driver data is not set yet, when a standard
+ * event was reported, or when extended work was scheduled; returns
+ * IRQ_NONE otherwise.
+ */
+static irqreturn_t
+ds1685_rtc_irq_handler(int irq, void *dev_id)
+{
+       struct platform_device *pdev = dev_id;
+       struct ds1685_priv *rtc = platform_get_drvdata(pdev);
+       u8 ctrlb, ctrlc;
+       unsigned long events = 0;
+       u8 num_irqs = 0;
+
+       /* Abort early if the device isn't ready yet (i.e., DEBUG_SHIRQ). */
+       if (unlikely(!rtc))
+               return IRQ_HANDLED;
+
+       /* Ctrlb holds the interrupt-enable bits and ctrlc the flag bits. */
+       spin_lock(&rtc->lock);
+       ctrlb = rtc->read(rtc, RTC_CTRL_B);
+       ctrlc = rtc->read(rtc, RTC_CTRL_C);
+
+       /* Is the IRQF bit set? */
+       if (likely(ctrlc & RTC_CTRL_C_IRQF)) {
+               /*
+                * We need to determine if it was one of the standard
+                * events: PF, AF, or UF.  If so, we handle them and
+                * update the RTC core.  ctrlc carries flag bits, so it is
+                * tested against the Control C flag bits rather than the
+                * Control B interrupt-enable mask.
+                */
+               if (likely(ctrlc & (RTC_CTRL_C_PF | RTC_CTRL_C_AF |
+                                   RTC_CTRL_C_UF))) {
+                       events = RTC_IRQF;
+
+                       /* Check for a periodic interrupt. */
+                       if ((ctrlb & RTC_CTRL_B_PIE) &&
+                           (ctrlc & RTC_CTRL_C_PF)) {
+                               events |= RTC_PF;
+                               num_irqs++;
+                       }
+
+                       /* Check for an alarm interrupt. */
+                       if ((ctrlb & RTC_CTRL_B_AIE) &&
+                           (ctrlc & RTC_CTRL_C_AF)) {
+                               events |= RTC_AF;
+                               num_irqs++;
+                       }
+
+                       /* Check for an update interrupt. */
+                       if ((ctrlb & RTC_CTRL_B_UIE) &&
+                           (ctrlc & RTC_CTRL_C_UF)) {
+                               events |= RTC_UF;
+                               num_irqs++;
+                       }
+
+                       rtc_update_irq(rtc->dev, num_irqs, events);
+               } else {
+                       /*
+                        * One of the "extended" interrupts was received that
+                        * is not recognized by the RTC core.  These need to
+                        * be handled in task context as they can call other
+                        * functions and the time spent in irq context needs
+                        * to be minimized.  Schedule them into a workqueue
+                        * and inform the RTC core that the IRQs were handled.
+                        */
+                       spin_unlock(&rtc->lock);
+                       schedule_work(&rtc->work);
+                       rtc_update_irq(rtc->dev, 0, 0);
+                       return IRQ_HANDLED;
+               }
+       }
+       spin_unlock(&rtc->lock);
+
+       return events ? IRQ_HANDLED : IRQ_NONE;
+}
+
+/**
+ * ds1685_rtc_work_queue - work queue handler.
+ * @work: work_struct containing data to work on in task context.
+ *
+ * Handles the "extended" interrupts in task context.  The whole body runs
+ * with rtc->dev->ops_lock held and with the rtc switched to bank 1; bank 0
+ * is selected again before the mutex is released.
+ *
+ * Ext Control Registers 4A (flags) and 4B (enables) are sampled once at
+ * entry, and three events are then checked in turn: kickstart, wake-up
+ * and ram-clear.  Each one is acted on only if both its enable bit in 4B
+ * and its flag bit in 4A were set in that sample.
+ *
+ * NOTE(review): every flag-clearing write to 4A is derived from the ctrl4a
+ * value sampled at entry rather than from a fresh read - confirm that
+ * this is intended when more than one flag is set at once.
+ */
+static void
+ds1685_rtc_work_queue(struct work_struct *work)
+{
+       struct ds1685_priv *rtc = container_of(work,
+                                              struct ds1685_priv, work);
+       struct platform_device *pdev = to_platform_device(&rtc->dev->dev);
+       struct mutex *rtc_mutex = &rtc->dev->ops_lock;
+       u8 ctrl4a, ctrl4b;
+
+       mutex_lock(rtc_mutex);
+
+       ds1685_rtc_switch_to_bank1(rtc);
+       ctrl4a = rtc->read(rtc, RTC_EXT_CTRL_4A);
+       ctrl4b = rtc->read(rtc, RTC_EXT_CTRL_4B);
+
+       /*
+        * Check for a kickstart interrupt. With Vcc applied, this
+        * typically means that the power button was pressed, so we
+        * begin the shutdown sequence.
+        */
+       if ((ctrl4b & RTC_CTRL_4B_KSE) && (ctrl4a & RTC_CTRL_4A_KF)) {
+               /* Briefly disable kickstarts to debounce button presses. */
+               rtc->write(rtc, RTC_EXT_CTRL_4B,
+                          (rtc->read(rtc, RTC_EXT_CTRL_4B) &
+                           ~(RTC_CTRL_4B_KSE)));
+
+               /* Clear the kickstart flag. */
+               rtc->write(rtc, RTC_EXT_CTRL_4A,
+                          (ctrl4a & ~(RTC_CTRL_4A_KF)));
+
+
+               /*
+                * Sleep 500ms before re-enabling kickstarts.  This allows
+                * adequate time to avoid reading signal jitter as additional
+                * button presses.  The sleep happens with the mutex held.
+                */
+               msleep(500);
+               rtc->write(rtc, RTC_EXT_CTRL_4B,
+                          (rtc->read(rtc, RTC_EXT_CTRL_4B) |
+                           RTC_CTRL_4B_KSE));
+
+               /* Call the platform pre-poweroff function. Else, shutdown. */
+               if (rtc->prepare_poweroff != NULL)
+                       rtc->prepare_poweroff();
+               else
+                       ds1685_rtc_poweroff(pdev);
+       }
+
+       /*
+        * Check for a wake-up interrupt.  With Vcc applied, this is
+        * essentially a second alarm interrupt, except it takes into
+        * account the 'date' register in bank1 in addition to the
+        * standard three alarm registers.
+        */
+       if ((ctrl4b & RTC_CTRL_4B_WIE) && (ctrl4a & RTC_CTRL_4A_WF)) {
+               rtc->write(rtc, RTC_EXT_CTRL_4A,
+                          (ctrl4a & ~(RTC_CTRL_4A_WF)));
+
+               /* Call the platform wake_alarm function if defined. */
+               if (rtc->wake_alarm != NULL)
+                       rtc->wake_alarm();
+               else
+                       dev_warn(&pdev->dev,
+                                "Wake Alarm IRQ just occurred!\n");
+       }
+
+       /*
+        * Check for a ram-clear interrupt.  This happens if RIE=1 and RF=0
+        * when RCE=1 in 4B.  This clears all NVRAM bytes in bank0 by setting
+        * each byte to a logic 1.  This has no effect on any extended
+        * NV-SRAM that might be present, nor on the time/calendar/alarm
+        * registers.  After a ram-clear is completed, there is a minimum
+        * recovery time of ~150ms in which all reads/writes are locked out.
+        * NOTE: A ram-clear can still occur if RCE=1 and RIE=0.  We cannot
+        * catch this scenario.
+        */
+       if ((ctrl4b & RTC_CTRL_4B_RIE) && (ctrl4a & RTC_CTRL_4A_RF)) {
+               rtc->write(rtc, RTC_EXT_CTRL_4A,
+                          (ctrl4a & ~(RTC_CTRL_4A_RF)));
+               msleep(150);
+
+               /* Call the platform post_ram_clear function if defined. */
+               if (rtc->post_ram_clear != NULL)
+                       rtc->post_ram_clear();
+               else
+                       dev_warn(&pdev->dev,
+                                "RAM-Clear IRQ just occurred!\n");
+       }
+       ds1685_rtc_switch_to_bank0(rtc);
+
+       mutex_unlock(rtc_mutex);
+}
+/* ----------------------------------------------------------------------- */
+
+
+/* ----------------------------------------------------------------------- */
+/* ProcFS interface */
+
+#ifdef CONFIG_PROC_FS
+#define NUM_REGS       6       /* Num of control registers. */
+#define NUM_BITS       8       /* Num bits per register. */
+#define NUM_SPACES     4       /* Num spaces between each bit. */
+
+/*
+ * Periodic Interrupt Rates.
+ *
+ * Human-readable strings for the /proc output, one per rate-select value.
+ * The table is indexed with (ctrla & RTC_CTRL_A_RS_MASK) in
+ * ds1685_rtc_proc; index 0 means no periodic interrupt.
+ */
+static const char *ds1685_rtc_pirq_rate[16] = {
+       "none", "3.90625ms", "7.8125ms", "0.122070ms", "0.244141ms",
+       "0.488281ms", "0.9765625ms", "1.953125ms", "3.90625ms", "7.8125ms",
+       "15.625ms", "31.25ms", "62.5ms", "125ms", "250ms", "500ms"
+};
+
+/*
+ * Square-Wave Output Frequencies.
+ *
+ * Human-readable strings for the /proc output, one per rate-select value.
+ * The table is indexed with (ctrla & RTC_CTRL_A_RS_MASK) in
+ * ds1685_rtc_proc; index 0 means no square-wave output.
+ */
+static const char *ds1685_rtc_sqw_freq[16] = {
+       "none", "256Hz", "128Hz", "8192Hz", "4096Hz", "2048Hz", "1024Hz",
+       "512Hz", "256Hz", "128Hz", "64Hz", "32Hz", "16Hz", "8Hz", "4Hz", "2Hz"
+};
+
+#ifdef CONFIG_RTC_DS1685_PROC_REGS
+/**
+ * ds1685_rtc_print_regs - render a register byte as spaced-out binary digits.
+ * @hex: hex byte to convert into binary bits.
+ * @dest: destination char array; must hold at least
+ *        NUM_BITS * (NUM_SPACES + 1) + 1 chars.
+ *
+ * Writes the bits of @hex into @dest, most significant bit first.  Each
+ * bit becomes a '0' or '1' character followed by NUM_SPACES spaces, and
+ * the result is NUL-terminated.  It only works on 1-byte values (8 bits).
+ *
+ * Returns @dest.
+ */
+static char*
+ds1685_rtc_print_regs(u8 hex, char *dest)
+{
+       u32 bit, pad;
+
+       for (bit = 0; bit < NUM_BITS; bit++) {
+               /* Each bit owns one cell: the digit plus its padding. */
+               char *cell = dest + (bit * (NUM_SPACES + 1));
+
+               cell[0] = (hex & (0x80 >> bit)) ? '1' : '0';
+               for (pad = 1; pad <= NUM_SPACES; pad++)
+                       cell[pad] = ' ';
+       }
+       dest[NUM_BITS * (NUM_SPACES + 1)] = '\0';
+
+       return dest;
+}
+#endif
+
+/**
+ * ds1685_rtc_proc - procfs access function.
+ * @dev: pointer to device structure.
+ * @seq: pointer to seq_file structure.
+ *
+ * Reads the serial number and control registers A-D, 4A and 4B while
+ * bank 1 is selected, then prints a summary to @seq.  When
+ * CONFIG_RTC_DS1685_PROC_REGS is set, a per-bit dump of every control
+ * register is appended.  Always returns 0.
+ */
+static int
+ds1685_rtc_proc(struct device *dev, struct seq_file *seq)
+{
+       struct platform_device *pdev = to_platform_device(dev);
+       struct ds1685_priv *rtc = platform_get_drvdata(pdev);
+       u8 ctrla, ctrlb, ctrlc, ctrld, ctrl4a, ctrl4b, ssn[8];
+       /* Only ever points at string literals; every switch arm sets it. */
+       const char *model;
+#ifdef CONFIG_RTC_DS1685_PROC_REGS
+       char bits[NUM_REGS][(NUM_BITS * NUM_SPACES) + NUM_BITS + 1];
+#endif
+
+       /* Read all the relevant data from the control registers. */
+       ds1685_rtc_switch_to_bank1(rtc);
+       ds1685_rtc_get_ssn(rtc, ssn);
+       ctrla = rtc->read(rtc, RTC_CTRL_A);
+       ctrlb = rtc->read(rtc, RTC_CTRL_B);
+       ctrlc = rtc->read(rtc, RTC_CTRL_C);
+       ctrld = rtc->read(rtc, RTC_CTRL_D);
+       ctrl4a = rtc->read(rtc, RTC_EXT_CTRL_4A);
+       ctrl4b = rtc->read(rtc, RTC_EXT_CTRL_4B);
+       ds1685_rtc_switch_to_bank0(rtc);
+
+       /* Determine the RTC model from the first serial number byte. */
+       switch (ssn[0]) {
+       case RTC_MODEL_DS1685:
+               model = "DS1685/DS1687";
+               break;
+       case RTC_MODEL_DS1689:
+               model = "DS1689/DS1693";
+               break;
+       case RTC_MODEL_DS17285:
+               model = "DS17285/DS17287";
+               break;
+       case RTC_MODEL_DS17485:
+               model = "DS17485/DS17487";
+               break;
+       case RTC_MODEL_DS17885:
+               model = "DS17885/DS17887";
+               break;
+       default:
+               model = "Unknown";
+               break;
+       }
+
+       /* Print out the information. */
+       seq_printf(seq,
+          "Model\t\t: %s\n"
+          "Oscillator\t: %s\n"
+          "12/24hr\t\t: %s\n"
+          "DST\t\t: %s\n"
+          "Data mode\t: %s\n"
+          "Battery\t\t: %s\n"
+          "Aux batt\t: %s\n"
+          "Update IRQ\t: %s\n"
+          "Periodic IRQ\t: %s\n"
+          "Periodic Rate\t: %s\n"
+          "SQW Freq\t: %s\n"
+#ifdef CONFIG_RTC_DS1685_PROC_REGS
+          "Serial #\t: %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n"
+          "Register Status\t:\n"
+          "   Ctrl A\t: UIP  DV2  DV1  DV0  RS3  RS2  RS1  RS0\n"
+          "\t\t:  %s\n"
+          "   Ctrl B\t: SET  PIE  AIE  UIE  SQWE  DM  2412 DSE\n"
+          "\t\t:  %s\n"
+          "   Ctrl C\t: IRQF  PF   AF   UF  ---  ---  ---  ---\n"
+          "\t\t:  %s\n"
+          "   Ctrl D\t: VRT  ---  ---  ---  ---  ---  ---  ---\n"
+          "\t\t:  %s\n"
+#if !defined(CONFIG_RTC_DRV_DS1685) && !defined(CONFIG_RTC_DRV_DS1689)
+          "   Ctrl 4A\t: VRT2 INCR BME  ---  PAB   RF   WF   KF\n"
+#else
+          "   Ctrl 4A\t: VRT2 INCR ---  ---  PAB   RF   WF   KF\n"
+#endif
+          "\t\t:  %s\n"
+          "   Ctrl 4B\t: ABE  E32k  CS  RCE  PRS  RIE  WIE  KSE\n"
+          "\t\t:  %s\n",
+#else
+          "Serial #\t: %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n",
+#endif
+          model,
+          ((ctrla & RTC_CTRL_A_DV1) ? "enabled" : "disabled"),
+          ((ctrlb & RTC_CTRL_B_2412) ? "24-hour" : "12-hour"),
+          ((ctrlb & RTC_CTRL_B_DSE) ? "enabled" : "disabled"),
+          ((ctrlb & RTC_CTRL_B_DM) ? "binary" : "BCD"),
+          ((ctrld & RTC_CTRL_D_VRT) ? "ok" : "exhausted or n/a"),
+          ((ctrl4a & RTC_CTRL_4A_VRT2) ? "ok" : "exhausted or n/a"),
+          ((ctrlb & RTC_CTRL_B_UIE) ? "yes" : "no"),
+          ((ctrlb & RTC_CTRL_B_PIE) ? "yes" : "no"),
+          (!(ctrl4b & RTC_CTRL_4B_E32K) ?
+           ds1685_rtc_pirq_rate[(ctrla & RTC_CTRL_A_RS_MASK)] : "none"),
+          (!((ctrl4b & RTC_CTRL_4B_E32K)) ?
+           ds1685_rtc_sqw_freq[(ctrla & RTC_CTRL_A_RS_MASK)] : "32768Hz"),
+#ifdef CONFIG_RTC_DS1685_PROC_REGS
+          ssn[0], ssn[1], ssn[2], ssn[3], ssn[4], ssn[5], ssn[6], ssn[7],
+          ds1685_rtc_print_regs(ctrla, bits[0]),
+          ds1685_rtc_print_regs(ctrlb, bits[1]),
+          ds1685_rtc_print_regs(ctrlc, bits[2]),
+          ds1685_rtc_print_regs(ctrld, bits[3]),
+          ds1685_rtc_print_regs(ctrl4a, bits[4]),
+          ds1685_rtc_print_regs(ctrl4b, bits[5]));
+#else
+          ssn[0], ssn[1], ssn[2], ssn[3], ssn[4], ssn[5], ssn[6], ssn[7]);
+#endif
+       return 0;
+}
+#else
+#define ds1685_rtc_proc NULL
+#endif /* CONFIG_PROC_FS */
+/* ----------------------------------------------------------------------- */
+
+
+/* ----------------------------------------------------------------------- */
+/* RTC Class operations */
+
+static const struct rtc_class_ops
+ds1685_rtc_ops = {
+       /* Time and alarm handlers first, procfs hook last. */
+       .read_time = ds1685_rtc_read_time,
+       .set_time = ds1685_rtc_set_time,
+       .read_alarm = ds1685_rtc_read_alarm,
+       .set_alarm = ds1685_rtc_set_alarm,
+       .alarm_irq_enable = ds1685_rtc_alarm_irq_enable,
+       .proc = ds1685_rtc_proc,
+};
+/* ----------------------------------------------------------------------- */
+
+
+/* ----------------------------------------------------------------------- */
+/* SysFS interface */
+
+#ifdef CONFIG_SYSFS
+/**
+ * ds1685_rtc_sysfs_nvram_read - reads rtc nvram via sysfs.
+ * @filp: pointer to file structure.
+ * @kobj: pointer to kobject structure.
+ * @bin_attr: pointer to bin_attribute structure.
+ * @buf: pointer to char array to hold the output.
+ * @pos: current file position pointer.
+ * @size: size of the data to read.
+ *
+ * Copies up to @size bytes starting at offset @pos into @buf: first from
+ * the bank 0 registers and then, unless built for the DS1689, from the
+ * bank 1 extended RAM through its address/data port.  Runs with rtc->lock
+ * held and interrupts saved.  Returns the total number of bytes stored in
+ * @buf across both banks.
+ */
+static ssize_t
+ds1685_rtc_sysfs_nvram_read(struct file *filp, struct kobject *kobj,
+                           struct bin_attribute *bin_attr, char *buf,
+                           loff_t pos, size_t size)
+{
+       struct platform_device *pdev =
+               to_platform_device(container_of(kobj, struct device, kobj));
+       struct ds1685_priv *rtc = platform_get_drvdata(pdev);
+       ssize_t count;
+       ssize_t total;
+       unsigned long flags = 0;
+
+       spin_lock_irqsave(&rtc->lock, flags);
+       ds1685_rtc_switch_to_bank0(rtc);
+
+       /*
+        * Read NVRAM in time and bank0 registers.
+        *
+        * NOTE(review): the time/bank0 split is chosen by the per-call
+        * counter, not by @pos -- confirm this is intended for reads that
+        * start at a non-zero offset.
+        */
+       for (count = 0; size > 0 && pos < NVRAM_TOTAL_SZ_BANK0;
+            count++, size--) {
+               if (count < NVRAM_SZ_TIME)
+                       *buf++ = rtc->read(rtc, (NVRAM_TIME_BASE + pos++));
+               else
+                       *buf++ = rtc->read(rtc, (NVRAM_BANK0_BASE + pos++));
+       }
+       total = count;
+
+#ifndef CONFIG_RTC_DRV_DS1689
+       if (size > 0) {
+               ds1685_rtc_switch_to_bank1(rtc);
+
+#ifndef CONFIG_RTC_DRV_DS1685
+               /* Enable burst-mode on DS17x85/DS17x87 */
+               rtc->write(rtc, RTC_EXT_CTRL_4A,
+                          (rtc->read(rtc, RTC_EXT_CTRL_4A) |
+                           RTC_CTRL_4A_BME));
+
+               /* We need one write to RTC_BANK1_RAM_ADDR_LSB to start
+                * reading with burst-mode */
+               rtc->write(rtc, RTC_BANK1_RAM_ADDR_LSB,
+                          (pos - NVRAM_TOTAL_SZ_BANK0));
+#endif
+
+               /* Read NVRAM in bank1 registers. */
+               for (count = 0; size > 0 && pos < NVRAM_TOTAL_SZ;
+                    count++, size--) {
+#ifdef CONFIG_RTC_DRV_DS1685
+                       /* DS1685/DS1687 has to write to RTC_BANK1_RAM_ADDR
+                        * before each read. */
+                       rtc->write(rtc, RTC_BANK1_RAM_ADDR,
+                                  (pos - NVRAM_TOTAL_SZ_BANK0));
+#endif
+                       *buf++ = rtc->read(rtc, RTC_BANK1_RAM_DATA_PORT);
+                       pos++;
+               }
+               /* Bytes from bank 1 come on top of those from bank 0. */
+               total += count;
+
+#ifndef CONFIG_RTC_DRV_DS1685
+               /* Disable burst-mode on DS17x85/DS17x87 */
+               rtc->write(rtc, RTC_EXT_CTRL_4A,
+                          (rtc->read(rtc, RTC_EXT_CTRL_4A) &
+                           ~(RTC_CTRL_4A_BME)));
+#endif
+               ds1685_rtc_switch_to_bank0(rtc);
+       }
+#endif /* !CONFIG_RTC_DRV_DS1689 */
+       spin_unlock_irqrestore(&rtc->lock, flags);
+
+       /*
+        * Report every byte placed in @buf.  Returning only the bank 1
+        * count would make a read that crosses the bank boundary
+        * under-report what was copied.
+        */
+       return total;
+}
+
+/**
+ * ds1685_rtc_sysfs_nvram_write - writes rtc nvram via sysfs.
+ * @filp: pointer to file structure.
+ * @kobj: pointer to kobject structure.
+ * @bin_attr: pointer to bin_attribute structure.
+ * @buf: pointer to char array to hold the input.
+ * @pos: current file position pointer.
+ * @size: size of the data to write.
+ *
+ * Stores up to @size bytes from @buf starting at offset @pos: first into
+ * the bank 0 registers and then, unless built for the DS1689, into the
+ * bank 1 extended RAM through its address/data port.  Runs with rtc->lock
+ * held and interrupts saved.  Returns the total number of bytes consumed
+ * from @buf across both banks.
+ */
+static ssize_t
+ds1685_rtc_sysfs_nvram_write(struct file *filp, struct kobject *kobj,
+                            struct bin_attribute *bin_attr, char *buf,
+                            loff_t pos, size_t size)
+{
+       struct platform_device *pdev =
+               to_platform_device(container_of(kobj, struct device, kobj));
+       struct ds1685_priv *rtc = platform_get_drvdata(pdev);
+       ssize_t count;
+       ssize_t total;
+       unsigned long flags = 0;
+
+       spin_lock_irqsave(&rtc->lock, flags);
+       ds1685_rtc_switch_to_bank0(rtc);
+
+       /*
+        * Write NVRAM in time and bank0 registers.  Both branches must
+        * advance @pos, mirroring ds1685_rtc_sysfs_nvram_read(); otherwise
+        * every remaining byte lands on the same register.
+        *
+        * NOTE(review): the time/bank0 split is chosen by the per-call
+        * counter, not by @pos -- confirm this is intended for writes that
+        * start at a non-zero offset.
+        */
+       for (count = 0; size > 0 && pos < NVRAM_TOTAL_SZ_BANK0;
+            count++, size--) {
+               if (count < NVRAM_SZ_TIME)
+                       rtc->write(rtc, (NVRAM_TIME_BASE + pos++),
+                                  *buf++);
+               else
+                       rtc->write(rtc, (NVRAM_BANK0_BASE + pos++),
+                                  *buf++);
+       }
+       total = count;
+
+#ifndef CONFIG_RTC_DRV_DS1689
+       if (size > 0) {
+               ds1685_rtc_switch_to_bank1(rtc);
+
+#ifndef CONFIG_RTC_DRV_DS1685
+               /* Enable burst-mode on DS17x85/DS17x87 */
+               rtc->write(rtc, RTC_EXT_CTRL_4A,
+                          (rtc->read(rtc, RTC_EXT_CTRL_4A) |
+                           RTC_CTRL_4A_BME));
+
+               /* We need one write to RTC_BANK1_RAM_ADDR_LSB to start
+                * writing with burst-mode */
+               rtc->write(rtc, RTC_BANK1_RAM_ADDR_LSB,
+                          (pos - NVRAM_TOTAL_SZ_BANK0));
+#endif
+
+               /* Write NVRAM in bank1 registers. */
+               for (count = 0; size > 0 && pos < NVRAM_TOTAL_SZ;
+                    count++, size--) {
+#ifdef CONFIG_RTC_DRV_DS1685
+                       /* DS1685/DS1687 has to write to RTC_BANK1_RAM_ADDR
+                        * before each write. */
+                       rtc->write(rtc, RTC_BANK1_RAM_ADDR,
+                                  (pos - NVRAM_TOTAL_SZ_BANK0));
+#endif
+                       rtc->write(rtc, RTC_BANK1_RAM_DATA_PORT, *buf++);
+                       pos++;
+               }
+               /* Bytes written to bank 1 come on top of those in bank 0. */
+               total += count;
+
+#ifndef CONFIG_RTC_DRV_DS1685
+               /* Disable burst-mode on DS17x85/DS17x87 */
+               rtc->write(rtc, RTC_EXT_CTRL_4A,
+                          (rtc->read(rtc, RTC_EXT_CTRL_4A) &
+                           ~(RTC_CTRL_4A_BME)));
+#endif
+               ds1685_rtc_switch_to_bank0(rtc);
+       }
+#endif /* !CONFIG_RTC_DRV_DS1689 */
+       spin_unlock_irqrestore(&rtc->lock, flags);
+
+       return total;
+}
+
+/**
+ * struct ds1685_rtc_sysfs_nvram_attr - sysfs attributes for rtc nvram.
+ * @attr: nvram attributes (file name "nvram", mode S_IRUGO | S_IWUSR).
+ * @read: nvram read function.
+ * @write: nvram write function.
+ * @size: nvram total size (bank0 + extended).
+ *
+ * Binary sysfs file backed by ds1685_rtc_sysfs_nvram_read() and
+ * ds1685_rtc_sysfs_nvram_write().
+ */
+static struct bin_attribute
+ds1685_rtc_sysfs_nvram_attr = {
+       .attr = {
+               .name = "nvram",
+               .mode = S_IRUGO | S_IWUSR,
+       },
+       .read = ds1685_rtc_sysfs_nvram_read,
+       .write = ds1685_rtc_sysfs_nvram_write,
+       .size = NVRAM_TOTAL_SZ
+};
+
+/**
+ * ds1685_rtc_sysfs_battery_show - sysfs file for main battery status.
+ * @dev: pointer to device structure.
+ * @attr: pointer to device_attribute structure.
+ * @buf: pointer to char array to hold the output.
+ *
+ * Prints "ok" when the VRT bit of Control Register D is set and
+ * "not ok or N/A" otherwise.  Returns the number of bytes written to @buf.
+ */
+static ssize_t
+ds1685_rtc_sysfs_battery_show(struct device *dev,
+                             struct device_attribute *attr, char *buf)
+{
+       struct platform_device *pdev = to_platform_device(dev);
+       struct ds1685_priv *rtc = platform_get_drvdata(pdev);
+       u8 ctrld;
+
+       ctrld = rtc->read(rtc, RTC_CTRL_D);
+
+       /*
+        * Bound by the sysfs page rather than a hand-counted length: the
+        * longer message plus newline does not fit in 13 bytes, and
+        * scnprintf() returns what was actually stored.
+        */
+       return scnprintf(buf, PAGE_SIZE, "%s\n",
+                        (ctrld & RTC_CTRL_D_VRT) ? "ok" : "not ok or N/A");
+}
+static DEVICE_ATTR(battery, S_IRUGO, ds1685_rtc_sysfs_battery_show, NULL);
+
+/**
+ * ds1685_rtc_sysfs_auxbatt_show - sysfs file for aux battery status.
+ * @dev: pointer to device structure.
+ * @attr: pointer to device_attribute structure.
+ * @buf: pointer to char array to hold the output.
+ *
+ * Prints "ok" when the VRT2 bit of Extended Control Register 4A is set and
+ * "not ok or N/A" otherwise.  The register is read with bank 1 selected.
+ * Returns the number of bytes written to @buf.
+ */
+static ssize_t
+ds1685_rtc_sysfs_auxbatt_show(struct device *dev,
+                             struct device_attribute *attr, char *buf)
+{
+       struct platform_device *pdev = to_platform_device(dev);
+       struct ds1685_priv *rtc = platform_get_drvdata(pdev);
+       u8 ctrl4a;
+
+       ds1685_rtc_switch_to_bank1(rtc);
+       ctrl4a = rtc->read(rtc, RTC_EXT_CTRL_4A);
+       ds1685_rtc_switch_to_bank0(rtc);
+
+       /*
+        * Bound by the sysfs page rather than a hand-counted length: the
+        * longer message plus newline does not fit in 13 bytes, and
+        * scnprintf() returns what was actually stored.
+        */
+       return scnprintf(buf, PAGE_SIZE, "%s\n",
+                        (ctrl4a & RTC_CTRL_4A_VRT2) ? "ok" : "not ok or N/A");
+}
+static DEVICE_ATTR(auxbatt, S_IRUGO, ds1685_rtc_sysfs_auxbatt_show, NULL);
+
+/**
+ * ds1685_rtc_sysfs_serial_show - sysfs file for silicon serial number.
+ * @dev: pointer to device structure.
+ * @attr: pointer to device_attribute structure.
+ * @buf: pointer to char array to hold the output.
+ *
+ * Prints the eight serial number bytes as colon-separated two-digit hex
+ * values followed by a newline.  Returns the number of bytes written to
+ * @buf.
+ */
+static ssize_t
+ds1685_rtc_sysfs_serial_show(struct device *dev,
+                            struct device_attribute *attr, char *buf)
+{
+       struct platform_device *pdev = to_platform_device(dev);
+       struct ds1685_priv *rtc = platform_get_drvdata(pdev);
+       u8 ssn[8];
+
+       ds1685_rtc_switch_to_bank1(rtc);
+       ds1685_rtc_get_ssn(rtc, ssn);
+       ds1685_rtc_switch_to_bank0(rtc);
+
+       /*
+        * The formatted string is 24 characters plus the terminator, so a
+        * 24-byte limit would drop the trailing newline.
+        */
+       return scnprintf(buf, PAGE_SIZE,
+                        "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n",
+                        ssn[0], ssn[1], ssn[2], ssn[3], ssn[4], ssn[5],
+                        ssn[6], ssn[7]);
+}
+static DEVICE_ATTR(serial, S_IRUGO, ds1685_rtc_sysfs_serial_show, NULL);
+
+/**
+ * struct ds1685_rtc_sysfs_misc_attrs - list for misc RTC features.
+ *
+ * NULL-terminated list of the battery, auxbatt and serial attributes.
+ */
+static struct attribute*
+ds1685_rtc_sysfs_misc_attrs[] = {
+       &dev_attr_battery.attr,
+       &dev_attr_auxbatt.attr,
+       &dev_attr_serial.attr,
+       NULL,
+};
+
+/**
+ * struct ds1685_rtc_sysfs_misc_grp - attr group for misc RTC features.
+ *
+ * Groups the entries of ds1685_rtc_sysfs_misc_attrs under the name "misc".
+ */
+static const struct attribute_group
+ds1685_rtc_sysfs_misc_grp = {
+       .name = "misc",
+       .attrs = ds1685_rtc_sysfs_misc_attrs,
+};
+
+#ifdef CONFIG_RTC_DS1685_SYSFS_REGS
+/**
+ * struct ds1685_rtc_ctrl_regs - describes one control register bit.
+ * @name: char pointer for the bit name; matched against the sysfs
+ *        attribute name during lookup.
+ * @reg: control register the bit is in.
+ * @bit: mask selecting the bit in the register (it is ANDed/ORed with the
+ *       register value, not used as a shift count).
+ */
+struct ds1685_rtc_ctrl_regs {
+       const char *name;
+       const u8 reg;
+       const u8 bit;
+};
+
+/*
+ * Ctrl register bit lookup table.
+ *
+ * One entry per control register bit, keyed by the bit's sysfs attribute
+ * name.  The "bme" entry is only present when the driver is built for
+ * neither the DS1685 nor the DS1689.  The list ends with an entry whose
+ * name is NULL, which the lookup function uses as its stop condition.
+ */
+static const struct ds1685_rtc_ctrl_regs
+ds1685_ctrl_regs_table[] = {
+       { "uip",  RTC_CTRL_A,      RTC_CTRL_A_UIP   },
+       { "dv2",  RTC_CTRL_A,      RTC_CTRL_A_DV2   },
+       { "dv1",  RTC_CTRL_A,      RTC_CTRL_A_DV1   },
+       { "dv0",  RTC_CTRL_A,      RTC_CTRL_A_DV0   },
+       { "rs3",  RTC_CTRL_A,      RTC_CTRL_A_RS3   },
+       { "rs2",  RTC_CTRL_A,      RTC_CTRL_A_RS2   },
+       { "rs1",  RTC_CTRL_A,      RTC_CTRL_A_RS1   },
+       { "rs0",  RTC_CTRL_A,      RTC_CTRL_A_RS0   },
+       { "set",  RTC_CTRL_B,      RTC_CTRL_B_SET   },
+       { "pie",  RTC_CTRL_B,      RTC_CTRL_B_PIE   },
+       { "aie",  RTC_CTRL_B,      RTC_CTRL_B_AIE   },
+       { "uie",  RTC_CTRL_B,      RTC_CTRL_B_UIE   },
+       { "sqwe", RTC_CTRL_B,      RTC_CTRL_B_SQWE  },
+       { "dm",   RTC_CTRL_B,      RTC_CTRL_B_DM    },
+       { "2412", RTC_CTRL_B,      RTC_CTRL_B_2412  },
+       { "dse",  RTC_CTRL_B,      RTC_CTRL_B_DSE   },
+       { "irqf", RTC_CTRL_C,      RTC_CTRL_C_IRQF  },
+       { "pf",   RTC_CTRL_C,      RTC_CTRL_C_PF    },
+       { "af",   RTC_CTRL_C,      RTC_CTRL_C_AF    },
+       { "uf",   RTC_CTRL_C,      RTC_CTRL_C_UF    },
+       { "vrt",  RTC_CTRL_D,      RTC_CTRL_D_VRT   },
+       { "vrt2", RTC_EXT_CTRL_4A, RTC_CTRL_4A_VRT2 },
+       { "incr", RTC_EXT_CTRL_4A, RTC_CTRL_4A_INCR },
+       { "pab",  RTC_EXT_CTRL_4A, RTC_CTRL_4A_PAB  },
+       { "rf",   RTC_EXT_CTRL_4A, RTC_CTRL_4A_RF   },
+       { "wf",   RTC_EXT_CTRL_4A, RTC_CTRL_4A_WF   },
+       { "kf",   RTC_EXT_CTRL_4A, RTC_CTRL_4A_KF   },
+#if !defined(CONFIG_RTC_DRV_DS1685) && !defined(CONFIG_RTC_DRV_DS1689)
+       { "bme",  RTC_EXT_CTRL_4A, RTC_CTRL_4A_BME  },
+#endif
+       { "abe",  RTC_EXT_CTRL_4B, RTC_CTRL_4B_ABE  },
+       { "e32k", RTC_EXT_CTRL_4B, RTC_CTRL_4B_E32K },
+       { "cs",   RTC_EXT_CTRL_4B, RTC_CTRL_4B_CS   },
+       { "rce",  RTC_EXT_CTRL_4B, RTC_CTRL_4B_RCE  },
+       { "prs",  RTC_EXT_CTRL_4B, RTC_CTRL_4B_PRS  },
+       { "rie",  RTC_EXT_CTRL_4B, RTC_CTRL_4B_RIE  },
+       { "wie",  RTC_EXT_CTRL_4B, RTC_CTRL_4B_WIE  },
+       { "kse",  RTC_EXT_CTRL_4B, RTC_CTRL_4B_KSE  },
+       { NULL,   0,               0                },
+};
+
+/**
+ * ds1685_rtc_sysfs_ctrl_regs_lookup - ctrl register bit lookup function.
+ * @name: ctrl register bit to look up in ds1685_ctrl_regs_table.
+ *
+ * Walks the table until the terminating entry (NULL name) and returns the
+ * first entry whose name equals @name, or NULL when none matches.
+ */
+static const struct ds1685_rtc_ctrl_regs*
+ds1685_rtc_sysfs_ctrl_regs_lookup(const char *name)
+{
+       const struct ds1685_rtc_ctrl_regs *entry = ds1685_ctrl_regs_table;
+
+       while (entry->name) {
+               if (!strcmp(entry->name, name))
+                       return entry;
+               entry++;
+       }
+
+       return NULL;
+}
+
+/**
+ * ds1685_rtc_sysfs_ctrl_regs_show - reads a ctrl register bit via sysfs.
+ * @dev: pointer to device structure.
+ * @attr: pointer to device_attribute structure.
+ * @buf: pointer to char array to hold the output.
+ *
+ * Looks the bit up by the attribute's name, reads its register with
+ * bank 1 selected and prints "1" or "0" followed by a newline.  Returns
+ * the number of bytes written to @buf, or -EINVAL when the attribute name
+ * is not in the lookup table.
+ */
+static ssize_t
+ds1685_rtc_sysfs_ctrl_regs_show(struct device *dev,
+                               struct device_attribute *attr, char *buf)
+{
+       u8 tmp;
+       struct ds1685_priv *rtc = dev_get_drvdata(dev);
+       const struct ds1685_rtc_ctrl_regs *reg_info =
+               ds1685_rtc_sysfs_ctrl_regs_lookup(attr->attr.name);
+
+       /* Make sure we actually matched something. */
+       if (!reg_info)
+               return -EINVAL;
+
+       /* No spinlock during a read -- mutex is already held. */
+       ds1685_rtc_switch_to_bank1(rtc);
+       tmp = rtc->read(rtc, reg_info->reg) & reg_info->bit;
+       ds1685_rtc_switch_to_bank0(rtc);
+
+       /*
+        * A 2-byte limit leaves room for the digit only and loses the
+        * newline, while still reporting two bytes; bound by the sysfs
+        * page instead.
+        */
+       return scnprintf(buf, PAGE_SIZE, "%d\n", (tmp ? 1 : 0));
+}
+
+/**
+ * ds1685_rtc_sysfs_ctrl_regs_store - writes a ctrl register bit via sysfs.
+ * @dev: pointer to device structure.
+ * @attr: pointer to device_attribute structure.
+ * @buf: pointer to char array holding the input.
+ * @count: number of bytes written.
+ *
+ * Parses @buf as a decimal number, which must be 0 or 1, and sets or
+ * clears the bit named by the attribute with a read-modify-write of its
+ * register.  Returns @count on success, -EINVAL for unparsable input or an
+ * unknown attribute name, and -ERANGE for any value other than 0 or 1.
+ */
+static ssize_t
+ds1685_rtc_sysfs_ctrl_regs_store(struct device *dev,
+                                struct device_attribute *attr,
+                                const char *buf, size_t count)
+{
+       struct ds1685_priv *rtc = dev_get_drvdata(dev);
+       const struct ds1685_rtc_ctrl_regs *entry =
+               ds1685_rtc_sysfs_ctrl_regs_lookup(attr->attr.name);
+       unsigned long flags = 0;
+       long int requested = 0;
+       u8 old, updated;
+
+       /* Only decimal numbers are accepted ... */
+       if (kstrtol(buf, 10, &requested) < 0)
+               return -EINVAL;
+
+       /* ... and a single bit can only be 0 or 1. */
+       if (requested != 0 && requested != 1)
+               return -ERANGE;
+
+       /* The attribute name has to be present in the lookup table. */
+       if (!entry)
+               return -EINVAL;
+
+       /* Safe to spinlock during a write. */
+       ds1685_rtc_begin_ctrl_access(rtc, flags);
+       old = rtc->read(rtc, entry->reg);
+       if (requested)
+               updated = old | entry->bit;
+       else
+               updated = old & ~(entry->bit);
+       rtc->write(rtc, entry->reg, updated);
+       ds1685_rtc_end_ctrl_access(rtc, flags);
+
+       return count;
+}
+
+/**
+ * DS1685_RTC_SYSFS_CTRL_REG_RO - device_attribute for read-only register bit.
+ * @bit: bit to read.
+ *
+ * Declares a static device attribute named @bit with mode S_IRUGO, using
+ * ds1685_rtc_sysfs_ctrl_regs_show() as its show handler and no store
+ * handler.
+ */
+#define DS1685_RTC_SYSFS_CTRL_REG_RO(bit)                              \
+       static DEVICE_ATTR(bit, S_IRUGO,                                \
+       ds1685_rtc_sysfs_ctrl_regs_show, NULL)
+
+/**
+ * DS1685_RTC_SYSFS_CTRL_REG_RW - device_attribute for read-write register bit.
+ * @bit: bit to read or write.
+ *
+ * Declares a static device attribute named @bit with mode
+ * S_IRUGO | S_IWUSR, using ds1685_rtc_sysfs_ctrl_regs_show() and
+ * ds1685_rtc_sysfs_ctrl_regs_store() as its handlers.
+ */
+#define DS1685_RTC_SYSFS_CTRL_REG_RW(bit)                              \
+       static DEVICE_ATTR(bit, S_IRUGO | S_IWUSR,                      \
+       ds1685_rtc_sysfs_ctrl_regs_show,                                \
+       ds1685_rtc_sysfs_ctrl_regs_store)
+
+/*
+ * Control Register A bits.
+ *
+ * Declares one device attribute per bit (uip and dv0 read-only, the rest
+ * read-write), lists them in a NULL-terminated array and exposes them as
+ * the sysfs attribute group "ctrla".
+ */
+DS1685_RTC_SYSFS_CTRL_REG_RO(uip);
+DS1685_RTC_SYSFS_CTRL_REG_RW(dv2);
+DS1685_RTC_SYSFS_CTRL_REG_RW(dv1);
+DS1685_RTC_SYSFS_CTRL_REG_RO(dv0);
+DS1685_RTC_SYSFS_CTRL_REG_RW(rs3);
+DS1685_RTC_SYSFS_CTRL_REG_RW(rs2);
+DS1685_RTC_SYSFS_CTRL_REG_RW(rs1);
+DS1685_RTC_SYSFS_CTRL_REG_RW(rs0);
+
+static struct attribute*
+ds1685_rtc_sysfs_ctrla_attrs[] = {
+       &dev_attr_uip.attr,
+       &dev_attr_dv2.attr,
+       &dev_attr_dv1.attr,
+       &dev_attr_dv0.attr,
+       &dev_attr_rs3.attr,
+       &dev_attr_rs2.attr,
+       &dev_attr_rs1.attr,
+       &dev_attr_rs0.attr,
+       NULL,
+};
+
+static const struct attribute_group
+ds1685_rtc_sysfs_ctrla_grp = {
+       .name = "ctrla",
+       .attrs = ds1685_rtc_sysfs_ctrla_attrs,
+};
+
+
+/*
+ * Control Register B bits.
+ *
+ * Declares one device attribute per bit (pie, aie, uie and sqwe
+ * read-write, the rest read-only), lists them in a NULL-terminated array
+ * and exposes them as the sysfs attribute group "ctrlb".
+ */
+DS1685_RTC_SYSFS_CTRL_REG_RO(set);
+DS1685_RTC_SYSFS_CTRL_REG_RW(pie);
+DS1685_RTC_SYSFS_CTRL_REG_RW(aie);
+DS1685_RTC_SYSFS_CTRL_REG_RW(uie);
+DS1685_RTC_SYSFS_CTRL_REG_RW(sqwe);
+DS1685_RTC_SYSFS_CTRL_REG_RO(dm);
+DS1685_RTC_SYSFS_CTRL_REG_RO(2412);
+DS1685_RTC_SYSFS_CTRL_REG_RO(dse);
+
+static struct attribute*
+ds1685_rtc_sysfs_ctrlb_attrs[] = {
+       &dev_attr_set.attr,
+       &dev_attr_pie.attr,
+       &dev_attr_aie.attr,
+       &dev_attr_uie.attr,
+       &dev_attr_sqwe.attr,
+       &dev_attr_dm.attr,
+       &dev_attr_2412.attr,
+       &dev_attr_dse.attr,
+       NULL,
+};
+
+static const struct attribute_group
+ds1685_rtc_sysfs_ctrlb_grp = {
+       .name = "ctrlb",
+       .attrs = ds1685_rtc_sysfs_ctrlb_attrs,
+};
+
+/*
+ * Control Register C bits.
+ *
+ * Reading Control C clears these bits!  Reading them individually can
+ * possibly cause an interrupt to be missed.  Use the /proc interface
+ * to see all the bits in this register simultaneously.
+ *
+ * All four attributes are read-only and are exposed as the sysfs
+ * attribute group "ctrlc".
+ */
+DS1685_RTC_SYSFS_CTRL_REG_RO(irqf);
+DS1685_RTC_SYSFS_CTRL_REG_RO(pf);
+DS1685_RTC_SYSFS_CTRL_REG_RO(af);
+DS1685_RTC_SYSFS_CTRL_REG_RO(uf);
+
+static struct attribute*
+ds1685_rtc_sysfs_ctrlc_attrs[] = {
+       &dev_attr_irqf.attr,
+       &dev_attr_pf.attr,
+       &dev_attr_af.attr,
+       &dev_attr_uf.attr,
+       NULL,
+};
+
+static const struct attribute_group
+ds1685_rtc_sysfs_ctrlc_grp = {
+       .name = "ctrlc",
+       .attrs = ds1685_rtc_sysfs_ctrlc_attrs,
+};
+
+/*
+ * Control Register D bits.
+ *
+ * Only the read-only vrt attribute is declared; it is exposed as the
+ * sysfs attribute group "ctrld".
+ */
+DS1685_RTC_SYSFS_CTRL_REG_RO(vrt);
+
+static struct attribute*
+ds1685_rtc_sysfs_ctrld_attrs[] = {
+       &dev_attr_vrt.attr,
+       NULL,
+};
+
+static const struct attribute_group
+ds1685_rtc_sysfs_ctrld_grp = {
+       .name = "ctrld",
+       .attrs = ds1685_rtc_sysfs_ctrld_attrs,
+};
+
+/*
+ * Control Register 4A bits.
+ *
+ * vrt2 and incr are read-only; pab, rf, wf and kf are read-write.  The
+ * read-only bme attribute exists only when the driver is built for
+ * neither the DS1685 nor the DS1689.  The attributes are exposed as the
+ * sysfs attribute group "ctrl4a".
+ */
+DS1685_RTC_SYSFS_CTRL_REG_RO(vrt2);
+DS1685_RTC_SYSFS_CTRL_REG_RO(incr);
+DS1685_RTC_SYSFS_CTRL_REG_RW(pab);
+DS1685_RTC_SYSFS_CTRL_REG_RW(rf);
+DS1685_RTC_SYSFS_CTRL_REG_RW(wf);
+DS1685_RTC_SYSFS_CTRL_REG_RW(kf);
+#if !defined(CONFIG_RTC_DRV_DS1685) && !defined(CONFIG_RTC_DRV_DS1689)
+DS1685_RTC_SYSFS_CTRL_REG_RO(bme);
+#endif
+
+static struct attribute*
+ds1685_rtc_sysfs_ctrl4a_attrs[] = {
+       &dev_attr_vrt2.attr,
+       &dev_attr_incr.attr,
+       &dev_attr_pab.attr,
+       &dev_attr_rf.attr,
+       &dev_attr_wf.attr,
+       &dev_attr_kf.attr,
+#if !defined(CONFIG_RTC_DRV_DS1685) && !defined(CONFIG_RTC_DRV_DS1689)
+       &dev_attr_bme.attr,
+#endif
+       NULL,
+};
+
+static const struct attribute_group
+ds1685_rtc_sysfs_ctrl4a_grp = {
+       .name = "ctrl4a",
+       .attrs = ds1685_rtc_sysfs_ctrl4a_attrs,
+};
+
+/*
+ * Control Register 4B bits.
+ *
+ * cs is read-only; every other bit is read-write.  The attributes are
+ * exposed as the sysfs attribute group "ctrl4b".
+ */
+DS1685_RTC_SYSFS_CTRL_REG_RW(abe);
+DS1685_RTC_SYSFS_CTRL_REG_RW(e32k);
+DS1685_RTC_SYSFS_CTRL_REG_RO(cs);
+DS1685_RTC_SYSFS_CTRL_REG_RW(rce);
+DS1685_RTC_SYSFS_CTRL_REG_RW(prs);
+DS1685_RTC_SYSFS_CTRL_REG_RW(rie);
+DS1685_RTC_SYSFS_CTRL_REG_RW(wie);
+DS1685_RTC_SYSFS_CTRL_REG_RW(kse);
+
+static struct attribute*
+ds1685_rtc_sysfs_ctrl4b_attrs[] = {
+       &dev_attr_abe.attr,
+       &dev_attr_e32k.attr,
+       &dev_attr_cs.attr,
+       &dev_attr_rce.attr,
+       &dev_attr_prs.attr,
+       &dev_attr_rie.attr,
+       &dev_attr_wie.attr,
+       &dev_attr_kse.attr,
+       NULL,
+};
+
+static const struct attribute_group
+ds1685_rtc_sysfs_ctrl4b_grp = {
+       .name = "ctrl4b",
+       .attrs = ds1685_rtc_sysfs_ctrl4b_attrs,
+};
+
+
+/**
+ * struct ds1685_rtc_time_regs - describes one time/date register.
+ * @name: char pointer for the register name; matched against the sysfs
+ *        attribute name during lookup.
+ * @reg: time/date register address.
+ * @mask: mask handed to the BCD/binary conversion helpers for this
+ *        register.
+ * @min: smallest value accepted when the register is written via sysfs.
+ * @max: largest value accepted when the register is written via sysfs.
+ */
+struct ds1685_rtc_time_regs {
+       const char *name;
+       const u8 reg;
+       const u8 mask;
+       const u8 min;
+       const u8 max;
+};
+
+/*
+ * Time/Date register lookup tables.
+ *
+ * Two parallel tables with the same names and registers in the same
+ * order: one carrying the BCD masks, one carrying the binary masks.  Each
+ * ends with an entry whose name is NULL, which the lookup function uses
+ * as its stop condition.
+ */
+static const struct ds1685_rtc_time_regs
+ds1685_time_regs_bcd_table[] = {
+       { "seconds",       RTC_SECS,       RTC_SECS_BCD_MASK,   0, 59 },
+       { "minutes",       RTC_MINS,       RTC_MINS_BCD_MASK,   0, 59 },
+       { "hours",         RTC_HRS,        RTC_HRS_24_BCD_MASK, 0, 23 },
+       { "wday",          RTC_WDAY,       RTC_WDAY_MASK,       1,  7 },
+       { "mday",          RTC_MDAY,       RTC_MDAY_BCD_MASK,   1, 31 },
+       { "month",         RTC_MONTH,      RTC_MONTH_BCD_MASK,  1, 12 },
+       { "year",          RTC_YEAR,       RTC_YEAR_BCD_MASK,   0, 99 },
+       { "century",       RTC_CENTURY,    RTC_CENTURY_MASK,    0, 99 },
+       { "alarm_seconds", RTC_SECS_ALARM, RTC_SECS_BCD_MASK,   0, 59 },
+       { "alarm_minutes", RTC_MINS_ALARM, RTC_MINS_BCD_MASK,   0, 59 },
+       { "alarm_hours",   RTC_HRS_ALARM,  RTC_HRS_24_BCD_MASK, 0, 23 },
+       { "alarm_mday",    RTC_MDAY_ALARM, RTC_MDAY_ALARM_MASK, 1, 31 },
+       { NULL,            0,              0,                   0,  0 },
+};
+
+static const struct ds1685_rtc_time_regs
+ds1685_time_regs_bin_table[] = {
+       { "seconds",       RTC_SECS,       RTC_SECS_BIN_MASK,   0x00, 0x3b },
+       { "minutes",       RTC_MINS,       RTC_MINS_BIN_MASK,   0x00, 0x3b },
+       { "hours",         RTC_HRS,        RTC_HRS_24_BIN_MASK, 0x00, 0x17 },
+       { "wday",          RTC_WDAY,       RTC_WDAY_MASK,       0x01, 0x07 },
+       { "mday",          RTC_MDAY,       RTC_MDAY_BIN_MASK,   0x01, 0x1f },
+       { "month",         RTC_MONTH,      RTC_MONTH_BIN_MASK,  0x01, 0x0c },
+       { "year",          RTC_YEAR,       RTC_YEAR_BIN_MASK,   0x00, 0x63 },
+       { "century",       RTC_CENTURY,    RTC_CENTURY_MASK,    0x00, 0x63 },
+       { "alarm_seconds", RTC_SECS_ALARM, RTC_SECS_BIN_MASK,   0x00, 0x3b },
+       { "alarm_minutes", RTC_MINS_ALARM, RTC_MINS_BIN_MASK,   0x00, 0x3b },
+       { "alarm_hours",   RTC_HRS_ALARM,  RTC_HRS_24_BIN_MASK, 0x00, 0x17 },
+       { "alarm_mday",    RTC_MDAY_ALARM, RTC_MDAY_ALARM_MASK, 0x01, 0x1f },
+       { NULL,            0,              0,                   0x00, 0x00 },
+};
+
+/**
+ * ds1685_rtc_sysfs_time_regs_lookup - time/date register lookup function.
+ * @name: register name to look up.
+ * @bcd_mode: search ds1685_time_regs_bcd_table when true,
+ *            ds1685_time_regs_bin_table when false.
+ *
+ * Walks the selected table until the terminating entry (NULL name) and
+ * returns the first entry whose name equals @name, or NULL when none
+ * matches.
+ */
+static const struct ds1685_rtc_time_regs*
+ds1685_rtc_sysfs_time_regs_lookup(const char *name, bool bcd_mode)
+{
+       const struct ds1685_rtc_time_regs *entry = bcd_mode ?
+               ds1685_time_regs_bcd_table : ds1685_time_regs_bin_table;
+
+       while (entry->name) {
+               if (!strcmp(entry->name, name))
+                       return entry;
+               entry++;
+       }
+
+       return NULL;
+}
+
+/**
+ * ds1685_rtc_sysfs_time_regs_show - reads a time/date register via sysfs.
+ * @dev: pointer to device structure.
+ * @attr: pointer to device_attribute structure.
+ * @buf: pointer to char array to hold the output.
+ *
+ * Looks the register up by the attribute's name in both the BCD and the
+ * binary table, reads it, runs it through ds1685_rtc_bcd2bin() and prints
+ * the result in decimal followed by a newline.  Returns the number of
+ * bytes written to @buf, or -EINVAL when the name is missing from either
+ * table.
+ */
+static ssize_t
+ds1685_rtc_sysfs_time_regs_show(struct device *dev,
+                               struct device_attribute *attr, char *buf)
+{
+       u8 tmp;
+       struct ds1685_priv *rtc = dev_get_drvdata(dev);
+       const struct ds1685_rtc_time_regs *bcd_reg_info =
+               ds1685_rtc_sysfs_time_regs_lookup(attr->attr.name, true);
+       const struct ds1685_rtc_time_regs *bin_reg_info =
+               ds1685_rtc_sysfs_time_regs_lookup(attr->attr.name, false);
+
+       /* Both entries are dereferenced below, so both must exist. */
+       if (!bcd_reg_info || !bin_reg_info)
+               return -EINVAL;
+
+       /* bcd_reg_info->reg == bin_reg_info->reg. */
+       ds1685_rtc_begin_data_access(rtc);
+       tmp = rtc->read(rtc, bcd_reg_info->reg);
+       ds1685_rtc_end_data_access(rtc);
+
+       tmp = ds1685_rtc_bcd2bin(rtc, tmp, bcd_reg_info->mask,
+                                bin_reg_info->mask);
+
+       /*
+        * A u8 can take three digits; bound by the sysfs page so that the
+        * newline is never cut off and the real length is returned.
+        */
+       return scnprintf(buf, PAGE_SIZE, "%d\n", tmp);
+}
+
+/**
+ * ds1685_rtc_sysfs_time_regs_store - writes a time/date register via sysfs.
+ * @dev: pointer to device structure.
+ * @attr: pointer to device_attribute structure.
+ * @buf: pointer to char array holding the input.
+ * @count: number of bytes written.
+ *
+ * Parses @buf as a decimal number, checks it against the min/max of the
+ * table entry matching the current data mode, runs it through
+ * ds1685_rtc_bin2bcd() and writes it to the register.  Returns @count on
+ * success, -EINVAL for unparsable input or a name missing from either
+ * table, and -ERANGE for an out-of-range value.
+ */
+static ssize_t
+ds1685_rtc_sysfs_time_regs_store(struct device *dev,
+                                struct device_attribute *attr,
+                                const char *buf, size_t count)
+{
+       long int val = 0;
+       struct ds1685_priv *rtc = dev_get_drvdata(dev);
+       const struct ds1685_rtc_time_regs *bcd_reg_info =
+               ds1685_rtc_sysfs_time_regs_lookup(attr->attr.name, true);
+       const struct ds1685_rtc_time_regs *bin_reg_info =
+               ds1685_rtc_sysfs_time_regs_lookup(attr->attr.name, false);
+
+       /* We only accept numbers. */
+       if (kstrtol(buf, 10, &val) < 0)
+               return -EINVAL;
+
+       /* Both entries are dereferenced below, so both must exist. */
+       if (!bcd_reg_info || !bin_reg_info)
+               return -EINVAL;
+
+       /* Check for a valid range. */
+       if (rtc->bcd_mode) {
+               if ((val < bcd_reg_info->min) || (val > bcd_reg_info->max))
+                       return -ERANGE;
+       } else {
+               if ((val < bin_reg_info->min) || (val > bin_reg_info->max))
+                       return -ERANGE;
+       }
+
+       val = ds1685_rtc_bin2bcd(rtc, val, bin_reg_info->mask,
+                                bcd_reg_info->mask);
+
+       /* bcd_reg_info->reg == bin_reg_info->reg. */
+       ds1685_rtc_begin_data_access(rtc);
+       rtc->write(rtc, bcd_reg_info->reg, val);
+       ds1685_rtc_end_data_access(rtc);
+
+       return count;
+}
+
+/**
+ * DS1685_RTC_SYSFS_TIME_REG_RW - device_attribute for a read-write time
+ * register.
+ * @reg: time/date register to read or write.
+ *
+ * Declares a static device attribute named @reg with mode
+ * S_IRUGO | S_IWUSR, using ds1685_rtc_sysfs_time_regs_show() and
+ * ds1685_rtc_sysfs_time_regs_store() as its handlers.
+ */
+#define DS1685_RTC_SYSFS_TIME_REG_RW(reg)                              \
+       static DEVICE_ATTR(reg, S_IRUGO | S_IWUSR,                      \
+       ds1685_rtc_sysfs_time_regs_show,                                \
+       ds1685_rtc_sysfs_time_regs_store)
+
+/*
+ * Time/Date Register bits.
+ *
+ * Declares one read-write device attribute per time/date and alarm
+ * register.  The eight clock attributes are exposed as the sysfs
+ * attribute group "datetime" and the four alarm attributes as the group
+ * "alarm".
+ */
+DS1685_RTC_SYSFS_TIME_REG_RW(seconds);
+DS1685_RTC_SYSFS_TIME_REG_RW(minutes);
+DS1685_RTC_SYSFS_TIME_REG_RW(hours);
+DS1685_RTC_SYSFS_TIME_REG_RW(wday);
+DS1685_RTC_SYSFS_TIME_REG_RW(mday);
+DS1685_RTC_SYSFS_TIME_REG_RW(month);
+DS1685_RTC_SYSFS_TIME_REG_RW(year);
+DS1685_RTC_SYSFS_TIME_REG_RW(century);
+DS1685_RTC_SYSFS_TIME_REG_RW(alarm_seconds);
+DS1685_RTC_SYSFS_TIME_REG_RW(alarm_minutes);
+DS1685_RTC_SYSFS_TIME_REG_RW(alarm_hours);
+DS1685_RTC_SYSFS_TIME_REG_RW(alarm_mday);
+
+static struct attribute*
+ds1685_rtc_sysfs_time_attrs[] = {
+       &dev_attr_seconds.attr,
+       &dev_attr_minutes.attr,
+       &dev_attr_hours.attr,
+       &dev_attr_wday.attr,
+       &dev_attr_mday.attr,
+       &dev_attr_month.attr,
+       &dev_attr_year.attr,
+       &dev_attr_century.attr,
+       NULL,
+};
+
+static const struct attribute_group
+ds1685_rtc_sysfs_time_grp = {
+       .name = "datetime",
+       .attrs = ds1685_rtc_sysfs_time_attrs,
+};
+
+static struct attribute*
+ds1685_rtc_sysfs_alarm_attrs[] = {
+       &dev_attr_alarm_seconds.attr,
+       &dev_attr_alarm_minutes.attr,
+       &dev_attr_alarm_hours.attr,
+       &dev_attr_alarm_mday.attr,
+       NULL,
+};
+
+static const struct attribute_group
+ds1685_rtc_sysfs_alarm_grp = {
+       .name = "alarm",
+       .attrs = ds1685_rtc_sysfs_alarm_attrs,
+};
+#endif /* CONFIG_RTC_DS1685_SYSFS_REGS */
+
+
+/**
+ * ds1685_rtc_sysfs_register - register sysfs files.
+ * @dev: pointer to device structure.
+ *
+ * Creates the nvram binary file and then every attribute group, in the
+ * order listed below.  If any step fails, everything created so far is
+ * removed again so that nothing is left behind, and the error code of the
+ * failing call is returned.  Returns 0 on success.
+ */
+static int
+ds1685_rtc_sysfs_register(struct device *dev)
+{
+       /* Groups in creation order; the list is NULL-terminated. */
+       static const struct attribute_group * const groups[] = {
+               &ds1685_rtc_sysfs_misc_grp,
+#ifdef CONFIG_RTC_DS1685_SYSFS_REGS
+               &ds1685_rtc_sysfs_ctrla_grp,
+               &ds1685_rtc_sysfs_ctrlb_grp,
+               &ds1685_rtc_sysfs_ctrlc_grp,
+               &ds1685_rtc_sysfs_ctrld_grp,
+               &ds1685_rtc_sysfs_ctrl4a_grp,
+               &ds1685_rtc_sysfs_ctrl4b_grp,
+               &ds1685_rtc_sysfs_time_grp,
+               &ds1685_rtc_sysfs_alarm_grp,
+#endif
+               NULL,
+       };
+       unsigned int i;
+       int ret;
+
+       sysfs_bin_attr_init(&ds1685_rtc_sysfs_nvram_attr);
+       ret = sysfs_create_bin_file(&dev->kobj, &ds1685_rtc_sysfs_nvram_attr);
+       if (ret)
+               return ret;
+
+       for (i = 0; groups[i]; i++) {
+               ret = sysfs_create_group(&dev->kobj, groups[i]);
+               if (ret)
+                       goto err_unwind;
+       }
+
+       return 0;
+
+err_unwind:
+       /* groups[i] failed; tear down groups[i - 1] .. groups[0]. */
+       while (i--)
+               sysfs_remove_group(&dev->kobj, groups[i]);
+       sysfs_remove_bin_file(&dev->kobj, &ds1685_rtc_sysfs_nvram_attr);
+
+       return ret;
+}
+
+/**
+ * ds1685_rtc_sysfs_unregister - unregister sysfs files.
+ * @dev: pointer to device structure.
+ *
+ * Removes the nvram binary file and the "misc" group, plus the per-register
+ * groups when CONFIG_RTC_DS1685_SYSFS_REGS is enabled.  Always returns 0.
+ */
+static int
+ds1685_rtc_sysfs_unregister(struct device *dev)
+{
+       struct kobject *kobj = &dev->kobj;
+
+       sysfs_remove_bin_file(kobj, &ds1685_rtc_sysfs_nvram_attr);
+       sysfs_remove_group(kobj, &ds1685_rtc_sysfs_misc_grp);
+
+#ifdef CONFIG_RTC_DS1685_SYSFS_REGS
+       /* Raw register views only exist when this option is enabled. */
+       sysfs_remove_group(kobj, &ds1685_rtc_sysfs_ctrla_grp);
+       sysfs_remove_group(kobj, &ds1685_rtc_sysfs_ctrlb_grp);
+       sysfs_remove_group(kobj, &ds1685_rtc_sysfs_ctrlc_grp);
+       sysfs_remove_group(kobj, &ds1685_rtc_sysfs_ctrld_grp);
+       sysfs_remove_group(kobj, &ds1685_rtc_sysfs_ctrl4a_grp);
+       sysfs_remove_group(kobj, &ds1685_rtc_sysfs_ctrl4b_grp);
+       sysfs_remove_group(kobj, &ds1685_rtc_sysfs_time_grp);
+       sysfs_remove_group(kobj, &ds1685_rtc_sysfs_alarm_grp);
+#endif
+
+       return 0;
+}
+#endif /* CONFIG_SYSFS */
+
+
+
+/* ----------------------------------------------------------------------- */
+/* Driver Probe/Removal */
+
+/**
+ * ds1685_rtc_probe - initializes rtc driver.
+ * @pdev: pointer to platform_device structure.
+ */
+static int
+ds1685_rtc_probe(struct platform_device *pdev)
+{
+       struct rtc_device *rtc_dev;
+       struct resource *res;
+       struct ds1685_priv *rtc;
+       struct ds1685_rtc_platform_data *pdata;
+       u8 ctrla, ctrlb, hours;
+       unsigned char am_pm;
+       int ret = 0;
+
+       /* Get the platform data. */
+       pdata = (struct ds1685_rtc_platform_data *) pdev->dev.platform_data;
+       if (!pdata)
+               return -ENODEV;
+
+       /* Allocate memory for the rtc device. */
+       rtc = devm_kzalloc(&pdev->dev, sizeof(*rtc), GFP_KERNEL);
+       if (!rtc)
+               return -ENOMEM;
+
+       /*
+        * Allocate/setup any IORESOURCE_MEM resources, if required.  Not all
+        * platforms put the RTC in an easy-access place.  Like the SGI Octane,
+        * which attaches the RTC to a "ByteBus", hooked to a SuperIO chip
+        * that sits behind the IOC3 PCI metadevice.
+        */
+       if (pdata->alloc_io_resources) {
+               /* Get the platform resources. */
+               res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+               if (!res)
+                       return -ENXIO;
+               rtc->size = resource_size(res);
+
+               /* Request a memory region. */
+               /* XXX: mmio-only for now. */
+               if (!devm_request_mem_region(&pdev->dev, res->start, rtc->size,
+                                            pdev->name))
+                       return -EBUSY;
+
+               /*
+                * Set the base address for the rtc, and ioremap its
+                * registers.
+                */
+               rtc->baseaddr = res->start;
+               rtc->regs = devm_ioremap(&pdev->dev, res->start, rtc->size);
+               if (!rtc->regs)
+                       return -ENOMEM;
+       }
+       rtc->alloc_io_resources = pdata->alloc_io_resources;
+
+       /* Get the register step size. */
+       if (pdata->regstep > 0)
+               rtc->regstep = pdata->regstep;
+       else
+               rtc->regstep = 1;
+
+       /* Platform read function, else default if mmio setup */
+       if (pdata->plat_read)
+               rtc->read = pdata->plat_read;
+       else
+               if (pdata->alloc_io_resources)
+                       rtc->read = ds1685_read;
+               else
+                       return -ENXIO;
+
+       /* Platform write function, else default if mmio setup */
+       if (pdata->plat_write)
+               rtc->write = pdata->plat_write;
+       else
+               if (pdata->alloc_io_resources)
+                       rtc->write = ds1685_write;
+               else
+                       return -ENXIO;
+
+       /* Platform pre-shutdown function, if defined. */
+       if (pdata->plat_prepare_poweroff)
+               rtc->prepare_poweroff = pdata->plat_prepare_poweroff;
+
+       /* Platform wake_alarm function, if defined. */
+       if (pdata->plat_wake_alarm)
+               rtc->wake_alarm = pdata->plat_wake_alarm;
+
+       /* Platform post_ram_clear function, if defined. */
+       if (pdata->plat_post_ram_clear)
+               rtc->post_ram_clear = pdata->plat_post_ram_clear;
+
+       /* Init the spinlock, workqueue, & set the driver data. */
+       spin_lock_init(&rtc->lock);
+       INIT_WORK(&rtc->work, ds1685_rtc_work_queue);
+       platform_set_drvdata(pdev, rtc);
+
+       /* Turn the oscillator on if is not already on (DV1 = 1). */
+       ctrla = rtc->read(rtc, RTC_CTRL_A);
+       if (!(ctrla & RTC_CTRL_A_DV1))
+               ctrla |= RTC_CTRL_A_DV1;
+
+       /* Enable the countdown chain (DV2 = 0) */
+       ctrla &= ~(RTC_CTRL_A_DV2);
+
+       /* Clear RS3-RS0 in Control A. */
+       ctrla &= ~(RTC_CTRL_A_RS_MASK);
+
+       /*
+        * All done with Control A.  Switch to Bank 1 for the remainder of
+        * the RTC setup so we have access to the extended functions.
+        */
+       ctrla |= RTC_CTRL_A_DV0;
+       rtc->write(rtc, RTC_CTRL_A, ctrla);
+
+       /* Default to 32768kHz output. */
+       rtc->write(rtc, RTC_EXT_CTRL_4B,
+                  (rtc->read(rtc, RTC_EXT_CTRL_4B) | RTC_CTRL_4B_E32K));
+
+       /* Set the SET bit in Control B so we can do some housekeeping. */
+       rtc->write(rtc, RTC_CTRL_B,
+                  (rtc->read(rtc, RTC_CTRL_B) | RTC_CTRL_B_SET));
+
+       /* Read Ext Ctrl 4A and check the INCR bit to avoid a lockout. */
+       while (rtc->read(rtc, RTC_EXT_CTRL_4A) & RTC_CTRL_4A_INCR)
+               cpu_relax();
+
+       /*
+        * If the platform supports BCD mode, then set DM=0 in Control B.
+        * Otherwise, set DM=1 for BIN mode.
+        */
+       ctrlb = rtc->read(rtc, RTC_CTRL_B);
+       if (pdata->bcd_mode)
+               ctrlb &= ~(RTC_CTRL_B_DM);
+       else
+               ctrlb |= RTC_CTRL_B_DM;
+       rtc->bcd_mode = pdata->bcd_mode;
+
+       /*
+        * Disable Daylight Savings Time (DSE = 0).
+        * The RTC has hardcoded timezone information that is rendered
+        * obselete.  We'll let the OS deal with DST settings instead.
+        */
+       if (ctrlb & RTC_CTRL_B_DSE)
+               ctrlb &= ~(RTC_CTRL_B_DSE);
+
+       /* Force 24-hour mode (2412 = 1). */
+       if (!(ctrlb & RTC_CTRL_B_2412)) {
+               /* Reinitialize the time hours. */
+               hours = rtc->read(rtc, RTC_HRS);
+               am_pm = hours & RTC_HRS_AMPM_MASK;
+               hours = ds1685_rtc_bcd2bin(rtc, hours, RTC_HRS_12_BCD_MASK,
+                                          RTC_HRS_12_BIN_MASK);
+               hours = ((hours == 12) ? 0 : ((am_pm) ? hours + 12 : hours));
+
+               /* Enable 24-hour mode. */
+               ctrlb |= RTC_CTRL_B_2412;
+
+               /* Write back to Control B, including DM & DSE bits. */
+               rtc->write(rtc, RTC_CTRL_B, ctrlb);
+
+               /* Write the time hours back. */
+               rtc->write(rtc, RTC_HRS,
+                          ds1685_rtc_bin2bcd(rtc, hours,
+                                             RTC_HRS_24_BIN_MASK,
+                                             RTC_HRS_24_BCD_MASK));
+
+               /* Reinitialize the alarm hours. */
+               hours = rtc->read(rtc, RTC_HRS_ALARM);
+               am_pm = hours & RTC_HRS_AMPM_MASK;
+               hours = ds1685_rtc_bcd2bin(rtc, hours, RTC_HRS_12_BCD_MASK,
+                                          RTC_HRS_12_BIN_MASK);
+               hours = ((hours == 12) ? 0 : ((am_pm) ? hours + 12 : hours));
+
+               /* Write the alarm hours back. */
+               rtc->write(rtc, RTC_HRS_ALARM,
+                          ds1685_rtc_bin2bcd(rtc, hours,
+                                             RTC_HRS_24_BIN_MASK,
+                                             RTC_HRS_24_BCD_MASK));
+       } else {
+               /* 24-hour mode is already set, so write Control B back. */
+               rtc->write(rtc, RTC_CTRL_B, ctrlb);
+       }
+
+       /* Unset the SET bit in Control B so the RTC can update. */
+       rtc->write(rtc, RTC_CTRL_B,
+                  (rtc->read(rtc, RTC_CTRL_B) & ~(RTC_CTRL_B_SET)));
+
+       /* Check the main battery. */
+       if (!(rtc->read(rtc, RTC_CTRL_D) & RTC_CTRL_D_VRT))
+               dev_warn(&pdev->dev,
+                        "Main battery is exhausted! RTC may be invalid!\n");
+
+       /* Check the auxillary battery.  It is optional. */
+       if (!(rtc->read(rtc, RTC_EXT_CTRL_4A) & RTC_CTRL_4A_VRT2))
+               dev_warn(&pdev->dev,
+                        "Aux battery is exhausted or not available.\n");
+
+       /* Read Ctrl B and clear PIE/AIE/UIE. */
+       rtc->write(rtc, RTC_CTRL_B,
+                  (rtc->read(rtc, RTC_CTRL_B) & ~(RTC_CTRL_B_PAU_MASK)));
+
+       /* Reading Ctrl C auto-clears PF/AF/UF. */
+       rtc->read(rtc, RTC_CTRL_C);
+
+       /* Read Ctrl 4B and clear RIE/WIE/KSE. */
+       rtc->write(rtc, RTC_EXT_CTRL_4B,
+                  (rtc->read(rtc, RTC_EXT_CTRL_4B) & ~(RTC_CTRL_4B_RWK_MASK)));
+
+       /* Clear RF/WF/KF in Ctrl 4A. */
+       rtc->write(rtc, RTC_EXT_CTRL_4A,
+                  (rtc->read(rtc, RTC_EXT_CTRL_4A) & ~(RTC_CTRL_4A_RWK_MASK)));
+
+       /*
+        * Re-enable KSE to handle power button events.  We do not enable
+        * WIE or RIE by default.
+        */
+       rtc->write(rtc, RTC_EXT_CTRL_4B,
+                  (rtc->read(rtc, RTC_EXT_CTRL_4B) | RTC_CTRL_4B_KSE));
+
+       /*
+        * Fetch the IRQ and setup the interrupt handler.
+        *
+        * Not all platforms have the IRQF pin tied to something.  If not, the
+        * RTC will still set the *IE / *F flags and raise IRQF in ctrlc, but
+        * there won't be an automatic way of notifying the kernel about it,
+        * unless ctrlc is explicitly polled.
+        */
+       if (!pdata->no_irq) {
+               ret = platform_get_irq(pdev, 0);
+               if (ret > 0) {
+                       rtc->irq_num = ret;
+
+                       /* Request an IRQ. */
+                       ret = devm_request_irq(&pdev->dev, rtc->irq_num,
+                                              ds1685_rtc_irq_handler,
+                                              IRQF_SHARED, pdev->name, pdev);
+
+                       /* Check to see if something came back. */
+                       if (unlikely(ret)) {
+                               dev_warn(&pdev->dev,
+                                        "RTC interrupt not available\n");
+                               rtc->irq_num = 0;
+                       }
+               } else
+                       return ret;
+       }
+       rtc->no_irq = pdata->no_irq;
+
+       /* Setup complete. */
+       ds1685_rtc_switch_to_bank0(rtc);
+
+       /* Register the device as an RTC. */
+       rtc_dev = rtc_device_register(pdev->name, &pdev->dev,
+                                     &ds1685_rtc_ops, THIS_MODULE);
+
+       /* Success? */
+       if (IS_ERR(rtc_dev))
+               return PTR_ERR(rtc_dev);
+
+       /* Maximum periodic rate is 8192Hz (0.122070ms). */
+       rtc_dev->max_user_freq = RTC_MAX_USER_FREQ;
+
+       /* See if the platform doesn't support UIE. */
+       if (pdata->uie_unsupported)
+               rtc_dev->uie_unsupported = 1;
+       rtc->uie_unsupported = pdata->uie_unsupported;
+
+       rtc->dev = rtc_dev;
+
+#ifdef CONFIG_SYSFS
+       ret = ds1685_rtc_sysfs_register(&pdev->dev);
+       if (ret)
+               rtc_device_unregister(rtc->dev);
+#endif
+
+       /* Done! */
+       return ret;
+}
+
+/**
+ * ds1685_rtc_remove - removes rtc driver.
+ * @pdev: pointer to platform_device structure.
+ *
+ * Unregisters the sysfs files and the RTC device, masks and clears the
+ * chip's interrupt sources, then waits for any queued work to finish.
+ * Always returns 0.
+ */
+static int
+ds1685_rtc_remove(struct platform_device *pdev)
+{
+       struct ds1685_priv *rtc = platform_get_drvdata(pdev);
+       u8 reg;
+
+#ifdef CONFIG_SYSFS
+       ds1685_rtc_sysfs_unregister(&pdev->dev);
+#endif
+
+       rtc_device_unregister(rtc->dev);
+
+       /* Control B: clear PIE/AIE/UIE. */
+       reg = rtc->read(rtc, RTC_CTRL_B);
+       reg &= ~(RTC_CTRL_B_PAU_MASK);
+       rtc->write(rtc, RTC_CTRL_B, reg);
+
+       /* Control C: the read itself auto-clears PF/AF/UF. */
+       rtc->read(rtc, RTC_CTRL_C);
+
+       /* Control 4B: clear RIE/WIE/KSE. */
+       reg = rtc->read(rtc, RTC_EXT_CTRL_4B);
+       reg &= ~(RTC_CTRL_4B_RWK_MASK);
+       rtc->write(rtc, RTC_EXT_CTRL_4B, reg);
+
+       /* Control 4A: RF/WF/KF have to be cleared by hand. */
+       reg = rtc->read(rtc, RTC_EXT_CTRL_4A);
+       reg &= ~(RTC_CTRL_4A_RWK_MASK);
+       rtc->write(rtc, RTC_EXT_CTRL_4A, reg);
+
+       cancel_work_sync(&rtc->work);
+
+       return 0;
+}
+
+/**
+ * ds1685_rtc_driver - rtc driver properties.
+ *
+ * .owner is deliberately not set here: platform_driver_register() fills it
+ * in with THIS_MODULE when the driver is registered.
+ */
+static struct platform_driver ds1685_rtc_driver = {
+       .driver         = {
+               .name   = "rtc-ds1685",
+       },
+       .probe          = ds1685_rtc_probe,
+       .remove         = ds1685_rtc_remove,
+};
+
+/*
+ * Module init/exit: register the platform driver on load and unregister it
+ * on unload.  module_platform_driver() generates exactly that boilerplate.
+ */
+module_platform_driver(ds1685_rtc_driver);
+/* ----------------------------------------------------------------------- */
+
+
+/* ----------------------------------------------------------------------- */
+/* Poweroff function */
+
+/**
+ * ds1685_rtc_poweroff - uses the RTC chip to power the system off.
+ * @pdev: pointer to platform_device structure.
+ *
+ * Never returns.  With a valid @pdev the function sets PAB in Control 4A to
+ * cut power and then spins; with a NULL @pdev it logs an emergency message
+ * and spins.  unreachable() on its own is only a compiler hint, so both
+ * paths loop explicitly in front of it.
+ */
+extern void __noreturn
+ds1685_rtc_poweroff(struct platform_device *pdev)
+{
+       u8 ctrla, ctrl4a, ctrl4b;
+       struct ds1685_priv *rtc;
+
+       /* Check for valid RTC data, else, spin forever. */
+       if (unlikely(!pdev)) {
+               pr_emerg("rtc-ds1685: platform device data not available, spinning forever ...\n");
+               while (1)
+                       cpu_relax();
+               unreachable();
+       } else {
+               /* Get the rtc data. */
+               rtc = platform_get_drvdata(pdev);
+
+               /*
+                * Disable our IRQ.  We're powering down, so we're not
+                * going to worry about cleaning up.  Most of that should
+                * have been taken care of by the shutdown scripts and this
+                * is the final function call.
+                */
+               if (!rtc->no_irq)
+                       disable_irq_nosync(rtc->irq_num);
+
+               /* Oscillator must be on and the countdown chain enabled. */
+               ctrla = rtc->read(rtc, RTC_CTRL_A);
+               ctrla |= RTC_CTRL_A_DV1;
+               ctrla &= ~(RTC_CTRL_A_DV2);
+               rtc->write(rtc, RTC_CTRL_A, ctrla);
+
+               /*
+                * Read Control 4A and check the status of the auxiliary
+                * battery.  This must be present and working (VRT2 = 1)
+                * for wakeup and kickstart functionality to be useful.
+                */
+               ds1685_rtc_switch_to_bank1(rtc);
+               ctrl4a = rtc->read(rtc, RTC_EXT_CTRL_4A);
+               if (ctrl4a & RTC_CTRL_4A_VRT2) {
+                       /* Clear all of the interrupt flags on Control 4A. */
+                       ctrl4a &= ~(RTC_CTRL_4A_RWK_MASK);
+                       rtc->write(rtc, RTC_EXT_CTRL_4A, ctrl4a);
+
+                       /*
+                        * The auxiliary battery is present and working.
+                        * Enable extended functions (ABE=1), enable
+                        * wake-up (WIE=1), and enable kickstart (KSE=1)
+                        * in Control 4B.
+                        */
+                       ctrl4b = rtc->read(rtc, RTC_EXT_CTRL_4B);
+                       ctrl4b |= (RTC_CTRL_4B_ABE | RTC_CTRL_4B_WIE |
+                                  RTC_CTRL_4B_KSE);
+                       rtc->write(rtc, RTC_EXT_CTRL_4B, ctrl4b);
+               }
+
+               /* Set PAB to 1 in Control 4A to power the system down. */
+               dev_warn(&pdev->dev, "Powerdown.\n");
+               msleep(20);
+               rtc->write(rtc, RTC_EXT_CTRL_4A,
+                          (ctrl4a | RTC_CTRL_4A_PAB));
+
+               /*
+                * Spin ... we do not switch back to bank0.  Loop for real
+                * so execution cannot run off the end of this function
+                * while waiting for the power to drop.
+                */
+               while (1)
+                       cpu_relax();
+               unreachable();
+       }
+}
+EXPORT_SYMBOL(ds1685_rtc_poweroff);
+/* ----------------------------------------------------------------------- */
+
+
+MODULE_AUTHOR("Joshua Kinard <kumba@gentoo.org>");
+MODULE_AUTHOR("Matthias Fuchs <matthias.fuchs@esd-electronics.com>");
+MODULE_DESCRIPTION("Dallas/Maxim DS1685/DS1687-series RTC driver");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(DRV_VERSION);
+MODULE_ALIAS("platform:rtc-ds1685");
index ee3ba7e6b45e148d3f03c9d671ddefbcbc02f282..f9b082784b9064a234aa313ec6836361d172d0d6 100644 (file)
@@ -275,7 +275,8 @@ static int isl12022_probe(struct i2c_client *client,
 
 #ifdef CONFIG_OF
 static const struct of_device_id isl12022_dt_match[] = {
-       { .compatible = "isl,isl12022" },
+       { .compatible = "isl,isl12022" }, /* for backward compat., don't use */
+       { .compatible = "isil,isl12022" },
        { },
 };
 #endif
index b8f862953f7f8331fca38d53949ff2b8bd5c6fbd..da818d3337cec5d2bd552ac046286763fbf4a1fa 100644 (file)
@@ -644,7 +644,8 @@ static SIMPLE_DEV_PM_OPS(isl12057_rtc_pm_ops, isl12057_rtc_suspend,
 
 #ifdef CONFIG_OF
 static const struct of_device_id isl12057_dt_match[] = {
-       { .compatible = "isl,isl12057" },
+       { .compatible = "isl,isl12057" }, /* for backward compat., don't use */
+       { .compatible = "isil,isl12057" },
        { },
 };
 #endif
index e969107ddb47e18727223a67b9ad5844ba65619e..6440e3b293ca53201714b57ea590a5f397368abf 100644 (file)
@@ -537,8 +537,8 @@ static const struct i2c_device_id isl29028_id[] = {
 MODULE_DEVICE_TABLE(i2c, isl29028_id);
 
 static const struct of_device_id isl29028_of_match[] = {
-       { .compatible = "isl,isl29028", },
-       { .compatible = "isil,isl29028", },/* deprecated, don't use */
+       { .compatible = "isl,isl29028", }, /* for backward compat., don't use */
+       { .compatible = "isil,isl29028", },
        { },
 };
 MODULE_DEVICE_TABLE(of, isl29028_of_match);
index a6bb530b1ec5457a24a7e738a5135c4e62c29a99..ec35851e5b71c4a19897bf97a70d5ec80fb67128 100644 (file)
@@ -13,13 +13,6 @@ if BLOCK
 source "fs/ext2/Kconfig"
 source "fs/ext3/Kconfig"
 source "fs/ext4/Kconfig"
-
-config FS_XIP
-# execute in place
-       bool
-       depends on EXT2_FS_XIP
-       default y
-
 source "fs/jbd/Kconfig"
 source "fs/jbd2/Kconfig"
 
@@ -40,6 +33,21 @@ source "fs/ocfs2/Kconfig"
 source "fs/btrfs/Kconfig"
 source "fs/nilfs2/Kconfig"
 
+config FS_DAX
+       bool "Direct Access (DAX) support"
+       depends on MMU
+       depends on !(ARM || MIPS || SPARC)
+       help
+         Direct Access (DAX) can be used on memory-backed block devices.
+         If the block device supports DAX and the filesystem supports DAX,
+         then you can avoid using the pagecache to buffer I/Os.  Turning
+         on this option will compile in support for DAX; you will need to
+         mount the filesystem using the -o dax option.
+
+         If you do not have a block device that is capable of using this,
+         or if unsure, say N.  Saying Y will increase the size of the kernel
+         by about 5kB.
+
 endif # BLOCK
 
 # Posix ACL utility routines
index bedff48e8fdca7547a00e1f5c0a4c6f9ac95e351..0f4635f7c49ca1912f1e4297bca816d22b121cce 100644 (file)
@@ -28,6 +28,7 @@ obj-$(CONFIG_SIGNALFD)                += signalfd.o
 obj-$(CONFIG_TIMERFD)          += timerfd.o
 obj-$(CONFIG_EVENTFD)          += eventfd.o
 obj-$(CONFIG_AIO)               += aio.o
+obj-$(CONFIG_FS_DAX)           += dax.o
 obj-$(CONFIG_FILE_LOCKING)      += locks.o
 obj-$(CONFIG_COMPAT)           += compat.o compat_ioctl.o
 obj-$(CONFIG_BINFMT_AOUT)      += binfmt_aout.o
diff --git a/fs/dax.c b/fs/dax.c
new file mode 100644 (file)
index 0000000..ed1619e
--- /dev/null
+++ b/fs/dax.c
@@ -0,0 +1,534 @@
+/*
+ * fs/dax.c - Direct Access filesystem code
+ * Copyright (c) 2013-2014 Intel Corporation
+ * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
+ * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/atomic.h>
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+#include <linux/fs.h>
+#include <linux/genhd.h>
+#include <linux/highmem.h>
+#include <linux/memcontrol.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/uio.h>
+#include <linux/vmstat.h>
+
+/*
+ * dax_clear_blocks - zero @size bytes of the inode's backing block device,
+ * starting at filesystem block @block.
+ *
+ * The range is mapped piecewise with bdev_direct_access() and each mapped
+ * piece is zeroed at most one page at a time.  Returns 0 on success or the
+ * negative value returned by bdev_direct_access().
+ */
+int dax_clear_blocks(struct inode *inode, sector_t block, long size)
+{
+       struct block_device *bdev = inode->i_sb->s_bdev;
+       sector_t sector = block << (inode->i_blkbits - 9);
+
+       might_sleep();
+       do {
+               void *kaddr;
+               unsigned long pfn;
+               long avail;
+
+               avail = bdev_direct_access(bdev, sector, &kaddr, &pfn, size);
+               if (avail < 0)
+                       return avail;
+               BUG_ON(size < avail);
+
+               while (avail > 0) {
+                       /* Never cross a page boundary in a single step. */
+                       unsigned chunk = PAGE_SIZE - offset_in_page(kaddr);
+
+                       if (chunk > avail)
+                               chunk = avail;
+
+                       if (chunk == PAGE_SIZE)
+                               clear_page(kaddr);
+                       else
+                               memset(kaddr, 0, chunk);
+
+                       kaddr += chunk;
+                       size -= chunk;
+                       avail -= chunk;
+
+                       /* Each step must cover whole 512-byte sectors. */
+                       BUG_ON(chunk & 511);
+                       sector += chunk >> 9;
+                       cond_resched();
+               }
+       } while (size);
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(dax_clear_blocks);
+
+/*
+ * Look up the directly addressable memory behind @bh.  The address is
+ * stored in *@addr; the return value is whatever bdev_direct_access()
+ * returns.  The pfn it also reports is not needed by callers and is dropped.
+ */
+static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits)
+{
+       sector_t start_sector = bh->b_blocknr << (blkbits - 9);
+       unsigned long unused_pfn;
+
+       return bdev_direct_access(bh->b_bdev, start_sector, addr, &unused_pfn,
+                                 bh->b_size);
+}
+
+/*
+ * Zero the parts of a @size-byte buffer at @addr that lie outside the range
+ * about to be used: the leading @first bytes, and everything from offset
+ * (@end - @pos + @first) up to @size.
+ */
+static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos,
+                       loff_t end)
+{
+       /* Offset within the buffer at which the I/O range stops. */
+       loff_t tail = end - pos + first;
+
+       if (first > 0)
+               memset(addr, 0, first);
+
+       if (tail < size)
+               memset(addr + tail, 0, size - tail);
+}
+
+/* True when @bh is mapped and is not flagged as unwritten. */
+static bool buffer_written(struct buffer_head *bh)
+{
+       if (!buffer_mapped(bh))
+               return false;
+
+       return !buffer_unwritten(bh);
+}
+
+/*
+ * ext4 returns from get_block without touching the buffer_head when it hits
+ * a hole, so b_size cannot be trusted in that case.  Callers therefore zero
+ * b_state before calling get_block; if any state bit is set afterwards,
+ * b_size is known to be valid.  This is unfortunate, because ext4 knows
+ * exactly how long the hole is and could save us from calling get_block
+ * repeatedly.
+ */
+static bool buffer_size_valid(struct buffer_head *bh)
+{
+       return !!bh->b_state;
+}
+
+/*
+ * dax_io - copy data between @iter and the storage behind [@start, @end).
+ * @rw: WRITE to copy from @iter to storage; anything else reads.
+ * @inode: file being accessed.
+ * @iter: user-supplied source or destination.
+ * @start: file offset of the first byte.
+ * @end: file offset just past the last byte (clamped to i_size for reads).
+ * @get_block: filesystem callback mapping file blocks to device blocks.
+ * @bh: scratch buffer_head owned by the caller.
+ *
+ * For reads, blocks that are unmapped or unwritten are treated as holes and
+ * zero-filled in @iter without touching the device.
+ *
+ * Returns the number of bytes transferred if any were.  Otherwise returns
+ * the error from get_block() or dax_get_addr(), or -EFAULT when the copy
+ * itself made no progress, or 0 when there was nothing to do.
+ */
+static ssize_t dax_io(int rw, struct inode *inode, struct iov_iter *iter,
+                       loff_t start, loff_t end, get_block_t get_block,
+                       struct buffer_head *bh)
+{
+       ssize_t retval = 0;
+       loff_t pos = start;
+       loff_t max = start;     /* end of the currently mapped chunk */
+       loff_t bh_max = start;  /* end of the extent described by bh */
+       void *addr;
+       bool hole = false;
+
+       if (rw != WRITE)
+               end = min(end, i_size_read(inode));
+
+       while (pos < end) {
+               unsigned len;
+               if (pos == max) {
+                       unsigned blkbits = inode->i_blkbits;
+                       sector_t block = pos >> blkbits;
+                       unsigned first = pos - (block << blkbits);
+                       long size;
+
+                       if (pos == bh_max) {
+                               /* Extent used up: ask for the next one. */
+                               bh->b_size = PAGE_ALIGN(end - pos);
+                               bh->b_state = 0;
+                               retval = get_block(inode, block, bh,
+                                                               rw == WRITE);
+                               if (retval)
+                                       break;
+                               if (!buffer_size_valid(bh))
+                                       bh->b_size = 1 << blkbits;
+                               bh_max = pos - first + bh->b_size;
+                       } else {
+                               /*
+                                * Still inside the extent: advance bh past
+                                * the part that has already been handled.
+                                */
+                               unsigned done = bh->b_size -
+                                               (bh_max - (pos - first));
+                               bh->b_blocknr += done >> blkbits;
+                               bh->b_size -= done;
+                       }
+
+                       hole = (rw != WRITE) && !buffer_written(bh);
+                       if (hole) {
+                               addr = NULL;
+                               size = bh->b_size - first;
+                       } else {
+                               retval = dax_get_addr(bh, &addr, blkbits);
+                               if (retval < 0)
+                                       break;
+                               if (buffer_unwritten(bh) || buffer_new(bh))
+                                       dax_new_buf(addr, retval, first, pos,
+                                                                       end);
+                               addr += first;
+                               size = retval - first;
+                       }
+                       max = min(pos + size, end);
+               }
+
+               if (rw == WRITE)
+                       len = copy_from_iter(addr, max - pos, iter);
+               else if (!hole)
+                       len = copy_to_iter(addr, max - pos, iter);
+               else
+                       len = iov_iter_zero(max - pos, iter);
+
+               if (!len) {
+                       /*
+                        * The copy made no progress.  retval may still hold
+                        * the positive byte count from dax_get_addr(), and
+                        * returning that would claim a transfer that never
+                        * happened, so report a fault instead.  It only
+                        * reaches the caller if nothing was copied at all.
+                        */
+                       retval = -EFAULT;
+                       break;
+               }
+
+               pos += len;
+               addr += len;
+       }
+
+       return (pos == start) ? retval : pos - start;
+}
+
+/**
+ * dax_do_io - Perform I/O to a DAX file
+ * @rw: READ to read or WRITE to write
+ * @iocb: The control block for this I/O
+ * @inode: The file which the I/O is directed at
+ * @iter: The addresses to do I/O from or to
+ * @pos: The file offset where the I/O starts
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ * @end_io: A filesystem callback for I/O completion
+ * @flags: See below
+ *
+ * This function uses the same locking scheme as do_blockdev_direct_IO:
+ * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
+ * caller for writes.  For reads, we take and release the i_mutex ourselves.
+ * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
+ * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
+ * is in progress.
+ *
+ * Returns the result of dax_io(), or the error from
+ * filemap_write_and_wait_range() if flushing the range before a locked read
+ * fails.
+ */
+ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode,
+                       struct iov_iter *iter, loff_t pos,
+                       get_block_t get_block, dio_iodone_t end_io, int flags)
+{
+       struct buffer_head bh;
+       ssize_t retval = -EINVAL;
+       loff_t end = pos + iov_iter_count(iter);
+
+       memset(&bh, 0, sizeof(bh));
+
+       if ((flags & DIO_LOCKING) && (rw == READ)) {
+               struct address_space *mapping = inode->i_mapping;
+               mutex_lock(&inode->i_mutex);
+               /* Write back the affected range before reading it directly. */
+               retval = filemap_write_and_wait_range(mapping, pos, end - 1);
+               if (retval) {
+                       mutex_unlock(&inode->i_mutex);
+                       goto out;
+               }
+       }
+
+       /* Protects against truncate */
+       atomic_inc(&inode->i_dio_count);
+
+       retval = dax_io(rw, inode, iter, pos, end, get_block, &bh);
+
+       /* Drop the i_mutex taken above for locked reads. */
+       if ((flags & DIO_LOCKING) && (rw == READ))
+               mutex_unlock(&inode->i_mutex);
+
+       /* Only notify the filesystem when some bytes were transferred. */
+       if ((retval > 0) && end_io)
+               end_io(iocb, pos, retval, bh.b_private);
+
+       /* Pairs with the atomic_inc() of i_dio_count above. */
+       inode_dio_done(inode);
+ out:
+       return retval;
+}
+EXPORT_SYMBOL_GPL(dax_do_io);
+
+/*
+ * A load has hit a hole in the file.  Allocating a real block for it would
+ * waste storage on sparse-file workloads, so a page cache page is used
+ * instead.  That page is kicked out of the page cache if it is ever written
+ * to; otherwise it simply falls out under memory pressure without ever
+ * having been dirtied.
+ *
+ * If the caller passes no @page, one is looked up or created (zero-filled)
+ * at vmf->pgoff.  Returns VM_FAULT_LOCKED with vmf->page set on success,
+ * VM_FAULT_OOM if no page could be obtained, or VM_FAULT_SIGBUS if the
+ * offset is now beyond i_size.
+ */
+static int dax_load_hole(struct address_space *mapping, struct page *page,
+                                                       struct vm_fault *vmf)
+{
+       struct inode *inode = mapping->host;
+       unsigned long nr_pages;
+
+       if (!page) {
+               page = find_or_create_page(mapping, vmf->pgoff,
+                                               GFP_KERNEL | __GFP_ZERO);
+               if (!page)
+                       return VM_FAULT_OOM;
+       }
+
+       /* Recheck i_size under page lock to avoid truncate race */
+       nr_pages = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+       if (vmf->pgoff < nr_pages) {
+               vmf->page = page;
+               return VM_FAULT_LOCKED;
+       }
+
+       unlock_page(page);
+       page_cache_release(page);
+       return VM_FAULT_SIGBUS;
+}
+
+/*
+ * Copy one page of data from the storage described by @bh into the page
+ * @to, which is mapped temporarily for the copy.  @vaddr is passed through
+ * to copy_user_page().  Returns 0 on success, or -EIO if the storage address
+ * cannot be obtained.
+ */
+static int copy_user_bh(struct page *to, struct buffer_head *bh,
+                       unsigned blkbits, unsigned long vaddr)
+{
+       void *src;
+       void *dst;
+       long mapped = dax_get_addr(bh, &src, blkbits);
+
+       if (mapped < 0)
+               return -EIO;
+
+       dst = kmap_atomic(to);
+       copy_user_page(dst, src, vaddr, to);
+       kunmap_atomic(dst);
+
+       return 0;
+}
+
+/*
+ * dax_insert_mapping - map the storage behind @bh at the faulting address.
+ *
+ * Under the i_mmap read lock: rechecks i_size, obtains the address and pfn
+ * for the block via bdev_direct_access(), zeroes the page if the buffer is
+ * new or unwritten, and inserts the pfn into @vma with vm_insert_mixed().
+ *
+ * bh->b_end_io, if set, is called after the lock is dropped whether or not
+ * the insertion succeeded.
+ *
+ * Returns the result of vm_insert_mixed(), -EIO if the fault is now beyond
+ * i_size or less than a page could be mapped, or the negative value from
+ * bdev_direct_access().
+ */
+static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
+                       struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+       struct address_space *mapping = inode->i_mapping;
+       sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
+       unsigned long vaddr = (unsigned long)vmf->virtual_address;
+       void *addr;
+       unsigned long pfn;
+       pgoff_t size;
+       int error;
+
+       i_mmap_lock_read(mapping);
+
+       /*
+        * Check truncate didn't happen while we were allocating a block.
+        * If it did, this block may or may not be still allocated to the
+        * file.  We can't tell the filesystem to free it because we can't
+        * take i_mutex here.  In the worst case, the file still has blocks
+        * allocated past the end of the file.
+        */
+       size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+       if (unlikely(vmf->pgoff >= size)) {
+               error = -EIO;
+               goto out;
+       }
+
+       error = bdev_direct_access(bh->b_bdev, sector, &addr, &pfn, bh->b_size);
+       if (error < 0)
+               goto out;
+       /* A whole page must be addressable to satisfy the fault. */
+       if (error < PAGE_SIZE) {
+               error = -EIO;
+               goto out;
+       }
+
+       /* Zero a new or unwritten block before it is mapped into @vma. */
+       if (buffer_unwritten(bh) || buffer_new(bh))
+               clear_page(addr);
+
+       error = vm_insert_mixed(vma, vaddr, pfn);
+
+ out:
+       i_mmap_unlock_read(mapping);
+
+       /* Completion callback runs on the success and the error path. */
+       if (bh->b_end_io)
+               bh->b_end_io(bh, 1);
+
+       return error;
+}
+
+static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+                       get_block_t get_block)
+{      /* Core of dax_fault(): resolves one fault and returns VM_FAULT_* bits. */
+       struct file *file = vma->vm_file;
+       struct address_space *mapping = file->f_mapping;
+       struct inode *inode = mapping->host;
+       struct page *page;
+       struct buffer_head bh;
+       unsigned long vaddr = (unsigned long)vmf->virtual_address;
+       unsigned blkbits = inode->i_blkbits;
+       sector_t block;
+       pgoff_t size;
+       int error;
+       int major = 0;  /* set to VM_FAULT_MAJOR when a write fault allocates a block below */
+
+       size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+       if (vmf->pgoff >= size)
+               return VM_FAULT_SIGBUS; /* faulting page index is at or beyond the page count of the file */
+
+       memset(&bh, 0, sizeof(bh));
+       block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits); /* page index -> filesystem block number */
+       bh.b_size = PAGE_SIZE;
+
+ repeat:
+       page = find_get_page(mapping, vmf->pgoff);
+       if (page) {
+               if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
+                       page_cache_release(page);
+                       return VM_FAULT_RETRY;
+               }
+               if (unlikely(page->mapping != mapping)) {       /* page no longer belongs to this mapping; look it up again */
+                       unlock_page(page);
+                       page_cache_release(page);
+                       goto repeat;
+               }
+               size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+               if (unlikely(vmf->pgoff >= size)) {
+                       /*
+                        * We have a struct page covering a hole in the file
+                        * from a read fault and we've raced with a truncate
+                        */
+                       error = -EIO;
+                       goto unlock_page;
+               }
+       }
+
+       error = get_block(inode, block, &bh, 0);        /* lookup only: create == 0 */
+       if (!error && (bh.b_size < PAGE_SIZE))
+               error = -EIO;           /* fs corruption? */
+       if (error)
+               goto unlock_page;
+
+       if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
+               if (vmf->flags & FAULT_FLAG_WRITE) {
+                       error = get_block(inode, block, &bh, 1);        /* write fault on a hole: ask for allocation (create == 1) */
+                       count_vm_event(PGMAJFAULT);
+                       mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+                       major = VM_FAULT_MAJOR;
+                       if (!error && (bh.b_size < PAGE_SIZE))
+                               error = -EIO;
+                       if (error)
+                               goto unlock_page;
+               } else {
+                       return dax_load_hole(mapping, page, vmf);       /* read fault on a hole */
+               }
+       }
+
+       if (vmf->cow_page) {
+               struct page *new_page = vmf->cow_page;
+               if (buffer_written(&bh))
+                       error = copy_user_bh(new_page, &bh, blkbits, vaddr);
+               else
+                       clear_user_highpage(new_page, vaddr);   /* no written data behind this offset: hand back zeroes */
+               if (error)
+                       goto unlock_page;
+               vmf->page = page;
+               if (!page) {
+                       i_mmap_lock_read(mapping);      /* NOTE(review): still held on the VM_FAULT_LOCKED return below; presumably dropped by the caller - confirm */
+                       /* Check we didn't race with truncate */
+                       size = (i_size_read(inode) + PAGE_SIZE - 1) >>
+                                                               PAGE_SHIFT;
+                       if (vmf->pgoff >= size) {
+                               i_mmap_unlock_read(mapping);
+                               error = -EIO;
+                               goto out;
+                       }
+               }
+               return VM_FAULT_LOCKED;
+       }
+
+       /* Check we didn't race with a read fault installing a new page */
+       if (!page && major)
+               page = find_lock_page(mapping, vmf->pgoff);
+
+       if (page) {     /* remove the page-cache page before the direct mapping is inserted */
+               unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
+                                                       PAGE_CACHE_SIZE, 0);
+               delete_from_page_cache(page);
+               unlock_page(page);
+               page_cache_release(page);
+       }
+
+       error = dax_insert_mapping(inode, &bh, vma, vmf);
+
+ out:   /* translate the errno in "error" into VM_FAULT_* bits, keeping the major-fault flag */
+       if (error == -ENOMEM)
+               return VM_FAULT_OOM | major;
+       /* -EBUSY is fine, somebody else faulted on the same PTE */
+       if ((error < 0) && (error != -EBUSY))
+               return VM_FAULT_SIGBUS | major;
+       return VM_FAULT_NOPAGE | major;
+
+ unlock_page:   /* error exit for paths that may still hold a locked, referenced page */
+       if (page) {
+               unlock_page(page);
+               page_cache_release(page);
+       }
+       goto out;
+}
+
+/**
+ * dax_fault - handle a page fault on a DAX file
+ * @vma: The virtual memory area where the fault occurred
+ * @vmf: The description of the fault
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ *
+ * When a page fault occurs, filesystems may call this helper in their
+ * fault handler for DAX files.
+ *
+ * For write faults (FAULT_FLAG_WRITE set in @vmf->flags) the work is
+ * bracketed by sb_start_pagefault() / sb_end_pagefault() on the file's
+ * superblock, and file_update_time() is called before the fault is handled.
+ *
+ * Return: the VM_FAULT_* value produced by do_dax_fault().
+ */
+int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+                       get_block_t get_block)
+{
+       int result;
+       struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+
+       if (vmf->flags & FAULT_FLAG_WRITE) {
+               sb_start_pagefault(sb);
+               file_update_time(vma->vm_file);
+       }
+       result = do_dax_fault(vma, vmf, get_block);
+       if (vmf->flags & FAULT_FLAG_WRITE)
+               sb_end_pagefault(sb);   /* pairs with sb_start_pagefault() above */
+
+       return result;
+}
+EXPORT_SYMBOL_GPL(dax_fault);
+
+/**
+ * dax_zero_page_range - zero a range within a page of a DAX file
+ * @inode: The file whose page is being partially zeroed
+ * @from: The file offset at which zeroing starts
+ * @length: The number of bytes to zero
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ *
+ * This function can be called by a filesystem when it is zeroing part of a
+ * page in a DAX file.  This is intended for hole-punch operations.  If
+ * you are truncating a file, the helper function dax_truncate_page() may be
+ * more convenient.
+ *
+ * We work in terms of PAGE_CACHE_SIZE here for commonality with
+ * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
+ * took care of disposing of the unnecessary blocks.  Even if the filesystem
+ * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
+ * since the file might be mmapped.
+ *
+ * The range described by @from and @length must not cross a page boundary;
+ * this is enforced with BUG_ON().  Nothing is written when @length is zero
+ * or when buffer_written() is false for the block returned by get_block().
+ *
+ * Return: 0 on success (including the cases where nothing is written), or
+ * the negative error returned by get_block() or dax_get_addr().
+ */
+int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
+                                                       get_block_t get_block)
+{
+       struct buffer_head bh;
+       pgoff_t index = from >> PAGE_CACHE_SHIFT;
+       unsigned offset = from & (PAGE_CACHE_SIZE-1);   /* byte offset of @from inside its page */
+       int err;
+
+       /* Block boundary? Nothing to do */
+       if (!length)
+               return 0;
+       BUG_ON((offset + length) > PAGE_CACHE_SIZE);
+
+       memset(&bh, 0, sizeof(bh));
+       bh.b_size = PAGE_CACHE_SIZE;
+       err = get_block(inode, index, &bh, 0);  /* NOTE(review): a page index is passed as the block number; presumably relies on block size == PAGE_CACHE_SIZE - confirm */
+       if (err < 0)
+               return err;
+       if (buffer_written(&bh)) {
+               void *addr;
+               err = dax_get_addr(&bh, &addr, inode->i_blkbits);
+               if (err < 0)
+                       return err;
+               memset(addr + offset, 0, length);
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(dax_zero_page_range);
+
+/**
+ * dax_truncate_page - handle a partial page being truncated in a DAX file
+ * @inode: The file being truncated
+ * @from: The file offset that is being truncated to
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ *
+ * Similar to block_truncate_page(), this function can be called by a
+ * filesystem when it is truncating a DAX file to handle the partial page.
+ *
+ * We work in terms of PAGE_CACHE_SIZE here for commonality with
+ * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
+ * took care of disposing of the unnecessary blocks.  Even if the filesystem
+ * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
+ * since the file might be mmapped.
+ *
+ * The zeroed range runs from @from up to the next PAGE_CACHE_SIZE boundary;
+ * when @from is already aligned the computed length is zero and
+ * dax_zero_page_range() returns without touching anything.
+ *
+ * Return: the result of dax_zero_page_range().
+ */
+int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
+{
+       unsigned length = PAGE_CACHE_ALIGN(from) - from;        /* bytes from @from to the end of its page */
+       return dax_zero_page_range(inode, from, length, get_block);
+}
+EXPORT_SYMBOL_GPL(dax_truncate_page);
index 6fc91df99ff8aa0957860b268d732bc3a0d86fe4..a198e94813fec42378c75c568b013f360c5ea1e2 100644 (file)
@@ -985,7 +985,6 @@ const struct address_space_operations exofs_aops = {
        .direct_IO      = exofs_direct_IO,
 
        /* With these NULL has special meaning or default is not exported */
-       .get_xip_mem    = NULL,
        .migratepage    = NULL,
        .launder_page   = NULL,
        .is_partially_uptodate = NULL,
index 14a6780fd034bfc1db2cf782218ce1c0b349059e..c634874e12d969fbd0b00ad8da745553168876e6 100644 (file)
@@ -42,14 +42,3 @@ config EXT2_FS_SECURITY
 
          If you are not using a security module that requires using
          extended attributes for file security labels, say N.
-
-config EXT2_FS_XIP
-       bool "Ext2 execute in place support"
-       depends on EXT2_FS && MMU
-       help
-         Execute in place can be used on memory-backed block devices. If you
-         enable this option, you can select to mount block devices which are
-         capable of this feature without using the page cache.
-
-         If you do not use a block device that is capable of using this,
-         or if unsure, say N.
index f42af45cfd88a08a65977a544ad482a9e2d8341c..445b0e996a122a8ea82705233fefb6bc2de5166b 100644 (file)
@@ -10,4 +10,3 @@ ext2-y := balloc.o dir.o file.o ialloc.o inode.o \
 ext2-$(CONFIG_EXT2_FS_XATTR)    += xattr.o xattr_user.o xattr_trusted.o
 ext2-$(CONFIG_EXT2_FS_POSIX_ACL) += acl.o
 ext2-$(CONFIG_EXT2_FS_SECURITY)         += xattr_security.o
-ext2-$(CONFIG_EXT2_FS_XIP)      += xip.o
index e4279ead4a05d1972a1791c2698728503859c081..678f9ab08c486c5b8384437709fda03a070f8746 100644 (file)
@@ -380,10 +380,15 @@ struct ext2_inode {
 #define EXT2_MOUNT_NO_UID32            0x000200  /* Disable 32-bit UIDs */
 #define EXT2_MOUNT_XATTR_USER          0x004000  /* Extended user attributes */
 #define EXT2_MOUNT_POSIX_ACL           0x008000  /* POSIX Access Control Lists */
-#define EXT2_MOUNT_XIP                 0x010000  /* Execute in place */
+#define EXT2_MOUNT_XIP                 0x010000  /* Obsolete, use DAX */
 #define EXT2_MOUNT_USRQUOTA            0x020000  /* user quota */
 #define EXT2_MOUNT_GRPQUOTA            0x040000  /* group quota */
 #define EXT2_MOUNT_RESERVATION         0x080000  /* Preallocation */
+#ifdef CONFIG_FS_DAX
+#define EXT2_MOUNT_DAX                 0x100000  /* Direct Access */
+#else
+#define EXT2_MOUNT_DAX                 0
+#endif
 
 
 #define clear_opt(o, opt)              o &= ~EXT2_MOUNT_##opt
@@ -788,11 +793,10 @@ extern int ext2_fsync(struct file *file, loff_t start, loff_t end,
                      int datasync);
 extern const struct inode_operations ext2_file_inode_operations;
 extern const struct file_operations ext2_file_operations;
-extern const struct file_operations ext2_xip_file_operations;
+extern const struct file_operations ext2_dax_file_operations;
 
 /* inode.c */
 extern const struct address_space_operations ext2_aops;
-extern const struct address_space_operations ext2_aops_xip;
 extern const struct address_space_operations ext2_nobh_aops;
 
 /* namei.c */
index 7c87b22a7228c4ce9ed3c915c64283e2aa72a328..e31701713516c7f24dee6847badf0e4098b947c8 100644 (file)
 #include "xattr.h"
 #include "acl.h"
 
+#ifdef CONFIG_FS_DAX
+static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{      /* forwards to dax_fault() with ext2_get_block as the block mapper */
+       return dax_fault(vma, vmf, ext2_get_block);
+}
+
+static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{      /* forwards to dax_mkwrite() with ext2_get_block as the block mapper */
+       return dax_mkwrite(vma, vmf, ext2_get_block);
+}
+
+static const struct vm_operations_struct ext2_dax_vm_ops = {
+       .fault          = ext2_dax_fault,
+       .page_mkwrite   = ext2_dax_mkwrite,
+};
+
+static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
+{      /* mmap for both file_operations tables: DAX inodes get ext2_dax_vm_ops, others generic_file_mmap() */
+       if (!IS_DAX(file_inode(file)))
+               return generic_file_mmap(file, vma);
+
+       file_accessed(file);
+       vma->vm_ops = &ext2_dax_vm_ops;
+       vma->vm_flags |= VM_MIXEDMAP;   /* presumably needed because the DAX fault path inserts pfns with vm_insert_mixed() - confirm */
+       return 0;
+}
+#else  /* !CONFIG_FS_DAX */
+#define ext2_file_mmap generic_file_mmap
+#endif /* CONFIG_FS_DAX */
+
 /*
  * Called when filp is released. This happens when all file descriptors
  * for a single struct file are closed. Note that different open() calls
@@ -70,7 +100,7 @@ const struct file_operations ext2_file_operations = {
 #ifdef CONFIG_COMPAT
        .compat_ioctl   = ext2_compat_ioctl,
 #endif
-       .mmap           = generic_file_mmap,
+       .mmap           = ext2_file_mmap,
        .open           = dquot_file_open,
        .release        = ext2_release_file,
        .fsync          = ext2_fsync,
@@ -78,16 +108,18 @@ const struct file_operations ext2_file_operations = {
        .splice_write   = iter_file_splice_write,
 };
 
-#ifdef CONFIG_EXT2_FS_XIP
-const struct file_operations ext2_xip_file_operations = {
+#ifdef CONFIG_FS_DAX
+const struct file_operations ext2_dax_file_operations = {
        .llseek         = generic_file_llseek,
-       .read           = xip_file_read,
-       .write          = xip_file_write,
+       .read           = new_sync_read,
+       .write          = new_sync_write,
+       .read_iter      = generic_file_read_iter,
+       .write_iter     = generic_file_write_iter,
        .unlocked_ioctl = ext2_ioctl,
 #ifdef CONFIG_COMPAT
        .compat_ioctl   = ext2_compat_ioctl,
 #endif
-       .mmap           = xip_file_mmap,
+       .mmap           = ext2_file_mmap,
        .open           = dquot_file_open,
        .release        = ext2_release_file,
        .fsync          = ext2_fsync,
index 36d35c36311d69a025c5b804e8d8597cbd9cb2b2..6434bc00012517a30ace1cb97f2160b0c48eea3a 100644 (file)
@@ -34,7 +34,6 @@
 #include <linux/aio.h>
 #include "ext2.h"
 #include "acl.h"
-#include "xip.h"
 #include "xattr.h"
 
 static int __ext2_write_inode(struct inode *inode, int do_sync);
@@ -731,12 +730,14 @@ static int ext2_get_blocks(struct inode *inode,
                goto cleanup;
        }
 
-       if (ext2_use_xip(inode->i_sb)) {
+       if (IS_DAX(inode)) {
                /*
-                * we need to clear the block
+                * block must be initialised before we put it in the tree
+                * so that it's not found by another thread before it's
+                * initialised
                 */
-               err = ext2_clear_xip_target (inode,
-                       le32_to_cpu(chain[depth-1].key));
+               err = dax_clear_blocks(inode, le32_to_cpu(chain[depth-1].key),
+                                               1 << inode->i_blkbits);
                if (err) {
                        mutex_unlock(&ei->truncate_mutex);
                        goto cleanup;
@@ -859,7 +860,12 @@ ext2_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
        size_t count = iov_iter_count(iter);
        ssize_t ret;
 
-       ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext2_get_block);
+       if (IS_DAX(inode))
+               ret = dax_do_io(rw, iocb, inode, iter, offset, ext2_get_block,
+                               NULL, DIO_LOCKING);
+       else
+               ret = blockdev_direct_IO(rw, iocb, inode, iter, offset,
+                                        ext2_get_block);
        if (ret < 0 && (rw & WRITE))
                ext2_write_failed(mapping, offset + count);
        return ret;
@@ -885,11 +891,6 @@ const struct address_space_operations ext2_aops = {
        .error_remove_page      = generic_error_remove_page,
 };
 
-const struct address_space_operations ext2_aops_xip = {
-       .bmap                   = ext2_bmap,
-       .get_xip_mem            = ext2_get_xip_mem,
-};
-
 const struct address_space_operations ext2_nobh_aops = {
        .readpage               = ext2_readpage,
        .readpages              = ext2_readpages,
@@ -1201,8 +1202,8 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
 
        inode_dio_wait(inode);
 
-       if (mapping_is_xip(inode->i_mapping))
-               error = xip_truncate_page(inode->i_mapping, newsize);
+       if (IS_DAX(inode))
+               error = dax_truncate_page(inode, newsize, ext2_get_block);
        else if (test_opt(inode->i_sb, NOBH))
                error = nobh_truncate_page(inode->i_mapping,
                                newsize, ext2_get_block);
@@ -1273,7 +1274,8 @@ void ext2_set_inode_flags(struct inode *inode)
 {
        unsigned int flags = EXT2_I(inode)->i_flags;
 
-       inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
+       inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME |
+                               S_DIRSYNC | S_DAX);
        if (flags & EXT2_SYNC_FL)
                inode->i_flags |= S_SYNC;
        if (flags & EXT2_APPEND_FL)
@@ -1284,6 +1286,8 @@ void ext2_set_inode_flags(struct inode *inode)
                inode->i_flags |= S_NOATIME;
        if (flags & EXT2_DIRSYNC_FL)
                inode->i_flags |= S_DIRSYNC;
+       if (test_opt(inode->i_sb, DAX))
+               inode->i_flags |= S_DAX;
 }
 
 /* Propagate flags from i_flags to EXT2_I(inode)->i_flags */
@@ -1384,9 +1388,9 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
 
        if (S_ISREG(inode->i_mode)) {
                inode->i_op = &ext2_file_inode_operations;
-               if (ext2_use_xip(inode->i_sb)) {
-                       inode->i_mapping->a_ops = &ext2_aops_xip;
-                       inode->i_fop = &ext2_xip_file_operations;
+               if (test_opt(inode->i_sb, DAX)) {
+                       inode->i_mapping->a_ops = &ext2_aops;
+                       inode->i_fop = &ext2_dax_file_operations;
                } else if (test_opt(inode->i_sb, NOBH)) {
                        inode->i_mapping->a_ops = &ext2_nobh_aops;
                        inode->i_fop = &ext2_file_operations;
index c268d0af1db93c5c9b3db0dd93dd6819d977b42e..148f6e3789eaea4fa7937ef019d9dbf23af36b60 100644 (file)
@@ -35,7 +35,6 @@
 #include "ext2.h"
 #include "xattr.h"
 #include "acl.h"
-#include "xip.h"
 
 static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode)
 {
@@ -105,9 +104,9 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode
                return PTR_ERR(inode);
 
        inode->i_op = &ext2_file_inode_operations;
-       if (ext2_use_xip(inode->i_sb)) {
-               inode->i_mapping->a_ops = &ext2_aops_xip;
-               inode->i_fop = &ext2_xip_file_operations;
+       if (test_opt(inode->i_sb, DAX)) {
+               inode->i_mapping->a_ops = &ext2_aops;
+               inode->i_fop = &ext2_dax_file_operations;
        } else if (test_opt(inode->i_sb, NOBH)) {
                inode->i_mapping->a_ops = &ext2_nobh_aops;
                inode->i_fop = &ext2_file_operations;
@@ -126,9 +125,9 @@ static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
                return PTR_ERR(inode);
 
        inode->i_op = &ext2_file_inode_operations;
-       if (ext2_use_xip(inode->i_sb)) {
-               inode->i_mapping->a_ops = &ext2_aops_xip;
-               inode->i_fop = &ext2_xip_file_operations;
+       if (test_opt(inode->i_sb, DAX)) {
+               inode->i_mapping->a_ops = &ext2_aops;
+               inode->i_fop = &ext2_dax_file_operations;
        } else if (test_opt(inode->i_sb, NOBH)) {
                inode->i_mapping->a_ops = &ext2_nobh_aops;
                inode->i_fop = &ext2_file_operations;
index ae55fddc26a9d46f5ad7c3c2910150b9557ee235..d0e746e965118f9dd6410f1b65396e5650cf483a 100644 (file)
@@ -35,7 +35,6 @@
 #include "ext2.h"
 #include "xattr.h"
 #include "acl.h"
-#include "xip.h"
 
 static void ext2_sync_super(struct super_block *sb,
                            struct ext2_super_block *es, int wait);
@@ -292,9 +291,11 @@ static int ext2_show_options(struct seq_file *seq, struct dentry *root)
                seq_puts(seq, ",grpquota");
 #endif
 
-#if defined(CONFIG_EXT2_FS_XIP)
+#ifdef CONFIG_FS_DAX
        if (sbi->s_mount_opt & EXT2_MOUNT_XIP)
                seq_puts(seq, ",xip");
+       if (sbi->s_mount_opt & EXT2_MOUNT_DAX)
+               seq_puts(seq, ",dax");
 #endif
 
        if (!test_opt(sb, RESERVATION))
@@ -403,7 +404,7 @@ enum {
        Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic,
        Opt_err_ro, Opt_nouid32, Opt_nocheck, Opt_debug,
        Opt_oldalloc, Opt_orlov, Opt_nobh, Opt_user_xattr, Opt_nouser_xattr,
-       Opt_acl, Opt_noacl, Opt_xip, Opt_ignore, Opt_err, Opt_quota,
+       Opt_acl, Opt_noacl, Opt_xip, Opt_dax, Opt_ignore, Opt_err, Opt_quota,
        Opt_usrquota, Opt_grpquota, Opt_reservation, Opt_noreservation
 };
 
@@ -432,6 +433,7 @@ static const match_table_t tokens = {
        {Opt_acl, "acl"},
        {Opt_noacl, "noacl"},
        {Opt_xip, "xip"},
+       {Opt_dax, "dax"},
        {Opt_grpquota, "grpquota"},
        {Opt_ignore, "noquota"},
        {Opt_quota, "quota"},
@@ -559,10 +561,14 @@ static int parse_options(char *options, struct super_block *sb)
                        break;
 #endif
                case Opt_xip:
-#ifdef CONFIG_EXT2_FS_XIP
-                       set_opt (sbi->s_mount_opt, XIP);
+                       ext2_msg(sb, KERN_INFO, "use dax instead of xip");
+                       set_opt(sbi->s_mount_opt, XIP);
+                       /* Fall through */
+               case Opt_dax:
+#ifdef CONFIG_FS_DAX
+                       set_opt(sbi->s_mount_opt, DAX);
 #else
-                       ext2_msg(sb, KERN_INFO, "xip option not supported");
+                       ext2_msg(sb, KERN_INFO, "dax option not supported");
 #endif
                        break;
 
@@ -877,9 +883,6 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
                ((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ?
                 MS_POSIXACL : 0);
 
-       ext2_xip_verify_sb(sb); /* see if bdev supports xip, unset
-                                   EXT2_MOUNT_XIP if not */
-
        if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV &&
            (EXT2_HAS_COMPAT_FEATURE(sb, ~0U) ||
             EXT2_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
@@ -909,11 +912,17 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 
        blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
 
-       if (ext2_use_xip(sb) && blocksize != PAGE_SIZE) {
-               if (!silent)
+       if (sbi->s_mount_opt & EXT2_MOUNT_DAX) {
+               if (blocksize != PAGE_SIZE) {
                        ext2_msg(sb, KERN_ERR,
-                               "error: unsupported blocksize for xip");
-               goto failed_mount;
+                                       "error: unsupported blocksize for dax");
+                       goto failed_mount;
+               }
+               if (!sb->s_bdev->bd_disk->fops->direct_access) {
+                       ext2_msg(sb, KERN_ERR,
+                                       "error: device does not support dax");
+                       goto failed_mount;
+               }
        }
 
        /* If the blocksize doesn't match, re-read the thing.. */
@@ -1259,7 +1268,6 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 {
        struct ext2_sb_info * sbi = EXT2_SB(sb);
        struct ext2_super_block * es;
-       unsigned long old_mount_opt = sbi->s_mount_opt;
        struct ext2_mount_options old_opts;
        unsigned long old_sb_flags;
        int err;
@@ -1284,22 +1292,11 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
        sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
                ((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
 
-       ext2_xip_verify_sb(sb); /* see if bdev supports xip, unset
-                                   EXT2_MOUNT_XIP if not */
-
-       if ((ext2_use_xip(sb)) && (sb->s_blocksize != PAGE_SIZE)) {
-               ext2_msg(sb, KERN_WARNING,
-                       "warning: unsupported blocksize for xip");
-               err = -EINVAL;
-               goto restore_opts;
-       }
-
        es = sbi->s_es;
-       if ((sbi->s_mount_opt ^ old_mount_opt) & EXT2_MOUNT_XIP) {
+       if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT2_MOUNT_DAX) {
                ext2_msg(sb, KERN_WARNING, "warning: refusing change of "
-                        "xip flag with busy inodes while remounting");
-               sbi->s_mount_opt &= ~EXT2_MOUNT_XIP;
-               sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP;
+                        "dax flag with busy inodes while remounting");
+               sbi->s_mount_opt ^= EXT2_MOUNT_DAX;
        }
        if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
                spin_unlock(&sbi->s_lock);
diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c
deleted file mode 100644 (file)
index bbc5fec..0000000
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- *  linux/fs/ext2/xip.c
- *
- * Copyright (C) 2005 IBM Corporation
- * Author: Carsten Otte (cotte@de.ibm.com)
- */
-
-#include <linux/mm.h>
-#include <linux/fs.h>
-#include <linux/genhd.h>
-#include <linux/buffer_head.h>
-#include <linux/blkdev.h>
-#include "ext2.h"
-#include "xip.h"
-
-static inline long __inode_direct_access(struct inode *inode, sector_t block,
-                               void **kaddr, unsigned long *pfn, long size)
-{
-       struct block_device *bdev = inode->i_sb->s_bdev;
-       sector_t sector = block * (PAGE_SIZE / 512);
-       return bdev_direct_access(bdev, sector, kaddr, pfn, size);
-}
-
-static inline int
-__ext2_get_block(struct inode *inode, pgoff_t pgoff, int create,
-                  sector_t *result)
-{
-       struct buffer_head tmp;
-       int rc;
-
-       memset(&tmp, 0, sizeof(struct buffer_head));
-       tmp.b_size = 1 << inode->i_blkbits;
-       rc = ext2_get_block(inode, pgoff, &tmp, create);
-       *result = tmp.b_blocknr;
-
-       /* did we get a sparse block (hole in the file)? */
-       if (!tmp.b_blocknr && !rc) {
-               BUG_ON(create);
-               rc = -ENODATA;
-       }
-
-       return rc;
-}
-
-int
-ext2_clear_xip_target(struct inode *inode, sector_t block)
-{
-       void *kaddr;
-       unsigned long pfn;
-       long size;
-
-       size = __inode_direct_access(inode, block, &kaddr, &pfn, PAGE_SIZE);
-       if (size < 0)
-               return size;
-       clear_page(kaddr);
-       return 0;
-}
-
-void ext2_xip_verify_sb(struct super_block *sb)
-{
-       struct ext2_sb_info *sbi = EXT2_SB(sb);
-
-       if ((sbi->s_mount_opt & EXT2_MOUNT_XIP) &&
-           !sb->s_bdev->bd_disk->fops->direct_access) {
-               sbi->s_mount_opt &= (~EXT2_MOUNT_XIP);
-               ext2_msg(sb, KERN_WARNING,
-                            "warning: ignoring xip option - "
-                            "not supported by bdev");
-       }
-}
-
-int ext2_get_xip_mem(struct address_space *mapping, pgoff_t pgoff, int create,
-                               void **kmem, unsigned long *pfn)
-{
-       long rc;
-       sector_t block;
-
-       /* first, retrieve the sector number */
-       rc = __ext2_get_block(mapping->host, pgoff, create, &block);
-       if (rc)
-               return rc;
-
-       /* retrieve address of the target data */
-       rc = __inode_direct_access(mapping->host, block, kmem, pfn, PAGE_SIZE);
-       return (rc < 0) ? rc : 0;
-}
diff --git a/fs/ext2/xip.h b/fs/ext2/xip.h
deleted file mode 100644 (file)
index 18b34d2..0000000
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- *  linux/fs/ext2/xip.h
- *
- * Copyright (C) 2005 IBM Corporation
- * Author: Carsten Otte (cotte@de.ibm.com)
- */
-
-#ifdef CONFIG_EXT2_FS_XIP
-extern void ext2_xip_verify_sb (struct super_block *);
-extern int ext2_clear_xip_target (struct inode *, sector_t);
-
-static inline int ext2_use_xip (struct super_block *sb)
-{
-       struct ext2_sb_info *sbi = EXT2_SB(sb);
-       return (sbi->s_mount_opt & EXT2_MOUNT_XIP);
-}
-int ext2_get_xip_mem(struct address_space *, pgoff_t, int,
-                               void **, unsigned long *);
-#define mapping_is_xip(map) unlikely(map->a_ops->get_xip_mem)
-#else
-#define mapping_is_xip(map)                    0
-#define ext2_xip_verify_sb(sb)                 do { } while (0)
-#define ext2_use_xip(sb)                       0
-#define ext2_clear_xip_target(inode, chain)    0
-#define ext2_get_xip_mem                       NULL
-#endif
index a75fba67bb1f197ba83143484a2a93de0b0dabe9..982d934fd9ac98338377d3b1621b3d577531b6e6 100644 (file)
@@ -965,6 +965,11 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_ERRORS_MASK         0x00070
 #define EXT4_MOUNT_MINIX_DF            0x00080 /* Mimics the Minix statfs */
 #define EXT4_MOUNT_NOLOAD              0x00100 /* Don't use existing journal*/
+#ifdef CONFIG_FS_DAX
+#define EXT4_MOUNT_DAX                 0x00200 /* Direct Access */
+#else
+#define EXT4_MOUNT_DAX                 0
+#endif
 #define EXT4_MOUNT_DATA_FLAGS          0x00C00 /* Mode for data writes: */
 #define EXT4_MOUNT_JOURNAL_DATA                0x00400 /* Write data to journal */
 #define EXT4_MOUNT_ORDERED_DATA                0x00800 /* Flush data before commit */
@@ -2578,6 +2583,7 @@ extern const struct file_operations ext4_dir_operations;
 /* file.c */
 extern const struct inode_operations ext4_file_inode_operations;
 extern const struct file_operations ext4_file_operations;
+extern const struct file_operations ext4_dax_file_operations;
 extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
 
 /* inline.c */
index 7cb5923861211999fe7c3bd627fee9ca9d1f8d7d..33a09da16c9ce1e8049fdcacdf3e8833410fd78f 100644 (file)
@@ -95,7 +95,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
        struct inode *inode = file_inode(iocb->ki_filp);
        struct mutex *aio_mutex = NULL;
        struct blk_plug plug;
-       int o_direct = file->f_flags & O_DIRECT;
+       int o_direct = io_is_direct(file);
        int overwrite = 0;
        size_t length = iov_iter_count(from);
        ssize_t ret;
@@ -191,6 +191,26 @@ errout:
        return ret;
 }
 
+#ifdef CONFIG_FS_DAX
+static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{      /* forwards to dax_fault() with ext4_get_block as the block mapper */
+       return dax_fault(vma, vmf, ext4_get_block);
+                                       /* Is this the right get_block? */
+}
+
+static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{      /* forwards to dax_mkwrite() with ext4_get_block as the block mapper */
+       return dax_mkwrite(vma, vmf, ext4_get_block);
+}
+
+static const struct vm_operations_struct ext4_dax_vm_ops = {
+       .fault          = ext4_dax_fault,
+       .page_mkwrite   = ext4_dax_mkwrite,
+};
+#else  /* !CONFIG_FS_DAX */
+#define ext4_dax_vm_ops        ext4_file_vm_ops        /* lets ext4_file_mmap() reference ext4_dax_vm_ops when DAX is compiled out */
+#endif /* CONFIG_FS_DAX */
+
 static const struct vm_operations_struct ext4_file_vm_ops = {
        .fault          = filemap_fault,
        .map_pages      = filemap_map_pages,
@@ -200,7 +220,12 @@ static const struct vm_operations_struct ext4_file_vm_ops = {
 static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
        file_accessed(file);
-       vma->vm_ops = &ext4_file_vm_ops;
+       if (IS_DAX(file_inode(file))) {
+               vma->vm_ops = &ext4_dax_vm_ops;
+               vma->vm_flags |= VM_MIXEDMAP;
+       } else {
+               vma->vm_ops = &ext4_file_vm_ops;
+       }
        return 0;
 }
 
@@ -599,6 +624,26 @@ const struct file_operations ext4_file_operations = {
        .fallocate      = ext4_fallocate,
 };
 
+#ifdef CONFIG_FS_DAX
+const struct file_operations ext4_dax_file_operations = {
+       .llseek         = ext4_llseek,
+       .read           = new_sync_read,
+       .write          = new_sync_write,
+       .read_iter      = generic_file_read_iter,
+       .write_iter     = ext4_file_write_iter,
+       .unlocked_ioctl = ext4_ioctl,
+#ifdef CONFIG_COMPAT
+       .compat_ioctl   = ext4_compat_ioctl,
+#endif
+       .mmap           = ext4_file_mmap,
+       .open           = ext4_file_open,
+       .release        = ext4_release_file,
+       .fsync          = ext4_sync_file,
+       /*
+        * Splice not yet supported with DAX: this table deliberately sets
+        * no .splice_read/.splice_write handlers.
+        */
+       .fallocate      = ext4_fallocate,
+};
+#endif
+
 const struct inode_operations ext4_file_inode_operations = {
        .setattr        = ext4_setattr,
        .getattr        = ext4_getattr,
index 36b369697a131523f26f6a6336eec17b4da3ae4f..6b9878a24182b06125cb496ef973ff0d9b739106 100644 (file)
@@ -689,14 +689,22 @@ retry:
                        inode_dio_done(inode);
                        goto locked;
                }
-               ret = __blockdev_direct_IO(rw, iocb, inode,
-                                inode->i_sb->s_bdev, iter, offset,
-                                ext4_get_block, NULL, NULL, 0);
+               if (IS_DAX(inode))
+                       ret = dax_do_io(rw, iocb, inode, iter, offset,
+                                       ext4_get_block, NULL, 0);
+               else
+                       ret = __blockdev_direct_IO(rw, iocb, inode,
+                                       inode->i_sb->s_bdev, iter, offset,
+                                       ext4_get_block, NULL, NULL, 0);
                inode_dio_done(inode);
        } else {
 locked:
-               ret = blockdev_direct_IO(rw, iocb, inode, iter,
-                                offset, ext4_get_block);
+               if (IS_DAX(inode))
+                       ret = dax_do_io(rw, iocb, inode, iter, offset,
+                                       ext4_get_block, NULL, DIO_LOCKING);
+               else
+                       ret = blockdev_direct_IO(rw, iocb, inode, iter,
+                                       offset, ext4_get_block);
 
                if (unlikely((rw & WRITE) && ret < 0)) {
                        loff_t isize = i_size_read(inode);
index 5653fa42930b6afbe29b5dce9e88511c502c8e54..28555f191b62b93ff5fd8a75de45d37aa9d7223e 100644 (file)
@@ -657,6 +657,18 @@ has_zeroout:
        return retval;
 }
 
+/*
+ * b_end_io callback that _ext4_get_block() installs on a DAX buffer_head
+ * mapping an unwritten extent when no io_end is attached.  The logical
+ * block number is carried in bh->b_private and the mapping in
+ * bh->b_assoc_map; when 'uptodate' is set, the byte range starting at that
+ * block and spanning bh->b_size is converted from unwritten to written.
+ */
+static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
+{
+       struct inode *inode = bh->b_assoc_map->host;
+       /* XXX: breaks on 32-bit > 16GB. Is that even supported? */
+       loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
+       int err;
+
+       /* Nothing to convert unless the caller reports the buffer up to date. */
+       if (!uptodate)
+               return;
+       WARN_ON(!buffer_unwritten(bh));
+       err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
+       /*
+        * This callback returns void, so a failed conversion cannot be
+        * propagated to the caller; log it instead of dropping it silently.
+        */
+       if (err)
+               ext4_warning(inode->i_sb,
+                            "failed to convert unwritten extents for inode %lu (offset %lld, size %zu): error %d",
+                            inode->i_ino, (long long)offset, bh->b_size, err);
+}
+
 /* Maximum number of blocks we map for direct IO at once. */
 #define DIO_MAX_BLOCKS 4096
 
@@ -694,6 +706,11 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
 
                map_bh(bh, inode->i_sb, map.m_pblk);
                bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+               if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
+                       bh->b_assoc_map = inode->i_mapping;
+                       bh->b_private = (void *)(unsigned long)iblock;
+                       bh->b_end_io = ext4_end_io_unwritten;
+               }
                if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
                        set_buffer_defer_completion(bh);
                bh->b_size = inode->i_sb->s_blocksize * map.m_len;
@@ -3010,13 +3027,14 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
                get_block_func = ext4_get_block_write;
                dio_flags = DIO_LOCKING;
        }
-       ret = __blockdev_direct_IO(rw, iocb, inode,
-                                  inode->i_sb->s_bdev, iter,
-                                  offset,
-                                  get_block_func,
-                                  ext4_end_io_dio,
-                                  NULL,
-                                  dio_flags);
+       if (IS_DAX(inode))
+               ret = dax_do_io(rw, iocb, inode, iter, offset, get_block_func,
+                               ext4_end_io_dio, dio_flags);
+       else
+               ret = __blockdev_direct_IO(rw, iocb, inode,
+                                          inode->i_sb->s_bdev, iter, offset,
+                                          get_block_func,
+                                          ext4_end_io_dio, NULL, dio_flags);
 
        /*
         * Put our reference to io_end. This can free the io_end structure e.g.
@@ -3180,19 +3198,12 @@ void ext4_set_aops(struct inode *inode)
                inode->i_mapping->a_ops = &ext4_aops;
 }
 
-/*
- * ext4_block_zero_page_range() zeros out a mapping of length 'length'
- * starting from file offset 'from'.  The range to be zero'd must
- * be contained with in one block.  If the specified range exceeds
- * the end of the block it will be shortened to end of the block
- * that cooresponds to 'from'
- */
-static int ext4_block_zero_page_range(handle_t *handle,
+static int __ext4_block_zero_page_range(handle_t *handle,
                struct address_space *mapping, loff_t from, loff_t length)
 {
        ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
        unsigned offset = from & (PAGE_CACHE_SIZE-1);
-       unsigned blocksize, max, pos;
+       unsigned blocksize, pos;
        ext4_lblk_t iblock;
        struct inode *inode = mapping->host;
        struct buffer_head *bh;
@@ -3205,14 +3216,6 @@ static int ext4_block_zero_page_range(handle_t *handle,
                return -ENOMEM;
 
        blocksize = inode->i_sb->s_blocksize;
-       max = blocksize - (offset & (blocksize - 1));
-
-       /*
-        * correct length if it does not fall between
-        * 'from' and the end of the block
-        */
-       if (length > max || length < 0)
-               length = max;
 
        iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
 
@@ -3277,6 +3280,33 @@ unlock:
        return err;
 }
 
+/*
+ * ext4_block_zero_page_range() zeros out a mapping of length 'length'
+ * starting from file offset 'from'.  The range to be zeroed must
+ * be contained within one block.  If the specified range exceeds
+ * the end of the block (or 'length' is negative) it will be shortened
+ * to the end of the block that corresponds to 'from'.
+ *
+ * For DAX inodes the zeroing is done by dax_zero_page_range() and
+ * 'handle' is not used; otherwise the work is handed to
+ * __ext4_block_zero_page_range().
+ */
+static int ext4_block_zero_page_range(handle_t *handle,
+               struct address_space *mapping, loff_t from, loff_t length)
+{
+       struct inode *inode = mapping->host;
+       unsigned offset = from & (PAGE_CACHE_SIZE-1);
+       unsigned blocksize = inode->i_sb->s_blocksize;
+       unsigned max = blocksize - (offset & (blocksize - 1));
+
+       /*
+        * correct length if it does not fall between
+        * 'from' and the end of the block
+        */
+       if (length > max || length < 0)
+               length = max;
+
+       if (IS_DAX(inode))
+               return dax_zero_page_range(inode, from, length, ext4_get_block);
+       return __ext4_block_zero_page_range(handle, mapping, from, length);
+}
+
 /*
  * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
  * up to the end of the block which corresponds to `from'.
@@ -3798,8 +3828,10 @@ void ext4_set_inode_flags(struct inode *inode)
                new_fl |= S_NOATIME;
        if (flags & EXT4_DIRSYNC_FL)
                new_fl |= S_DIRSYNC;
+       if (test_opt(inode->i_sb, DAX))
+               new_fl |= S_DAX;
        inode_set_flags(inode, new_fl,
-                       S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
+                       S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX);
 }
 
 /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
@@ -4052,7 +4084,10 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 
        if (S_ISREG(inode->i_mode)) {
                inode->i_op = &ext4_file_inode_operations;
-               inode->i_fop = &ext4_file_operations;
+               if (test_opt(inode->i_sb, DAX))
+                       inode->i_fop = &ext4_dax_file_operations;
+               else
+                       inode->i_fop = &ext4_file_operations;
                ext4_set_aops(inode);
        } else if (S_ISDIR(inode->i_mode)) {
                inode->i_op = &ext4_dir_inode_operations;
@@ -4534,7 +4569,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
                 * Truncate pagecache after we've waited for commit
                 * in data=journal mode to make pages freeable.
                 */
-                       truncate_pagecache(inode, inode->i_size);
+               truncate_pagecache(inode, inode->i_size);
        }
        /*
         * We want to call ext4_truncate() even if attr->ia_size ==
index 2291923dae4e6c4cf351dcdb7d88d59f9314590d..28fe71a2904c6de8c9774e67238d3a16a9b422a6 100644 (file)
@@ -2235,7 +2235,10 @@ retry:
        err = PTR_ERR(inode);
        if (!IS_ERR(inode)) {
                inode->i_op = &ext4_file_inode_operations;
-               inode->i_fop = &ext4_file_operations;
+               if (test_opt(inode->i_sb, DAX))
+                       inode->i_fop = &ext4_dax_file_operations;
+               else
+                       inode->i_fop = &ext4_file_operations;
                ext4_set_aops(inode);
                err = ext4_add_nondir(handle, dentry, inode);
                if (!err && IS_DIRSYNC(dir))
@@ -2299,7 +2302,10 @@ retry:
        err = PTR_ERR(inode);
        if (!IS_ERR(inode)) {
                inode->i_op = &ext4_file_inode_operations;
-               inode->i_fop = &ext4_file_operations;
+               if (test_opt(inode->i_sb, DAX))
+                       inode->i_fop = &ext4_dax_file_operations;
+               else
+                       inode->i_fop = &ext4_file_operations;
                ext4_set_aops(inode);
                d_tmpfile(dentry, inode);
                err = ext4_orphan_add(handle, inode);
index 64c39c7c594f723fdcf15a2c550609777d3b68f1..10e8c6b7ca08221bb1936da1f376c6b97e8a9d86 100644 (file)
@@ -1124,7 +1124,7 @@ enum {
        Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
        Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
        Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
-       Opt_usrquota, Opt_grpquota, Opt_i_version,
+       Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_dax,
        Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
        Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
        Opt_inode_readahead_blks, Opt_journal_ioprio,
@@ -1187,6 +1187,7 @@ static const match_table_t tokens = {
        {Opt_barrier, "barrier"},
        {Opt_nobarrier, "nobarrier"},
        {Opt_i_version, "i_version"},
+       {Opt_dax, "dax"},
        {Opt_stripe, "stripe=%u"},
        {Opt_delalloc, "delalloc"},
        {Opt_nodelalloc, "nodelalloc"},
@@ -1371,6 +1372,7 @@ static const struct mount_opts {
        {Opt_min_batch_time, 0, MOPT_GTE0},
        {Opt_inode_readahead_blks, 0, MOPT_GTE0},
        {Opt_init_itable, 0, MOPT_GTE0},
+       {Opt_dax, EXT4_MOUNT_DAX, MOPT_SET},
        {Opt_stripe, 0, MOPT_GTE0},
        {Opt_resuid, 0, MOPT_GTE0},
        {Opt_resgid, 0, MOPT_GTE0},
@@ -1606,6 +1608,11 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
                        return -1;
                }
                sbi->s_jquota_fmt = m->mount_opt;
+#endif
+#ifndef CONFIG_FS_DAX
+       } else if (token == Opt_dax) {
+               ext4_msg(sb, KERN_INFO, "dax option not supported");
+               return -1;
 #endif
        } else {
                if (!args->from)
@@ -3589,6 +3596,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                                 "both data=journal and dioread_nolock");
                        goto failed_mount;
                }
+               if (test_opt(sb, DAX)) {
+                       ext4_msg(sb, KERN_ERR, "can't mount with "
+                                "both data=journal and dax");
+                       goto failed_mount;
+               }
                if (test_opt(sb, DELALLOC))
                        clear_opt(sb, DELALLOC);
        }
@@ -3652,6 +3664,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                goto failed_mount;
        }
 
+       if (sbi->s_mount_opt & EXT4_MOUNT_DAX) {
+               if (blocksize != PAGE_SIZE) {
+                       ext4_msg(sb, KERN_ERR,
+                                       "error: unsupported blocksize for dax");
+                       goto failed_mount;
+               }
+               if (!sb->s_bdev->bd_disk->fops->direct_access) {
+                       ext4_msg(sb, KERN_ERR,
+                                       "error: device does not support dax");
+                       goto failed_mount;
+               }
+       }
+
        if (sb->s_blocksize != blocksize) {
                /* Validate the filesystem blocksize */
                if (!sb_set_blocksize(sb, blocksize)) {
@@ -4869,6 +4894,18 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
                        err = -EINVAL;
                        goto restore_opts;
                }
+               if (test_opt(sb, DAX)) {
+                       ext4_msg(sb, KERN_ERR, "can't mount with "
+                                "both data=journal and dax");
+                       err = -EINVAL;
+                       goto restore_opts;
+               }
+       }
+
+       if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) {
+               ext4_msg(sb, KERN_WARNING, "warning: refusing change of "
+                       "dax flag with busy inodes while remounting");
+               sbi->s_mount_opt ^= EXT4_MOUNT_DAX;
        }
 
        if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
index 46d93e941f3d832c60ee50315adfd380b993b53a..44db1808cdb598df6b91548410b3634480c06c31 100644 (file)
@@ -28,6 +28,7 @@
 #include <linux/pipe_fs_i.h>
 #include <linux/mpage.h>
 #include <linux/quotaops.h>
+#include <linux/blkdev.h>
 
 #include <cluster/masklog.h>
 
@@ -47,6 +48,9 @@
 #include "ocfs2_trace.h"
 
 #include "buffer_head_io.h"
+#include "dir.h"
+#include "namei.h"
+#include "sysfile.h"
 
 static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
                                   struct buffer_head *bh_result, int create)
@@ -506,18 +510,21 @@ bail:
  *
  * called like this: dio->get_blocks(dio->inode, fs_startblk,
  *                                     fs_count, map_bh, dio->rw == WRITE);
- *
- * Note that we never bother to allocate blocks here, and thus ignore the
- * create argument.
  */
 static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
                                     struct buffer_head *bh_result, int create)
 {
        int ret;
+       u32 cpos = 0;
+       int alloc_locked = 0;
        u64 p_blkno, inode_blocks, contig_blocks;
        unsigned int ext_flags;
        unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
        unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
+       unsigned long len = bh_result->b_size;
+       unsigned int clusters_to_alloc = 0;
+
+       cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock);
 
        /* This function won't even be called if the request isn't all
         * nicely aligned and of the right size, so there's no need
@@ -539,6 +546,40 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
        /* We should already CoW the refcounted extent in case of create. */
        BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED));
 
+       /* allocate blocks if no p_blkno is found, and create == 1 */
+       if (!p_blkno && create) {
+               ret = ocfs2_inode_lock(inode, NULL, 1);
+               if (ret < 0) {
+                       mlog_errno(ret);
+                       goto bail;
+               }
+
+               alloc_locked = 1;
+
+               /* fill hole, allocate blocks can't be larger than the size
+                * of the hole */
+               clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len);
+               if (clusters_to_alloc > contig_blocks)
+                       clusters_to_alloc = contig_blocks;
+
+               /* allocate extent and insert them into the extent tree */
+               ret = ocfs2_extend_allocation(inode, cpos,
+                               clusters_to_alloc, 0);
+               if (ret < 0) {
+                       mlog_errno(ret);
+                       goto bail;
+               }
+
+               ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
+                               &contig_blocks, &ext_flags);
+               if (ret < 0) {
+                       mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
+                                       (unsigned long long)iblock);
+                       ret = -EIO;
+                       goto bail;
+               }
+       }
+
        /*
         * get_more_blocks() expects us to describe a hole by clearing
         * the mapped bit on bh_result().
@@ -556,6 +597,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
                contig_blocks = max_blocks;
        bh_result->b_size = contig_blocks << blocksize_bits;
 bail:
+       if (alloc_locked)
+               ocfs2_inode_unlock(inode, 1);
        return ret;
 }
 
@@ -597,6 +640,184 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait)
        return try_to_free_buffers(page);
 }
 
+/*
+ * Look up the cluster that ocfs2_bytes_to_clusters() derives from 'offset'
+ * and report whether a write there would be an overwrite.
+ *
+ * Returns 1 when the cluster has a physical mapping that is not flagged
+ * OCFS2_EXT_UNWRITTEN, 0 otherwise, or the negative error returned by
+ * ocfs2_get_clusters().
+ */
+static int ocfs2_is_overwrite(struct ocfs2_super *osb,
+               struct inode *inode, loff_t offset)
+{
+       u32 phys_cpos = 0;
+       unsigned int nr_clusters = 0;
+       unsigned int flags = 0;
+       u32 virt_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
+       int status;
+
+       status = ocfs2_get_clusters(inode, virt_cpos, &phys_cpos,
+                       &nr_clusters, &flags);
+       if (status < 0) {
+               mlog_errno(status);
+               return status;
+       }
+
+       return (phys_cpos && !(flags & OCFS2_EXT_UNWRITTEN)) ? 1 : 0;
+}
+
+/*
+ * ocfs2_direct_IO_write() - issue an O_DIRECT write of 'iter' at 'offset'.
+ *
+ * When the write ends past the current i_size the inode is added to the
+ * orphan dir before the I/O and removed again afterwards; the removal is
+ * passed the new end offset (offset + written) if any bytes were written.
+ * For an append write (offset >= i_size) the file is first extended up to
+ * 'offset' under the inode lock.
+ *
+ * If the I/O fails and would have grown the file, the file is truncated
+ * back to the i_size sampled after the failure, provided i_size has not
+ * changed in the meantime.  If the I/O succeeds for an append write that
+ * starts part-way into a cluster that was not already written, the first
+ * 'zero_len' bytes of that cluster (the part ahead of 'offset') are zeroed
+ * on disk.
+ *
+ * Returns the number of bytes written, or a negative error code.
+ */
+static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
+               struct iov_iter *iter,
+               loff_t offset)
+{
+       ssize_t ret = 0;
+       ssize_t written = 0;
+       bool orphaned = false;
+       int is_overwrite = 0;
+       struct file *file = iocb->ki_filp;
+       struct inode *inode = file_inode(file)->i_mapping->host;
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       struct buffer_head *di_bh = NULL;
+       size_t count = iter->count;
+       journal_t *journal = osb->journal->j_journal;
+       u32 zero_len;
+       int cluster_align;
+       loff_t final_size = offset + count;
+       int append_write = offset >= i_size_read(inode) ? 1 : 0;
+       unsigned int num_clusters = 0;
+       unsigned int ext_flags = 0;
+
+       {
+               u64 o = offset;
+
+               /* zero_len = byte offset of the write within its cluster */
+               zero_len = do_div(o, 1 << osb->s_clustersize_bits);
+               cluster_align = !zero_len;
+       }
+
+       /*
+        * when final_size > inode->i_size, inode->i_size will be
+        * updated after direct write, so add the inode to orphan
+        * dir first.
+        */
+       if (final_size > i_size_read(inode)) {
+               ret = ocfs2_add_inode_to_orphan(osb, inode);
+               if (ret < 0) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+               orphaned = true;
+       }
+
+       if (append_write) {
+               ret = ocfs2_inode_lock(inode, &di_bh, 1);
+               if (ret < 0) {
+                       mlog_errno(ret);
+                       goto clean_orphan;
+               }
+
+               if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+                       ret = ocfs2_zero_extend(inode, di_bh, offset);
+               else
+                       ret = ocfs2_extend_no_holes(inode, di_bh, offset,
+                                       offset);
+               if (ret < 0) {
+                       mlog_errno(ret);
+                       ocfs2_inode_unlock(inode, 1);
+                       brelse(di_bh);
+                       goto clean_orphan;
+               }
+
+               is_overwrite = ocfs2_is_overwrite(osb, inode, offset);
+               if (is_overwrite < 0) {
+                       mlog_errno(is_overwrite);
+                       ocfs2_inode_unlock(inode, 1);
+                       brelse(di_bh);
+                       goto clean_orphan;
+               }
+
+               ocfs2_inode_unlock(inode, 1);
+               brelse(di_bh);
+               di_bh = NULL;
+       }
+
+       written = __blockdev_direct_IO(WRITE, iocb, inode, inode->i_sb->s_bdev,
+                       iter, offset,
+                       ocfs2_direct_IO_get_blocks,
+                       ocfs2_dio_end_io, NULL, 0);
+       if (unlikely(written < 0)) {
+               loff_t i_size = i_size_read(inode);
+
+               if (offset + count > i_size) {
+                       ret = ocfs2_inode_lock(inode, &di_bh, 1);
+                       if (ret < 0) {
+                               mlog_errno(ret);
+                               goto clean_orphan;
+                       }
+
+                       /* only truncate if i_size did not change meanwhile */
+                       if (i_size == i_size_read(inode)) {
+                               ret = ocfs2_truncate_file(inode, di_bh,
+                                               i_size);
+                               if (ret < 0) {
+                                       if (ret != -ENOSPC)
+                                               mlog_errno(ret);
+
+                                       ocfs2_inode_unlock(inode, 1);
+                                       brelse(di_bh);
+                                       goto clean_orphan;
+                               }
+                       }
+
+                       ocfs2_inode_unlock(inode, 1);
+                       brelse(di_bh);
+
+                       ret = jbd2_journal_force_commit(journal);
+                       if (ret < 0)
+                               mlog_errno(ret);
+               }
+       } else if (written > 0 && append_write && !is_overwrite &&
+                       !cluster_align) {
+               /*
+                * Successful append into a cluster that was not previously
+                * written and that the write does not start at the beginning
+                * of: zero the leading 'zero_len' bytes of the cluster.  (A
+                * failed write is fully handled by the branch above, so this
+                * branch has to test for written > 0 to be reachable.)
+                */
+               u32 p_cpos = 0;
+               u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
+
+               ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
+                               &num_clusters, &ext_flags);
+               if (ret < 0) {
+                       mlog_errno(ret);
+                       goto clean_orphan;
+               }
+
+               BUG_ON(!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN));
+
+               /* cluster number and byte length converted to 512-byte sectors */
+               ret = blkdev_issue_zeroout(osb->sb->s_bdev,
+                               p_cpos << (osb->s_clustersize_bits - 9),
+                               zero_len >> 9, GFP_KERNEL, false);
+               if (ret < 0)
+                       mlog_errno(ret);
+       }
+
+clean_orphan:
+       if (orphaned) {
+               int tmp_ret;
+               int update_isize = written > 0 ? 1 : 0;
+               loff_t end = update_isize ? offset + written : 0;
+
+               tmp_ret = ocfs2_del_inode_from_orphan(osb, inode,
+                               update_isize, end);
+               if (tmp_ret < 0) {
+                       ret = tmp_ret;
+                       goto out;
+               }
+
+               tmp_ret = jbd2_journal_force_commit(journal);
+               if (tmp_ret < 0) {
+                       ret = tmp_ret;
+                       mlog_errno(tmp_ret);
+               }
+       }
+
+out:
+       /* no error recorded along the way: report the direct I/O result */
+       if (ret >= 0)
+               ret = written;
+       return ret;
+}
+
 static ssize_t ocfs2_direct_IO(int rw,
                               struct kiocb *iocb,
                               struct iov_iter *iter,
@@ -604,6 +825,9 @@ static ssize_t ocfs2_direct_IO(int rw,
 {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(file)->i_mapping->host;
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       int full_coherency = !(osb->s_mount_opt &
+                       OCFS2_MOUNT_COHERENCY_BUFFERED);
 
        /*
         * Fallback to buffered I/O if we see an inode without
@@ -612,14 +836,20 @@ static ssize_t ocfs2_direct_IO(int rw,
        if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
                return 0;
 
-       /* Fallback to buffered I/O if we are appending. */
-       if (i_size_read(inode) <= offset)
+       /* Fallback to buffered I/O if we are appending and
+        * concurrent O_DIRECT writes are allowed.
+        */
+       if (i_size_read(inode) <= offset && !full_coherency)
                return 0;
 
-       return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
+       if (rw == READ)
+               return __blockdev_direct_IO(rw, iocb, inode,
+                                   inode->i_sb->s_bdev,
                                    iter, offset,
                                    ocfs2_direct_IO_get_blocks,
                                    ocfs2_dio_end_io, NULL, 0);
+       else
+               return ocfs2_direct_IO_write(iocb, iter, offset);
 }
 
 static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
index e0f04d55fd0531e8be46260821c93b920af7bc7d..46e0d4e857c7f493f512196603d3725ca8d3dfaa 100644 (file)
@@ -295,7 +295,7 @@ out:
        return ret;
 }
 
-static int ocfs2_set_inode_size(handle_t *handle,
+int ocfs2_set_inode_size(handle_t *handle,
                                struct inode *inode,
                                struct buffer_head *fe_bh,
                                u64 new_i_size)
@@ -441,7 +441,7 @@ out:
        return status;
 }
 
-static int ocfs2_truncate_file(struct inode *inode,
+int ocfs2_truncate_file(struct inode *inode,
                               struct buffer_head *di_bh,
                               u64 new_i_size)
 {
@@ -709,6 +709,13 @@ leave:
        return status;
 }
 
+/* Non-static entry point that forwards to __ocfs2_extend_allocation(). */
+int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
+               u32 clusters_to_add, int mark_unwritten)
+{
+       int status;
+
+       status = __ocfs2_extend_allocation(inode, logical_start,
+                       clusters_to_add, mark_unwritten);
+       return status;
+}
+
 /*
  * While a write will already be ordering the data, a truncate will not.
  * Thus, we need to explicitly order the zeroed pages.
@@ -2109,6 +2116,9 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
        struct dentry *dentry = file->f_path.dentry;
        struct inode *inode = dentry->d_inode;
        loff_t saved_pos = 0, end;
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       int full_coherency = !(osb->s_mount_opt &
+               OCFS2_MOUNT_COHERENCY_BUFFERED);
 
        /*
         * We start with a read level meta lock and only jump to an ex
@@ -2197,7 +2207,16 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
                 * one node could wind up truncating another
                 * nodes writes.
                 */
-               if (end > i_size_read(inode)) {
+               if (end > i_size_read(inode) && !full_coherency) {
+                       *direct_io = 0;
+                       break;
+               }
+
+               /*
+                * Fallback to old way if the feature bit is not set.
+                */
+               if (end > i_size_read(inode) &&
+                               !ocfs2_supports_append_dio(osb)) {
                        *direct_io = 0;
                        break;
                }
@@ -2210,7 +2229,13 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
                 */
                ret = ocfs2_check_range_for_holes(inode, saved_pos, count);
                if (ret == 1) {
-                       *direct_io = 0;
+                       /*
+                        * Fallback to old way if the feature bit is not set.
+                        * Otherwise try dio first and then complete the rest
+                        * request through buffer io.
+                        */
+                       if (!ocfs2_supports_append_dio(osb))
+                               *direct_io = 0;
                        ret = 0;
                } else if (ret < 0)
                        mlog_errno(ret);
@@ -2243,6 +2268,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
        u32 old_clusters;
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(file);
+       struct address_space *mapping = file->f_mapping;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        int full_coherency = !(osb->s_mount_opt &
                               OCFS2_MOUNT_COHERENCY_BUFFERED);
@@ -2357,11 +2383,51 @@ relock:
 
        iov_iter_truncate(from, count);
        if (direct_io) {
+               loff_t endbyte;
+               ssize_t written_buffered;
                written = generic_file_direct_write(iocb, from, *ppos);
-               if (written < 0) {
+               if (written < 0 || written == count) {
                        ret = written;
                        goto out_dio;
                }
+
+               /*
+                * for completing the rest of the request.
+                */
+               *ppos += written;
+               count -= written;
+               written_buffered = generic_perform_write(file, from, *ppos);
+               /*
+                * If generic_file_buffered_write() returned a synchronous error
+                * then we want to return the number of bytes which were
+                * direct-written, or the error code if that was zero. Note
+                * that this differs from normal direct-io semantics, which
+                * will return -EFOO even if some bytes were written.
+                */
+               if (written_buffered < 0) {
+                       ret = written_buffered;
+                       goto out_dio;
+               }
+
+               iocb->ki_pos = *ppos + written_buffered;
+               /* We need to ensure that the page cache pages are written to
+                * disk and invalidated to preserve the expected O_DIRECT
+                * semantics.
+                */
+               endbyte = *ppos + written_buffered - 1;
+               ret = filemap_write_and_wait_range(file->f_mapping, *ppos,
+                               endbyte);
+               if (ret == 0) {
+                       written += written_buffered;
+                       invalidate_mapping_pages(mapping,
+                                       *ppos >> PAGE_CACHE_SHIFT,
+                                       endbyte >> PAGE_CACHE_SHIFT);
+               } else {
+                       /*
+                        * We don't know how much we wrote, so just return
+                        * the number of bytes which were direct-written
+                        */
+               }
        } else {
                current->backing_dev_info = inode_to_bdi(inode);
                written = generic_perform_write(file, from, *ppos);
index 97bf761c9e7c7b8f1c1744bbada1d778095eb3d7..e8c62f22215c13b404dc572da2441c9f67cbfed2 100644 (file)
@@ -51,13 +51,22 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
                         struct ocfs2_alloc_context *data_ac,
                         struct ocfs2_alloc_context *meta_ac,
                         enum ocfs2_alloc_restarted *reason_ret);
+int ocfs2_set_inode_size(handle_t *handle,
+               struct inode *inode,
+               struct buffer_head *fe_bh,
+               u64 new_i_size);
 int ocfs2_simple_size_update(struct inode *inode,
                             struct buffer_head *di_bh,
                             u64 new_i_size);
+int ocfs2_truncate_file(struct inode *inode,
+               struct buffer_head *di_bh,
+               u64 new_i_size);
 int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
                          u64 new_i_size, u64 zero_to);
 int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
                      loff_t zero_to);
+int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
+               u32 clusters_to_add, int mark_unwritten);
 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
 int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
                  struct kstat *stat);
index c8b25de9efbbf93217480059fdca6f44e7513d56..3025c0da6b8abd6358a60581087a257dd4f24e6f 100644 (file)
@@ -648,7 +648,7 @@ static int ocfs2_remove_inode(struct inode *inode,
 
        if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
                status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
-                                         orphan_dir_bh);
+                                         orphan_dir_bh, false);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail_commit;
index ca3431ee7f2493fb999cdb6ced2908bdb87fbe17..5e86b247c821ce8434dfa5c9d5fe59798a8cde86 100644 (file)
@@ -81,6 +81,8 @@ struct ocfs2_inode_info
        tid_t i_sync_tid;
        tid_t i_datasync_tid;
 
+       wait_queue_head_t append_dio_wq;
+
        struct dquot *i_dquot[MAXQUOTAS];
 };
 
index d10860fde16545e464610484b93be871aab5aa1a..ff531928269ed1d4e1e0f79bfd6335db3cbc3671 100644 (file)
@@ -50,6 +50,8 @@
 #include "sysfile.h"
 #include "uptodate.h"
 #include "quota.h"
+#include "file.h"
+#include "namei.h"
 
 #include "buffer_head_io.h"
 #include "ocfs2_trace.h"
@@ -69,13 +71,15 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
 static int ocfs2_trylock_journal(struct ocfs2_super *osb,
                                 int slot_num);
 static int ocfs2_recover_orphans(struct ocfs2_super *osb,
-                                int slot);
+                                int slot,
+                                enum ocfs2_orphan_reco_type orphan_reco_type);
 static int ocfs2_commit_thread(void *arg);
 static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
                                            int slot_num,
                                            struct ocfs2_dinode *la_dinode,
                                            struct ocfs2_dinode *tl_dinode,
-                                           struct ocfs2_quota_recovery *qrec);
+                                           struct ocfs2_quota_recovery *qrec,
+                                           enum ocfs2_orphan_reco_type orphan_reco_type);
 
 static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb)
 {
@@ -149,7 +153,8 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb)
        return 0;
 }
 
-void ocfs2_queue_replay_slots(struct ocfs2_super *osb)
+void ocfs2_queue_replay_slots(struct ocfs2_super *osb,
+               enum ocfs2_orphan_reco_type orphan_reco_type)
 {
        struct ocfs2_replay_map *replay_map = osb->replay_map;
        int i;
@@ -163,7 +168,8 @@ void ocfs2_queue_replay_slots(struct ocfs2_super *osb)
        for (i = 0; i < replay_map->rm_slots; i++)
                if (replay_map->rm_replay_slots[i])
                        ocfs2_queue_recovery_completion(osb->journal, i, NULL,
-                                                       NULL, NULL);
+                                                       NULL, NULL,
+                                                       orphan_reco_type);
        replay_map->rm_state = REPLAY_DONE;
 }
 
@@ -1174,6 +1180,7 @@ struct ocfs2_la_recovery_item {
        struct ocfs2_dinode     *lri_la_dinode;
        struct ocfs2_dinode     *lri_tl_dinode;
        struct ocfs2_quota_recovery *lri_qrec;
+       enum ocfs2_orphan_reco_type  lri_orphan_reco_type;
 };
 
 /* Does the second half of the recovery process. By this point, the
@@ -1195,6 +1202,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
        struct ocfs2_dinode *la_dinode, *tl_dinode;
        struct ocfs2_la_recovery_item *item, *n;
        struct ocfs2_quota_recovery *qrec;
+       enum ocfs2_orphan_reco_type orphan_reco_type;
        LIST_HEAD(tmp_la_list);
 
        trace_ocfs2_complete_recovery(
@@ -1212,6 +1220,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
                la_dinode = item->lri_la_dinode;
                tl_dinode = item->lri_tl_dinode;
                qrec = item->lri_qrec;
+               orphan_reco_type = item->lri_orphan_reco_type;
 
                trace_ocfs2_complete_recovery_slot(item->lri_slot,
                        la_dinode ? le64_to_cpu(la_dinode->i_blkno) : 0,
@@ -1236,7 +1245,8 @@ void ocfs2_complete_recovery(struct work_struct *work)
                        kfree(tl_dinode);
                }
 
-               ret = ocfs2_recover_orphans(osb, item->lri_slot);
+               ret = ocfs2_recover_orphans(osb, item->lri_slot,
+                               orphan_reco_type);
                if (ret < 0)
                        mlog_errno(ret);
 
@@ -1261,7 +1271,8 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
                                            int slot_num,
                                            struct ocfs2_dinode *la_dinode,
                                            struct ocfs2_dinode *tl_dinode,
-                                           struct ocfs2_quota_recovery *qrec)
+                                           struct ocfs2_quota_recovery *qrec,
+                                           enum ocfs2_orphan_reco_type orphan_reco_type)
 {
        struct ocfs2_la_recovery_item *item;
 
@@ -1285,6 +1296,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
        item->lri_slot = slot_num;
        item->lri_tl_dinode = tl_dinode;
        item->lri_qrec = qrec;
+       item->lri_orphan_reco_type = orphan_reco_type;
 
        spin_lock(&journal->j_lock);
        list_add_tail(&item->lri_list, &journal->j_la_cleanups);
@@ -1304,7 +1316,8 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
        /* No need to queue up our truncate_log as regular cleanup will catch
         * that */
        ocfs2_queue_recovery_completion(journal, osb->slot_num,
-                                       osb->local_alloc_copy, NULL, NULL);
+                                       osb->local_alloc_copy, NULL, NULL,
+                                       ORPHAN_NEED_TRUNCATE);
        ocfs2_schedule_truncate_log_flush(osb, 0);
 
        osb->local_alloc_copy = NULL;
@@ -1312,7 +1325,7 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
 
        /* queue to recover orphan slots for all offline slots */
        ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
-       ocfs2_queue_replay_slots(osb);
+       ocfs2_queue_replay_slots(osb, ORPHAN_NEED_TRUNCATE);
        ocfs2_free_replay_slots(osb);
 }
 
@@ -1323,7 +1336,8 @@ void ocfs2_complete_quota_recovery(struct ocfs2_super *osb)
                                                osb->slot_num,
                                                NULL,
                                                NULL,
-                                               osb->quota_rec);
+                                               osb->quota_rec,
+                                               ORPHAN_NEED_TRUNCATE);
                osb->quota_rec = NULL;
        }
 }
@@ -1360,7 +1374,7 @@ restart:
 
        /* queue recovery for our own slot */
        ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
-                                       NULL, NULL);
+                                       NULL, NULL, ORPHAN_NO_NEED_TRUNCATE);
 
        spin_lock(&osb->osb_lock);
        while (rm->rm_used) {
@@ -1419,13 +1433,14 @@ skip_recovery:
                        continue;
                }
                ocfs2_queue_recovery_completion(osb->journal, rm_quota[i],
-                                               NULL, NULL, qrec);
+                                               NULL, NULL, qrec,
+                                               ORPHAN_NEED_TRUNCATE);
        }
 
        ocfs2_super_unlock(osb, 1);
 
        /* queue recovery for offline slots */
-       ocfs2_queue_replay_slots(osb);
+       ocfs2_queue_replay_slots(osb, ORPHAN_NEED_TRUNCATE);
 
 bail:
        mutex_lock(&osb->recovery_lock);
@@ -1711,7 +1726,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
 
        /* This will kfree the memory pointed to by la_copy and tl_copy */
        ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy,
-                                       tl_copy, NULL);
+                                       tl_copy, NULL, ORPHAN_NEED_TRUNCATE);
 
        status = 0;
 done:
@@ -1901,7 +1916,7 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
 
        for (i = 0; i < osb->max_slots; i++)
                ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL,
-                                               NULL);
+                                               NULL, ORPHAN_NO_NEED_TRUNCATE);
        /*
         * We queued a recovery on orphan slots, increment the sequence
         * number and update LVB so other node will skip the scan for a while
@@ -2000,6 +2015,13 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
        if (IS_ERR(iter))
                return 0;
 
+       /* Skip inodes which are already added to recover list, since dio may
+        * happen concurrently with unlink/rename */
+       if (OCFS2_I(iter)->ip_next_orphan) {
+               iput(iter);
+               return 0;
+       }
+
        trace_ocfs2_orphan_filldir((unsigned long long)OCFS2_I(iter)->ip_blkno);
        /* No locking is required for the next_orphan queue as there
         * is only ever a single process doing orphan recovery. */
@@ -2108,7 +2130,8 @@ static void ocfs2_clear_recovering_orphan_dir(struct ocfs2_super *osb,
  *   advertising our state to ocfs2_delete_inode().
  */
 static int ocfs2_recover_orphans(struct ocfs2_super *osb,
-                                int slot)
+                                int slot,
+                                enum ocfs2_orphan_reco_type orphan_reco_type)
 {
        int ret = 0;
        struct inode *inode = NULL;
@@ -2132,13 +2155,60 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
                                        (unsigned long long)oi->ip_blkno);
 
                iter = oi->ip_next_orphan;
+               oi->ip_next_orphan = NULL;
+
+               /*
+                * We need to take and drop the inode lock to
+                * force read inode from disk.
+                */
+               ret = ocfs2_inode_lock(inode, NULL, 0);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto next;
+               }
+               ocfs2_inode_unlock(inode, 0);
+
+               if (inode->i_nlink == 0) {
+                       spin_lock(&oi->ip_lock);
+                       /* Set the proper information to get us going into
+                        * ocfs2_delete_inode. */
+                       oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
+                       spin_unlock(&oi->ip_lock);
+               } else if (orphan_reco_type == ORPHAN_NEED_TRUNCATE) {
+                       struct buffer_head *di_bh = NULL;
+
+                       ret = ocfs2_rw_lock(inode, 1);
+                       if (ret) {
+                               mlog_errno(ret);
+                               goto next;
+                       }
+
+                       ret = ocfs2_inode_lock(inode, &di_bh, 1);
+                       if (ret < 0) {
+                               ocfs2_rw_unlock(inode, 1);
+                               mlog_errno(ret);
+                               goto next;
+                       }
+
+                       ret = ocfs2_truncate_file(inode, di_bh,
+                                       i_size_read(inode));
+                       ocfs2_inode_unlock(inode, 1);
+                       ocfs2_rw_unlock(inode, 1);
+                       brelse(di_bh);
+                       if (ret < 0) {
+                               if (ret != -ENOSPC)
+                                       mlog_errno(ret);
+                               goto next;
+                       }
+
+                       ret = ocfs2_del_inode_from_orphan(osb, inode, 0, 0);
+                       if (ret)
+                               mlog_errno(ret);
 
-               spin_lock(&oi->ip_lock);
-               /* Set the proper information to get us going into
-                * ocfs2_delete_inode. */
-               oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
-               spin_unlock(&oi->ip_lock);
+                       wake_up(&OCFS2_I(inode)->append_dio_wq);
+               } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */
 
+next:
                iput(inode);
 
                inode = iter;
index 7f8cde94abfe6968069f206562a025eed6e5ff5c..f4cd3c3e9fb70d708d57a3d8dc15f92492e4ea12 100644 (file)
@@ -472,6 +472,11 @@ static inline int ocfs2_unlink_credits(struct super_block *sb)
  * orphan dir index leaf */
 #define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 4)
 
+/* dinode + orphan dir dinode + extent tree leaf block + orphan dir entry +
+ * orphan dir index root + orphan dir index leaf */
+#define OCFS2_INODE_ADD_TO_ORPHAN_CREDITS  (2 * OCFS2_INODE_UPDATE_CREDITS + 4)
+#define OCFS2_INODE_DEL_FROM_ORPHAN_CREDITS  OCFS2_INODE_ADD_TO_ORPHAN_CREDITS
+
 /* dinode update, old dir dinode update, new dir dinode update, old
  * dir dir entry, new dir dir entry, dir entry update for renaming
  * directory + target unlink + 3 x dir index leaves */
index 914c121ec8900380482f83728b90f7b0bd14e418..b5c3a5ea3ee60e264cf7523e8626ac3b61f9421b 100644 (file)
@@ -79,7 +79,8 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
                                    struct inode **ret_orphan_dir,
                                    u64 blkno,
                                    char *name,
-                                   struct ocfs2_dir_lookup_result *lookup);
+                                   struct ocfs2_dir_lookup_result *lookup,
+                                   bool dio);
 
 static int ocfs2_orphan_add(struct ocfs2_super *osb,
                            handle_t *handle,
@@ -87,7 +88,8 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
                            struct buffer_head *fe_bh,
                            char *name,
                            struct ocfs2_dir_lookup_result *lookup,
-                           struct inode *orphan_dir_inode);
+                           struct inode *orphan_dir_inode,
+                           bool dio);
 
 static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
                                     handle_t *handle,
@@ -104,6 +106,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
 static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2);
 /* An orphan dir name is an 8 byte value, printed as a hex string */
 #define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64)))
+#define OCFS2_DIO_ORPHAN_PREFIX "dio-"
+#define OCFS2_DIO_ORPHAN_PREFIX_LEN 4
 
 static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
                                   unsigned int flags)
@@ -952,7 +956,8 @@ static int ocfs2_unlink(struct inode *dir,
        if (ocfs2_inode_is_unlinkable(inode)) {
                status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
                                                  OCFS2_I(inode)->ip_blkno,
-                                                 orphan_name, &orphan_insert);
+                                                 orphan_name, &orphan_insert,
+                                                 false);
                if (status < 0) {
                        mlog_errno(status);
                        goto leave;
@@ -1004,7 +1009,7 @@ static int ocfs2_unlink(struct inode *dir,
 
        if (is_unlinkable) {
                status = ocfs2_orphan_add(osb, handle, inode, fe_bh,
-                               orphan_name, &orphan_insert, orphan_dir);
+                               orphan_name, &orphan_insert, orphan_dir, false);
                if (status < 0)
                        mlog_errno(status);
        }
@@ -1440,7 +1445,8 @@ static int ocfs2_rename(struct inode *old_dir,
                if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) {
                        status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
                                                OCFS2_I(new_inode)->ip_blkno,
-                                               orphan_name, &orphan_insert);
+                                               orphan_name, &orphan_insert,
+                                               false);
                        if (status < 0) {
                                mlog_errno(status);
                                goto bail;
@@ -1507,7 +1513,7 @@ static int ocfs2_rename(struct inode *old_dir,
                if (should_add_orphan) {
                        status = ocfs2_orphan_add(osb, handle, new_inode,
                                        newfe_bh, orphan_name,
-                                       &orphan_insert, orphan_dir);
+                                       &orphan_insert, orphan_dir, false);
                        if (status < 0) {
                                mlog_errno(status);
                                goto bail;
@@ -2088,12 +2094,28 @@ static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode,
                                      struct buffer_head *orphan_dir_bh,
                                      u64 blkno,
                                      char *name,
-                                     struct ocfs2_dir_lookup_result *lookup)
+                                     struct ocfs2_dir_lookup_result *lookup,
+                                     bool dio)
 {
        int ret;
        struct ocfs2_super *osb = OCFS2_SB(orphan_dir_inode->i_sb);
+       int namelen = dio ?
+                       (OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN) :
+                       OCFS2_ORPHAN_NAMELEN;
+
+       if (dio) {
+               ret = snprintf(name, OCFS2_DIO_ORPHAN_PREFIX_LEN + 1, "%s",
+                               OCFS2_DIO_ORPHAN_PREFIX);
+               if (ret != OCFS2_DIO_ORPHAN_PREFIX_LEN) {
+                       ret = -EINVAL;
+                       mlog_errno(ret);
+                       return ret;
+               }
 
-       ret = ocfs2_blkno_stringify(blkno, name);
+               ret = ocfs2_blkno_stringify(blkno,
+                               name + OCFS2_DIO_ORPHAN_PREFIX_LEN);
+       } else
+               ret = ocfs2_blkno_stringify(blkno, name);
        if (ret < 0) {
                mlog_errno(ret);
                return ret;
@@ -2101,7 +2123,7 @@ static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode,
 
        ret = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
                                           orphan_dir_bh, name,
-                                          OCFS2_ORPHAN_NAMELEN, lookup);
+                                          namelen, lookup);
        if (ret < 0) {
                mlog_errno(ret);
                return ret;
@@ -2128,7 +2150,8 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
                                    struct inode **ret_orphan_dir,
                                    u64 blkno,
                                    char *name,
-                                   struct ocfs2_dir_lookup_result *lookup)
+                                   struct ocfs2_dir_lookup_result *lookup,
+                                   bool dio)
 {
        struct inode *orphan_dir_inode = NULL;
        struct buffer_head *orphan_dir_bh = NULL;
@@ -2142,7 +2165,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
        }
 
        ret = __ocfs2_prepare_orphan_dir(orphan_dir_inode, orphan_dir_bh,
-                                        blkno, name, lookup);
+                                        blkno, name, lookup, dio);
        if (ret < 0) {
                mlog_errno(ret);
                goto out;
@@ -2170,12 +2193,16 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
                            struct buffer_head *fe_bh,
                            char *name,
                            struct ocfs2_dir_lookup_result *lookup,
-                           struct inode *orphan_dir_inode)
+                           struct inode *orphan_dir_inode,
+                           bool dio)
 {
        struct buffer_head *orphan_dir_bh = NULL;
        int status = 0;
        struct ocfs2_dinode *orphan_fe;
        struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
+       int namelen = dio ?
+                       (OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN) :
+                       OCFS2_ORPHAN_NAMELEN;
 
        trace_ocfs2_orphan_add_begin(
                                (unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -2219,7 +2246,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
        ocfs2_journal_dirty(handle, orphan_dir_bh);
 
        status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
-                                  OCFS2_ORPHAN_NAMELEN, inode,
+                                  namelen, inode,
                                   OCFS2_I(inode)->ip_blkno,
                                   orphan_dir_bh, lookup);
        if (status < 0) {
@@ -2227,13 +2254,21 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
                goto rollback;
        }
 
-       fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL);
-       OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR;
+       if (dio) {
+               /* Update flag OCFS2_DIO_ORPHANED_FL and record the orphan
+                * slot.
+                */
+               fe->i_flags |= cpu_to_le32(OCFS2_DIO_ORPHANED_FL);
+               fe->i_dio_orphaned_slot = cpu_to_le16(osb->slot_num);
+       } else {
+               fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL);
+               OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR;
 
-       /* Record which orphan dir our inode now resides
-        * in. delete_inode will use this to determine which orphan
-        * dir to lock. */
-       fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
+               /* Record which orphan dir our inode now resides
+                * in. delete_inode will use this to determine which orphan
+                * dir to lock. */
+               fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
+       }
 
        ocfs2_journal_dirty(handle, fe_bh);
 
@@ -2258,14 +2293,28 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
                     handle_t *handle,
                     struct inode *orphan_dir_inode,
                     struct inode *inode,
-                    struct buffer_head *orphan_dir_bh)
+                    struct buffer_head *orphan_dir_bh,
+                    bool dio)
 {
-       char name[OCFS2_ORPHAN_NAMELEN + 1];
+       const int namelen = OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN;
+       char name[namelen + 1];
        struct ocfs2_dinode *orphan_fe;
        int status = 0;
        struct ocfs2_dir_lookup_result lookup = { NULL, };
 
-       status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
+       if (dio) {
+               status = snprintf(name, OCFS2_DIO_ORPHAN_PREFIX_LEN + 1, "%s",
+                               OCFS2_DIO_ORPHAN_PREFIX);
+               if (status != OCFS2_DIO_ORPHAN_PREFIX_LEN) {
+                       status = -EINVAL;
+                       mlog_errno(status);
+                       return status;
+               }
+
+               status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno,
+                               name + OCFS2_DIO_ORPHAN_PREFIX_LEN);
+       } else
+               status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
@@ -2273,10 +2322,10 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
 
        trace_ocfs2_orphan_del(
             (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno,
-            name, OCFS2_ORPHAN_NAMELEN);
+            name, namelen);
 
        /* find it's spot in the orphan directory */
-       status = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, orphan_dir_inode,
+       status = ocfs2_find_entry(name, namelen, orphan_dir_inode,
                                  &lookup);
        if (status) {
                mlog_errno(status);
@@ -2376,7 +2425,8 @@ static int ocfs2_prep_new_orphaned_file(struct inode *dir,
        }
 
        ret = __ocfs2_prepare_orphan_dir(orphan_dir, orphan_dir_bh,
-                                        di_blkno, orphan_name, orphan_insert);
+                                        di_blkno, orphan_name, orphan_insert,
+                                        false);
        if (ret < 0) {
                mlog_errno(ret);
                goto out;
@@ -2482,7 +2532,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
 
        di = (struct ocfs2_dinode *)new_di_bh->b_data;
        status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name,
-                                 &orphan_insert, orphan_dir);
+                                 &orphan_insert, orphan_dir, false);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
@@ -2527,6 +2577,186 @@ leave:
        return status;
 }
 
+/*
+ * Wait condition used on append_dio_wq: report whether @inode's dinode,
+ * as returned by ocfs2_inode_lock(), no longer has OCFS2_DIO_ORPHANED_FL
+ * set.
+ *
+ * Returns non-zero when the flag is clear. Returns 0 while the flag is
+ * still set, and also when the inode lock could not be taken (the error
+ * is logged, not propagated).
+ */
+static int ocfs2_dio_orphan_recovered(struct inode *inode)
+{
+       struct buffer_head *di_bh = NULL;
+       struct ocfs2_dinode *fe;
+       int recovered;
+       int status;
+
+       status = ocfs2_inode_lock(inode, &di_bh, 1);
+       if (status < 0) {
+               mlog_errno(status);
+               return 0;
+       }
+
+       /* Sample the flag while the inode lock is still held. */
+       fe = (struct ocfs2_dinode *) di_bh->b_data;
+       recovered = (fe->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL)) == 0;
+
+       ocfs2_inode_unlock(inode, 1);
+       brelse(di_bh);
+
+       return recovered;
+}
+
+#define OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL 10000
+/*
+ * Add @inode to this node's orphan directory under a "dio-" prefixed name
+ * and mark its dinode with OCFS2_DIO_ORPHANED_FL (both done by
+ * ocfs2_orphan_add() with dio == true).
+ *
+ * If the dinode already carries OCFS2_DIO_ORPHANED_FL, the inode lock is
+ * dropped and the caller sleeps on append_dio_wq until
+ * ocfs2_dio_orphan_recovered() reports the flag cleared, re-checking at
+ * least every OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL milliseconds.
+ *
+ * @osb:   super block info; its slot selects the orphan directory
+ * @inode: inode to add to the orphan directory
+ *
+ * Returns 0 on success or a negative error. A wait interrupted by a signal
+ * returns the negative value from wait_event_interruptible_timeout()
+ * instead of retrying.
+ */
+int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
+       struct inode *inode)
+{
+       char orphan_name[OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN + 1];
+       struct inode *orphan_dir_inode = NULL;
+       struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
+       struct buffer_head *di_bh = NULL;
+       int status = 0;
+       long wait_ret;
+       handle_t *handle = NULL;
+       struct ocfs2_dinode *di = NULL;
+
+restart:
+       status = ocfs2_inode_lock(inode, &di_bh, 1);
+       if (status < 0) {
+               mlog_errno(status);
+               goto bail;
+       }
+
+       di = (struct ocfs2_dinode *) di_bh->b_data;
+       /*
+        * Another append dio crashed?
+        * If so, wait for recovery first.
+        */
+       if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
+               ocfs2_inode_unlock(inode, 1);
+               brelse(di_bh);
+               /*
+                * Forget the released buffer so the next ocfs2_inode_lock()
+                * call is not handed a stale buffer_head pointer.
+                */
+               di_bh = NULL;
+
+               /*
+                * NOTE(review): the wait condition itself takes the inode
+                * lock; confirm that is acceptable inside a wait_event loop.
+                */
+               wait_ret = wait_event_interruptible_timeout(
+                               OCFS2_I(inode)->append_dio_wq,
+                               ocfs2_dio_orphan_recovered(inode),
+                               msecs_to_jiffies(OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL));
+               if (wait_ret < 0) {
+                       /*
+                        * Interrupted by a signal: give up rather than spin
+                        * through "restart" with the signal still pending.
+                        */
+                       status = wait_ret;
+                       goto bail;
+               }
+               /* Timed out or woken up: re-check the flag under the lock. */
+               goto restart;
+       }
+
+       status = ocfs2_prepare_orphan_dir(osb, &orphan_dir_inode,
+                       OCFS2_I(inode)->ip_blkno,
+                       orphan_name,
+                       &orphan_insert,
+                       true);
+       if (status < 0) {
+               mlog_errno(status);
+               goto bail_unlock_inode;
+       }
+
+       handle = ocfs2_start_trans(osb,
+                       OCFS2_INODE_ADD_TO_ORPHAN_CREDITS);
+       if (IS_ERR(handle)) {
+               status = PTR_ERR(handle);
+               goto bail_unlock_orphan;
+       }
+
+       status = ocfs2_orphan_add(osb, handle, inode, di_bh, orphan_name,
+                       &orphan_insert, orphan_dir_inode, true);
+       if (status)
+               mlog_errno(status);
+
+       /* Commit even if the add failed; status still carries the error. */
+       ocfs2_commit_trans(osb, handle);
+
+bail_unlock_orphan:
+       /* Drop the orphan dir's cluster lock, i_mutex and inode reference. */
+       ocfs2_inode_unlock(orphan_dir_inode, 1);
+       mutex_unlock(&orphan_dir_inode->i_mutex);
+       iput(orphan_dir_inode);
+
+       ocfs2_free_dir_lookup_result(&orphan_insert);
+
+bail_unlock_inode:
+       ocfs2_inode_unlock(inode, 1);
+       brelse(di_bh);
+
+bail:
+       return status;
+}
+
+/*
+ * Remove @inode's "dio-" entry from the orphan directory recorded in its
+ * dinode, then clear OCFS2_DIO_ORPHANED_FL and i_dio_orphaned_slot.
+ *
+ * @osb:          super block info, used to look up the orphan directory and
+ *                to start and commit the transaction
+ * @inode:        inode to take off the orphan directory; its dinode must
+ *                have OCFS2_DIO_ORPHANED_FL set (enforced with BUG_ON)
+ * @update_isize: if non-zero, finish the dinode update through
+ *                ocfs2_set_inode_size() with @end; otherwise the dinode
+ *                buffer is only marked dirty in the journal
+ * @end:          size handed to ocfs2_set_inode_size(); unused when
+ *                @update_isize is zero
+ *
+ * Returns 0 on success or the negative error of the step that failed.
+ */
+int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
+               struct inode *inode, int update_isize,
+               loff_t end)
+{
+       struct inode *orphan_dir_inode = NULL;
+       struct buffer_head *orphan_dir_bh = NULL;
+       struct buffer_head *di_bh = NULL;
+       struct ocfs2_dinode *di = NULL;
+       handle_t *handle = NULL;
+       int status = 0;
+
+       status = ocfs2_inode_lock(inode, &di_bh, 1);
+       if (status < 0) {
+               mlog_errno(status);
+               goto bail;
+       }
+       di = (struct ocfs2_dinode *) di_bh->b_data;
+
+       /* The dinode records which slot's orphan dir holds the dio entry. */
+       orphan_dir_inode = ocfs2_get_system_file_inode(osb,
+                       ORPHAN_DIR_SYSTEM_INODE,
+                       le16_to_cpu(di->i_dio_orphaned_slot));
+       if (!orphan_dir_inode) {
+               status = -ENOENT;
+               mlog_errno(status);
+               goto bail_unlock_inode;
+       }
+
+       /*
+        * Orphan dir: i_mutex first, then its inode lock. On failure only
+        * the mutex and the reference need undoing here.
+        */
+       mutex_lock(&orphan_dir_inode->i_mutex);
+       status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
+       if (status < 0) {
+               mutex_unlock(&orphan_dir_inode->i_mutex);
+               iput(orphan_dir_inode);
+               mlog_errno(status);
+               goto bail_unlock_inode;
+       }
+
+       handle = ocfs2_start_trans(osb,
+                       OCFS2_INODE_DEL_FROM_ORPHAN_CREDITS);
+       if (IS_ERR(handle)) {
+               status = PTR_ERR(handle);
+               goto bail_unlock_orphan;
+       }
+
+       /* Callers must only pass inodes that are currently dio-orphaned. */
+       BUG_ON(!(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL)));
+
+       status = ocfs2_orphan_del(osb, handle, orphan_dir_inode,
+                               inode, orphan_dir_bh, true);
+       if (status < 0) {
+               mlog_errno(status);
+               goto bail_commit;
+       }
+
+       status = ocfs2_journal_access_di(handle,
+                       INODE_CACHE(inode),
+                       di_bh,
+                       OCFS2_JOURNAL_ACCESS_WRITE);
+       if (status < 0) {
+               mlog_errno(status);
+               goto bail_commit;
+       }
+
+       di->i_flags &= ~cpu_to_le32(OCFS2_DIO_ORPHANED_FL);
+       di->i_dio_orphaned_slot = 0;
+
+       /*
+        * Exactly one of the two branches completes the dinode update
+        * started by ocfs2_journal_access_di() above.
+        */
+       if (update_isize) {
+               status = ocfs2_set_inode_size(handle, inode, di_bh, end);
+               if (status)
+                       mlog_errno(status);
+       } else
+               ocfs2_journal_dirty(handle, di_bh);
+
+bail_commit:
+       ocfs2_commit_trans(osb, handle);
+
+bail_unlock_orphan:
+       /* Release the orphan dir in reverse order of acquisition. */
+       ocfs2_inode_unlock(orphan_dir_inode, 1);
+       mutex_unlock(&orphan_dir_inode->i_mutex);
+       brelse(orphan_dir_bh);
+       iput(orphan_dir_inode);
+
+bail_unlock_inode:
+       ocfs2_inode_unlock(inode, 1);
+       brelse(di_bh);
+
+bail:
+       return status;
+}
+
 int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
                                   struct inode *inode,
                                   struct dentry *dentry)
@@ -2615,7 +2845,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
        }
 
        status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
-                                 orphan_dir_bh);
+                                 orphan_dir_bh, false);
        if (status < 0) {
                mlog_errno(status);
                goto out_commit;
index e5d059d4f11532422f9e3f15fc90da80e373d353..5ddecce172fad738d2d436df76618bda9f19b756 100644 (file)
@@ -34,10 +34,16 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
                     handle_t *handle,
                     struct inode *orphan_dir_inode,
                     struct inode *inode,
-                    struct buffer_head *orphan_dir_bh);
+                    struct buffer_head *orphan_dir_bh,
+                    bool dio);
 int ocfs2_create_inode_in_orphan(struct inode *dir,
                                 int mode,
                                 struct inode **new_inode);
+int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
+               struct inode *inode);
+int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
+               struct inode *inode, int update_isize,
+               loff_t end);
 int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
                                   struct inode *new_inode,
                                   struct dentry *new_dentry);
index fdbcbfed529ee5ff4cca3ef339cee5341db8b892..8490c64d34fef4fd0421c4bb7e7c7ea9f6d87bca 100644 (file)
@@ -209,6 +209,11 @@ struct ocfs2_lock_res {
 #endif
 };
 
+/*
+ * Orphan recovery type.
+ * NOTE(review): presumably tells orphan recovery whether the inode also
+ * has to be truncated -- confirm against ocfs2_recover_orphans().
+ */
+enum ocfs2_orphan_reco_type {
+       ORPHAN_NO_NEED_TRUNCATE = 0,
+       ORPHAN_NEED_TRUNCATE,
+};
+
 enum ocfs2_orphan_scan_state {
        ORPHAN_SCAN_ACTIVE,
        ORPHAN_SCAN_INACTIVE
@@ -495,6 +500,14 @@ static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb)
        return 0;
 }
 
+static inline int ocfs2_supports_append_dio(struct ocfs2_super *osb)
+{
+       /* 1 when the append-DIO ro-compat feature bit is set, else 0. */
+       return !!(osb->s_feature_ro_compat &
+                 OCFS2_FEATURE_RO_COMPAT_APPEND_DIO);
+}
+
+
 static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb)
 {
        if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INLINE_DATA)
@@ -726,6 +739,16 @@ static inline unsigned int ocfs2_clusters_for_bytes(struct super_block *sb,
        return clusters;
 }
 
+static inline unsigned int ocfs2_bytes_to_clusters(struct super_block *sb,
+               u64 bytes)
+{
+       /*
+        * Shift @bytes right by the cluster-size bit count, which drops
+        * any partial trailing cluster, then narrow to unsigned int.
+        */
+       return (unsigned int)(bytes >> OCFS2_SB(sb)->s_clustersize_bits);
+}
+
 static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb,
                                         u64 bytes)
 {
index 938387a10d5d7c2caa65ee5faeb1649d5fbc578d..20e37a3ed26f3eb721ad45c2699a6332a098cee8 100644 (file)
                                         | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO)
 #define OCFS2_FEATURE_RO_COMPAT_SUPP   (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
                                         | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
-                                        | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
+                                        | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA \
+                                        | OCFS2_FEATURE_RO_COMPAT_APPEND_DIO)
 
 /*
  * Heartbeat-only devices are missing journals and other files.  The
 #define OCFS2_FEATURE_RO_COMPAT_USRQUOTA       0x0002
 #define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA       0x0004
 
+/*
+ * Append Direct IO support
+ */
+#define OCFS2_FEATURE_RO_COMPAT_APPEND_DIO     0x0008
+
 /* The byte offset of the first backup block will be 1G.
  * The following will be 4G, 16G, 64G, 256G and 1T.
  */
 #define OCFS2_CHAIN_FL         (0x00000400)    /* Chain allocator */
 #define OCFS2_DEALLOC_FL       (0x00000800)    /* Truncate log */
 #define OCFS2_QUOTA_FL         (0x00001000)    /* Quota file */
+#define OCFS2_DIO_ORPHANED_FL  (0x00002000)    /* On the orphan list especially
+                                                * for dio */
 
 /*
  * Flags on ocfs2_dinode.i_dyn_features
@@ -729,7 +737,9 @@ struct ocfs2_dinode {
                                           inode belongs to.  Only valid
                                           if allocated from a
                                           discontiguous block group */
-/*A0*/ __le64 i_reserved2[3];
+/*A0*/ __le16 i_dio_orphaned_slot;     /* only used for append dio write */
+       __le16 i_reserved1[3];
+       __le64 i_reserved2[2];
 /*B8*/ union {
                __le64 i_pad1;          /* Generic way to refer to this
                                           64bit union */
index 87a1f7679d9b6fb4e86c6696b27bafa390a7d73c..26675185b88688c1c8b7571eed45cd06f1d70eca 100644 (file)
@@ -1746,6 +1746,8 @@ static void ocfs2_inode_init_once(void *data)
        ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
        ocfs2_lock_res_init_once(&oi->ip_open_lockres);
 
+       init_waitqueue_head(&oi->append_dio_wq);
+
        ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode),
                                  &ocfs2_inode_caching_ops);
 
index 813be037b412907e040fe2b94861cfcbf14fc91f..a293c2020676720e9da43c42d858cc5d73a41674 100644 (file)
--- a/fs/open.c
+++ b/fs/open.c
@@ -667,11 +667,8 @@ int open_check_o_direct(struct file *f)
 {
        /* NB: we're sure to have correct a_ops only after f_op->open */
        if (f->f_flags & O_DIRECT) {
-               if (!f->f_mapping->a_ops ||
-                   ((!f->f_mapping->a_ops->direct_IO) &&
-                   (!f->f_mapping->a_ops->get_xip_mem))) {
+               if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO)
                        return -EINVAL;
-               }
        }
        return 0;
 }
index e49f10cc8a738340be7f3c89dc473cd8b9c004cd..ed5a0900b94d49989b9eb0fdff8db935fabc66a9 100644 (file)
@@ -51,6 +51,7 @@ struct swap_info_struct;
 struct seq_file;
 struct workqueue_struct;
 struct iov_iter;
+struct vm_fault;
 
 extern void __init inode_init(void);
 extern void __init inode_init_early(void);
@@ -361,8 +362,6 @@ struct address_space_operations {
        int (*releasepage) (struct page *, gfp_t);
        void (*freepage)(struct page *);
        ssize_t (*direct_IO)(int, struct kiocb *, struct iov_iter *iter, loff_t offset);
-       int (*get_xip_mem)(struct address_space *, pgoff_t, int,
-                                               void **, unsigned long *);
        /*
         * migrate the contents of a page to the specified target. If
         * migrate_mode is MIGRATE_ASYNC, it must not block.
@@ -1677,6 +1676,11 @@ struct super_operations {
 #define S_IMA          1024    /* Inode has an associated IMA struct */
 #define S_AUTOMOUNT    2048    /* Automount/referral quasi-directory */
 #define S_NOSEC                4096    /* no suid or xattr security attributes */
+#ifdef CONFIG_FS_DAX
+#define S_DAX          8192    /* Direct Access, avoiding the page cache */
+#else
+#define S_DAX          0       /* Make all the DAX code disappear */
+#endif
 
 /*
  * Note that nosuid etc flags are inode-specific: setting some file-system
@@ -1714,6 +1718,7 @@ struct super_operations {
 #define IS_IMA(inode)          ((inode)->i_flags & S_IMA)
 #define IS_AUTOMOUNT(inode)    ((inode)->i_flags & S_AUTOMOUNT)
 #define IS_NOSEC(inode)                ((inode)->i_flags & S_NOSEC)
+#define IS_DAX(inode)          ((inode)->i_flags & S_DAX)
 
 #define IS_WHITEOUT(inode)     (S_ISCHR(inode->i_mode) && \
                                 (inode)->i_rdev == WHITEOUT_DEV)
@@ -2581,19 +2586,13 @@ extern loff_t fixed_size_llseek(struct file *file, loff_t offset,
 extern int generic_file_open(struct inode * inode, struct file * filp);
 extern int nonseekable_open(struct inode * inode, struct file * filp);
 
-#ifdef CONFIG_FS_XIP
-extern ssize_t xip_file_read(struct file *filp, char __user *buf, size_t len,
-                            loff_t *ppos);
-extern int xip_file_mmap(struct file * file, struct vm_area_struct * vma);
-extern ssize_t xip_file_write(struct file *filp, const char __user *buf,
-                             size_t len, loff_t *ppos);
-extern int xip_truncate_page(struct address_space *mapping, loff_t from);
-#else
-static inline int xip_truncate_page(struct address_space *mapping, loff_t from)
-{
-       return 0;
-}
-#endif
+ssize_t dax_do_io(int rw, struct kiocb *, struct inode *, struct iov_iter *,
+               loff_t, get_block_t, dio_iodone_t, int flags);
+int dax_clear_blocks(struct inode *, sector_t block, long size);
+int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
+int dax_truncate_page(struct inode *, loff_t from, get_block_t);
+int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
+#define dax_mkwrite(vma, vmf, gb)      dax_fault(vma, vmf, gb)
 
 #ifdef CONFIG_BLOCK
 typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode,
@@ -2750,6 +2749,11 @@ extern int generic_show_options(struct seq_file *m, struct dentry *root);
 extern void save_mount_options(struct super_block *sb, char *options);
 extern void replace_mount_options(struct super_block *sb, char *options);
 
+/*
+ * Report whether I/O on @filp should take the direct path: either the
+ * file was opened with O_DIRECT, or its inode is flagged S_DAX.
+ */
+static inline bool io_is_direct(struct file *filp)
+{
+       if (filp->f_flags & O_DIRECT)
+               return true;
+
+       return IS_DAX(file_inode(filp));
+}
+
 static inline ino_t parent_ino(struct dentry *dentry)
 {
        ino_t res;
index 9bee7ec0c31f6c8fc328cc3b1ca118ec9be6a601..47a93928b90fff85339fd09d47761549db576b58 100644 (file)
@@ -224,6 +224,7 @@ struct vm_fault {
        pgoff_t pgoff;                  /* Logical page offset based on vma */
        void __user *virtual_address;   /* Faulting virtual address */
 
+       struct page *cow_page;          /* Handler may choose to COW */
        struct page *page;              /* ->fault handlers should return a
                                         * page here, unless VM_FAULT_NOPAGE
                                         * is set (which is also implied by
index b38f559130d5a2962a3e7581a1fd54a6b0fb9f0d..c4c559a45dc852a491d7b0a5d815e5787a11f56e 100644 (file)
@@ -198,7 +198,7 @@ int page_referenced(struct page *, int is_locked,
 int try_to_unmap(struct page *, enum ttu_flags flags);
 
 /*
- * Called from mm/filemap_xip.c to unmap empty zero page
+ * Used by uprobes to replace a userspace page safely
  */
 pte_t *__page_check_address(struct page *, struct mm_struct *,
                                unsigned long, spinlock_t **, int);
diff --git a/include/linux/rtc/ds1685.h b/include/linux/rtc/ds1685.h
new file mode 100644 (file)
index 0000000..e6337a5
--- /dev/null
@@ -0,0 +1,375 @@
+/*
+ * Definitions for the registers, addresses, and platform data of the
+ * DS1685/DS1687-series RTC chips.
+ *
+ * This Driver also works for the DS17X85/DS17X87 RTC chips.  Functionally
+ * similar to the DS1685/DS1687, they support a few extra features which
+ * include larger, battery-backed NV-SRAM, burst-mode access, and an RTC
+ * write counter.
+ *
+ * Copyright (C) 2011-2014 Joshua Kinard <kumba@gentoo.org>.
+ * Copyright (C) 2009 Matthias Fuchs <matthias.fuchs@esd-electronics.com>.
+ *
+ * References:
+ *    DS1685/DS1687 3V/5V Real-Time Clocks, 19-5215, Rev 4/10.
+ *    DS17x85/DS17x87 3V/5V Real-Time Clocks, 19-5222, Rev 4/10.
+ *    DS1689/DS1693 3V/5V Serialized Real-Time Clocks, Rev 112105.
+ *    Application Note 90, Using the Multiplex Bus RTC Extended Features.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _LINUX_RTC_DS1685_H_
+#define _LINUX_RTC_DS1685_H_
+
+#include <linux/rtc.h>
+#include <linux/platform_device.h>
+#include <linux/workqueue.h>
+
+/**
+ * struct ds1685_priv - DS1685 private data structure.
+ * @dev: pointer to the rtc_device structure.
+ * @regs: iomapped base address pointer of the RTC registers.
+ * @regstep: padding/step size between registers (optional).
+ * @baseaddr: base address of the RTC device.
+ * @size: resource size.
+ * @lock: private lock variable for spin locking/unlocking.
+ * @work: private work item (struct work_struct).
+ * @irq_num: IRQ number assigned to the RTC device.
+ * @bcd_mode: NOTE(review): presumably true when the time registers are
+ *     BCD-encoded (DM = 0) rather than binary -- confirm in the driver.
+ * @no_irq: NOTE(review): presumably set when the RTC has no usable IRQ --
+ *     confirm in the driver.
+ * @uie_unsupported: NOTE(review): presumably set when update-ended
+ *     interrupts cannot be used -- confirm in the driver.
+ * @alloc_io_resources: NOTE(review): presumably asks the driver to claim
+ *     and map the register region itself -- confirm in the driver.
+ * @read: callback taking the private data and an int, returning a u8.
+ *     NOTE(review): presumably the register read accessor -- confirm.
+ * @write: callback taking the private data, an int and a u8.
+ *     NOTE(review): presumably the register write accessor -- confirm.
+ * @prepare_poweroff: pointer to platform pre-poweroff function.
+ * @wake_alarm: pointer to platform wake alarm function.
+ * @post_ram_clear: pointer to platform post ram-clear function.
+ */
+struct ds1685_priv {
+       struct rtc_device *dev;
+       void __iomem *regs;
+       u32 regstep;
+       resource_size_t baseaddr;
+       size_t size;
+       spinlock_t lock;
+       struct work_struct work;
+       int irq_num;
+       bool bcd_mode;
+       bool no_irq;
+       bool uie_unsupported;
+       bool alloc_io_resources;
+       u8 (*read)(struct ds1685_priv *, int);
+       void (*write)(struct ds1685_priv *, int, u8);
+       void (*prepare_poweroff)(void);
+       void (*wake_alarm)(void);
+       void (*post_ram_clear)(void);
+};
+
+
+/**
+ * struct ds1685_rtc_platform_data - platform data structure.
+ * @regstep: custom padding/step size between registers.
+ * @bcd_mode: NOTE(review): presumably selects BCD (DM = 0) rather than
+ *     binary time-register encoding -- confirm in the driver.
+ * @no_irq: NOTE(review): presumably set when the platform provides no IRQ
+ *     for the RTC -- confirm in the driver.
+ * @uie_unsupported: NOTE(review): presumably set when update-ended
+ *     interrupts cannot be used -- confirm in the driver.
+ * @alloc_io_resources: NOTE(review): presumably asks the driver to claim
+ *     and map the register region itself -- confirm in the driver.
+ * @plat_read: callback taking the private data and an int, returning a u8.
+ *     NOTE(review): presumably the register read accessor -- confirm.
+ * @plat_write: callback taking the private data, an int and a u8.
+ *     NOTE(review): presumably the register write accessor -- confirm.
+ * @plat_prepare_poweroff: platform-specific pre-poweroff function.
+ * @plat_wake_alarm: platform-specific wake alarm function.
+ * @plat_post_ram_clear: platform-specific post ram-clear function.
+ *
+ * If your platform needs to use a custom padding/step size between
+ * registers, or uses one or more of the extended interrupts and needs special
+ * handling, then include this header file in your platform definition and
+ * set regstep and the plat_* pointers as appropriate.
+ */
+struct ds1685_rtc_platform_data {
+       const u32 regstep;
+       const bool bcd_mode;
+       const bool no_irq;
+       const bool uie_unsupported;
+       const bool alloc_io_resources;
+       u8 (*plat_read)(struct ds1685_priv *, int);
+       void (*plat_write)(struct ds1685_priv *, int, u8);
+       void (*plat_prepare_poweroff)(void);
+       void (*plat_wake_alarm)(void);
+       void (*plat_post_ram_clear)(void);
+};
+
+
+/*
+ * Time Registers.
+ */
+#define RTC_SECS               0x00    /* Seconds 00-59 */
+#define RTC_SECS_ALARM         0x01    /* Alarm Seconds 00-59 */
+#define RTC_MINS               0x02    /* Minutes 00-59 */
+#define RTC_MINS_ALARM         0x03    /* Alarm Minutes 00-59 */
+#define RTC_HRS                        0x04    /* Hours 01-12 AM/PM || 00-23 */
+#define RTC_HRS_ALARM          0x05    /* Alarm Hours 01-12 AM/PM || 00-23 */
+#define RTC_WDAY               0x06    /* Day of Week 01-07 */
+#define RTC_MDAY               0x07    /* Day of Month 01-31 */
+#define RTC_MONTH              0x08    /* Month 01-12 */
+#define RTC_YEAR               0x09    /* Year 00-99 */
+#define RTC_CENTURY            0x48    /* Century 00-99 */
+#define RTC_MDAY_ALARM         0x49    /* Alarm Day of Month 01-31 */
+
+
+/*
+ * Bit masks for the Time registers in BCD Mode (DM = 0).
+ */
+#define RTC_SECS_BCD_MASK      0x7f    /* - x x x x x x x */
+#define RTC_MINS_BCD_MASK      0x7f    /* - x x x x x x x */
+#define RTC_HRS_12_BCD_MASK    0x1f    /* - - - x x x x x */
+#define RTC_HRS_24_BCD_MASK    0x3f    /* - - x x x x x x */
+#define RTC_MDAY_BCD_MASK      0x3f    /* - - x x x x x x */
+#define RTC_MONTH_BCD_MASK     0x1f    /* - - - x x x x x */
+#define RTC_YEAR_BCD_MASK      0xff    /* x x x x x x x x */
+
+/*
+ * Bit masks for the Time registers in BIN Mode (DM = 1).
+ */
+#define RTC_SECS_BIN_MASK      0x3f    /* - - x x x x x x */
+#define RTC_MINS_BIN_MASK      0x3f    /* - - x x x x x x */
+#define RTC_HRS_12_BIN_MASK    0x0f    /* - - - - x x x x */
+#define RTC_HRS_24_BIN_MASK    0x1f    /* - - - x x x x x */
+#define RTC_MDAY_BIN_MASK      0x1f    /* - - - x x x x x */
+#define RTC_MONTH_BIN_MASK     0x0f    /* - - - - x x x x */
+#define RTC_YEAR_BIN_MASK      0x7f    /* - x x x x x x x */
+
+/*
+ * Bit masks common for the Time registers in BCD or BIN Mode.
+ */
+#define RTC_WDAY_MASK          0x07    /* - - - - - x x x */
+#define RTC_CENTURY_MASK       0xff    /* x x x x x x x x */
+#define RTC_MDAY_ALARM_MASK    0xff    /* x x x x x x x x */
+#define RTC_HRS_AMPM_MASK      BIT(7)  /* Mask for the AM/PM bit */
+
+
+
+/*
+ * Control Registers.
+ */
+#define RTC_CTRL_A             0x0a    /* Control Register A */
+#define RTC_CTRL_B             0x0b    /* Control Register B */
+#define RTC_CTRL_C             0x0c    /* Control Register C */
+#define RTC_CTRL_D             0x0d    /* Control Register D */
+#define RTC_EXT_CTRL_4A                0x4a    /* Extended Control Register 4A */
+#define RTC_EXT_CTRL_4B                0x4b    /* Extended Control Register 4B */
+
+
+/*
+ * Bit names in Control Register A, listed from bit 7 down to bit 0.
+ */
+#define RTC_CTRL_A_UIP         BIT(7)  /* Update In Progress */
+#define RTC_CTRL_A_DV2         BIT(6)  /* Countdown Chain */
+#define RTC_CTRL_A_DV1         BIT(5)  /* Oscillator Enable */
+#define RTC_CTRL_A_DV0         BIT(4)  /* Bank Select */
+#define RTC_CTRL_A_RS3         BIT(3)  /* Rate-Selection Bit 3 */
+#define RTC_CTRL_A_RS2         BIT(2)  /* Rate-Selection Bit 2 */
+#define RTC_CTRL_A_RS1         BIT(1)  /* Rate-Selection Bit 1 */
+#define RTC_CTRL_A_RS0         BIT(0)  /* Rate-Selection Bit 0 */
+#define RTC_CTRL_A_RS_MASK     0x0f    /* RS3 + RS2 + RS1 + RS0 */
+
+/*
+ * Bit names in Control Register B.
+ */
+#define RTC_CTRL_B_SET         BIT(7)  /* SET Bit */
+#define RTC_CTRL_B_PIE         BIT(6)  /* Periodic-Interrupt Enable */
+#define RTC_CTRL_B_AIE         BIT(5)  /* Alarm-Interrupt Enable */
+#define RTC_CTRL_B_UIE         BIT(4)  /* Update-Ended Interrupt-Enable */
+#define RTC_CTRL_B_SQWE                BIT(3)  /* Square-Wave Enable */
+#define RTC_CTRL_B_DM          BIT(2)  /* Data Mode */
+#define RTC_CTRL_B_2412                BIT(1)  /* 12-Hr/24-Hr Mode */
+#define RTC_CTRL_B_DSE         BIT(0)  /* Daylight Savings Enable */
+#define RTC_CTRL_B_PAU_MASK    0x70    /* PIE + AIE + UIE */
+
+
+/*
+ * Bit names in Control Register C.
+ *
+ * BIT(0), BIT(1), BIT(2), & BIT(3) are unused, always return 0, and cannot
+ * be written to.
+ */
+#define RTC_CTRL_C_IRQF                BIT(7)  /* Interrupt-Request Flag */
+#define RTC_CTRL_C_PF          BIT(6)  /* Periodic-Interrupt Flag */
+#define RTC_CTRL_C_AF          BIT(5)  /* Alarm-Interrupt Flag */
+#define RTC_CTRL_C_UF          BIT(4)  /* Update-Ended Interrupt Flag */
+#define RTC_CTRL_C_PAU_MASK    0x70    /* PF + AF + UF */
+
+
+/*
+ * Bit names in Control Register D.
+ *
+ * BIT(0) through BIT(6) are unused, always return 0, and cannot
+ * be written to.
+ */
+#define RTC_CTRL_D_VRT         BIT(7)  /* Valid RAM and Time */
+
+
+/*
+ * Bit names in Extended Control Register 4A.
+ *
+ * On the DS1685/DS1687/DS1689/DS1693, BIT(4) and BIT(5) are reserved for
+ * future use.  They can be read from and written to, but have no effect
+ * on the RTC's operation.
+ *
+ * On the DS17x85/DS17x87, BIT(5) is Burst-Mode Enable (BME), and allows
+ * access to the extended NV-SRAM by automatically incrementing the address
+ * register when it is read from or written to.
+ */
+#define RTC_CTRL_4A_VRT2       BIT(7)  /* Auxiliary Battery Status */
+#define RTC_CTRL_4A_INCR       BIT(6)  /* Increment-in-Progress Status */
+#define RTC_CTRL_4A_PAB                BIT(3)  /* Power-Active Bar Control */
+#define RTC_CTRL_4A_RF         BIT(2)  /* RAM-Clear Flag */
+#define RTC_CTRL_4A_WF         BIT(1)  /* Wake-Up Alarm Flag */
+#define RTC_CTRL_4A_KF         BIT(0)  /* Kickstart Flag */
+#if !defined(CONFIG_RTC_DRV_DS1685) && !defined(CONFIG_RTC_DRV_DS1689)
+#define RTC_CTRL_4A_BME                BIT(5)  /* Burst-Mode Enable */
+#endif
+#define RTC_CTRL_4A_RWK_MASK   0x07    /* RF + WF + KF */
+
+
+/*
+ * Bit names in Extended Control Register 4B.
+ */
+#define RTC_CTRL_4B_ABE                BIT(7)  /* Auxiliary Battery Enable */
+#define RTC_CTRL_4B_E32K       BIT(6)  /* Enable 32.768kHz on SQW Pin */
+#define RTC_CTRL_4B_CS         BIT(5)  /* Crystal Select */
+#define RTC_CTRL_4B_RCE                BIT(4)  /* RAM Clear-Enable */
+#define RTC_CTRL_4B_PRS                BIT(3)  /* PAB Reset-Select */
+#define RTC_CTRL_4B_RIE                BIT(2)  /* RAM Clear-Interrupt Enable */
+#define RTC_CTRL_4B_WIE                BIT(1)  /* Wake-Up Alarm-Interrupt Enable */
+#define RTC_CTRL_4B_KSE                BIT(0)  /* Kickstart Interrupt-Enable */
+#define RTC_CTRL_4B_RWK_MASK   0x07    /* RIE + WIE + KSE */
+
+
+/*
+ * Misc register names in Bank 1.
+ *
+ * The DV0 bit in Control Register A must be set to 1 for these registers
+ * to become available, including Extended Control Registers 4A & 4B.
+ */
+#define RTC_BANK1_SSN_MODEL    0x40    /* Model Number */
+#define RTC_BANK1_SSN_BYTE_1   0x41    /* 1st Byte of Serial Number */
+#define RTC_BANK1_SSN_BYTE_2   0x42    /* 2nd Byte of Serial Number */
+#define RTC_BANK1_SSN_BYTE_3   0x43    /* 3rd Byte of Serial Number */
+#define RTC_BANK1_SSN_BYTE_4   0x44    /* 4th Byte of Serial Number */
+#define RTC_BANK1_SSN_BYTE_5   0x45    /* 5th Byte of Serial Number */
+#define RTC_BANK1_SSN_BYTE_6   0x46    /* 6th Byte of Serial Number */
+#define RTC_BANK1_SSN_CRC      0x47    /* Serial CRC Byte */
+#define RTC_BANK1_RAM_DATA_PORT        0x53    /* Extended RAM Data Port */
+
+
+/*
+ * Model-specific registers in Bank 1.
+ *
+ * The addresses below differ depending on the model of the RTC chip
+ * selected in the kernel configuration.  Not all of these features are
+ * supported in the main driver at present.
+ *
+ * DS1685/DS1687   - Extended NV-SRAM address (LSB only).
+ * DS1689/DS1693   - Vcc, Vbat, Pwr Cycle Counters & Customer-specific S/N.
+ * DS17x85/DS17x87 - Extended NV-SRAM addresses (MSB & LSB) & Write counter.
+ */
+#if defined(CONFIG_RTC_DRV_DS1685)
+#define RTC_BANK1_RAM_ADDR     0x50    /* NV-SRAM Addr */
+#elif defined(CONFIG_RTC_DRV_DS1689)
+#define RTC_BANK1_VCC_CTR_LSB  0x54    /* Vcc Counter Addr (LSB) */
+#define RTC_BANK1_VCC_CTR_MSB  0x57    /* Vcc Counter Addr (MSB) */
+#define RTC_BANK1_VBAT_CTR_LSB 0x58    /* Vbat Counter Addr (LSB) */
+#define RTC_BANK1_VBAT_CTR_MSB 0x5b    /* Vbat Counter Addr (MSB) */
+#define RTC_BANK1_PWR_CTR_LSB  0x5c    /* Pwr Cycle Counter Addr (LSB) */
+#define RTC_BANK1_PWR_CTR_MSB  0x5d    /* Pwr Cycle Counter Addr (MSB) */
+#define RTC_BANK1_UNIQ_SN      0x60    /* Customer-specific S/N */
+#else /* DS17x85/DS17x87 */
+#define RTC_BANK1_RAM_ADDR_LSB 0x50    /* NV-SRAM Addr (LSB) */
+#define RTC_BANK1_RAM_ADDR_MSB 0x51    /* NV-SRAM Addr (MSB) */
+#define RTC_BANK1_WRITE_CTR    0x5e    /* RTC Write Counter */
+#endif
+
+
+/*
+ * Model numbers.
+ *
+ * The DS1688/DS1691 and DS1689/DS1693 chips share the same model number
+ * and the manual doesn't indicate any major differences.  As such, they
+ * are regarded as the same chip in this driver.
+ */
+#define RTC_MODEL_DS1685       0x71    /* DS1685/DS1687 */
+#define RTC_MODEL_DS17285      0x72    /* DS17285/DS17287 */
+#define RTC_MODEL_DS1689       0x73    /* DS1688/DS1691/DS1689/DS1693 */
+#define RTC_MODEL_DS17485      0x74    /* DS17485/DS17487 */
+#define RTC_MODEL_DS17885      0x78    /* DS17885/DS17887 */
+
+
+/*
+ * Periodic Interrupt Rates / Square-Wave Output Frequency
+ *
+ * Periodic rates are selected by setting the RS3-RS0 bits in Control
+ * Register A and enabled via either the E32K bit in Extended Control
+ * Register 4B or the SQWE bit in Control Register B.
+ *
+ * E32K overrides the settings of RS3-RS0 and outputs a frequency of 32768Hz
+ * on the SQW pin of the RTC chip.  While there are 16 possible selections,
+ * the 1-of-16 decoder is only able to divide the base 32768Hz signal into 13
+ * smaller frequencies.  The values 0x01 and 0x02 are not used and are
+ * synonymous with 0x08 and 0x09, respectively.
+ *
+ * When E32K is set to a logic 1, periodic interrupts are disabled and reading
+ * /dev/rtc will return -EINVAL.  This also applies if the periodic interrupt
+ * frequency is set to 0Hz.
+ *
+ * Not currently used by the rtc-ds1685 driver because the RTC core removed
+ * support for hardware-generated periodic-interrupts in favour of
+ * hrtimer-generated interrupts.  But these defines are kept around for use
+ * in userland, as documentation to the hardware, and possible future use if
+ * hardware-generated periodic interrupts are ever added back.
+ */
+                                       /* E32K RS3 RS2 RS1 RS0 */
+#define RTC_SQW_8192HZ         0x03    /*  0    0   0   1   1  */
+#define RTC_SQW_4096HZ         0x04    /*  0    0   1   0   0  */
+#define RTC_SQW_2048HZ         0x05    /*  0    0   1   0   1  */
+#define RTC_SQW_1024HZ         0x06    /*  0    0   1   1   0  */
+#define RTC_SQW_512HZ          0x07    /*  0    0   1   1   1  */
+#define RTC_SQW_256HZ          0x08    /*  0    1   0   0   0  */
+#define RTC_SQW_128HZ          0x09    /*  0    1   0   0   1  */
+#define RTC_SQW_64HZ           0x0a    /*  0    1   0   1   0  */
+#define RTC_SQW_32HZ           0x0b    /*  0    1   0   1   1  */
+#define RTC_SQW_16HZ           0x0c    /*  0    1   1   0   0  */
+#define RTC_SQW_8HZ            0x0d    /*  0    1   1   0   1  */
+#define RTC_SQW_4HZ            0x0e    /*  0    1   1   1   0  */
+#define RTC_SQW_2HZ            0x0f    /*  0    1   1   1   1  */
+#define RTC_SQW_0HZ            0x00    /*  0    0   0   0   0  */
+#define RTC_SQW_32768HZ                32768   /*  1    -   -   -   -  */
+#define RTC_MAX_USER_FREQ      8192
+
+
+/*
+ * NVRAM data & addresses:
+ *   - 50 bytes of NVRAM are available just past the clock registers.
+ *   - 64 additional bytes are available in Bank0.
+ *
+ * Extended, battery-backed NV-SRAM:
+ *   - DS1685/DS1687    - 128 bytes.
+ *   - DS1689/DS1693    - 0 bytes.
+ *   - DS17285/DS17287  - 2048 bytes.
+ *   - DS17485/DS17487  - 4096 bytes.
+ *   - DS17885/DS17887  - 8192 bytes.
+ */
+#define NVRAM_TIME_BASE                0x0e    /* NVRAM Addr in Time regs */
+#define NVRAM_BANK0_BASE       0x40    /* NVRAM Addr in Bank0 regs */
+#define NVRAM_SZ_TIME          50
+#define NVRAM_SZ_BANK0         64
+#if defined(CONFIG_RTC_DRV_DS1685)
+#  define NVRAM_SZ_EXTND       128
+#elif defined(CONFIG_RTC_DRV_DS1689)
+#  define NVRAM_SZ_EXTND       0
+#elif defined(CONFIG_RTC_DRV_DS17285)
+#  define NVRAM_SZ_EXTND       2048
+#elif defined(CONFIG_RTC_DRV_DS17485)
+#  define NVRAM_SZ_EXTND       4096
+#elif defined(CONFIG_RTC_DRV_DS17885)
+#  define NVRAM_SZ_EXTND       8192
+#endif
+#define NVRAM_TOTAL_SZ_BANK0   (NVRAM_SZ_TIME + NVRAM_SZ_BANK0)
+#define NVRAM_TOTAL_SZ         (NVRAM_TOTAL_SZ_BANK0 + NVRAM_SZ_EXTND)
+
+
+/*
+ * Function Prototypes.
+ */
+extern void __noreturn
+ds1685_rtc_poweroff(struct platform_device *pdev);
+
+#endif /* _LINUX_RTC_DS1685_H_ */
index cd177caf38761a2e7301d1ab71cc2fe62df98d6f..cb9758e0ba0cd42d43cb8e7d6f86810006b80265 100644 (file)
@@ -14,7 +14,7 @@ config BITREVERSE
        tristate
 
 config HAVE_ARCH_BITREVERSE
-       boolean
+       bool
        default n
        depends on BITREVERSE
        help
index 088c68e9ec3500c667c6eecc6c37b558aa71ba55..3c1caa2693bd22bad68864896c5e02737aac31c5 100644 (file)
@@ -55,7 +55,6 @@ obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
 obj-$(CONFIG_KASAN)    += kasan/
 obj-$(CONFIG_FAILSLAB) += failslab.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
-obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
index fac23ecf8d7229eb0de2a2292f44e4cfcbe88c6f..4a3907cf79f89ac934331006b05636da20d8f319 100644 (file)
@@ -28,6 +28,7 @@
 SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
 {
        struct fd f = fdget(fd);
+       struct inode *inode;
        struct address_space *mapping;
        struct backing_dev_info *bdi;
        loff_t endbyte;                 /* inclusive */
@@ -39,7 +40,8 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
        if (!f.file)
                return -EBADF;
 
-       if (S_ISFIFO(file_inode(f.file)->i_mode)) {
+       inode = file_inode(f.file);
+       if (S_ISFIFO(inode->i_mode)) {
                ret = -ESPIPE;
                goto out;
        }
@@ -50,7 +52,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
                goto out;
        }
 
-       if (mapping->a_ops->get_xip_mem) {
+       if (IS_DAX(inode)) {
                switch (advice) {
                case POSIX_FADV_NORMAL:
                case POSIX_FADV_RANDOM:
index d9f5336552d7b12cad62315a2c512a9ca922bf55..ad7242043bdb8b74872e536b61d01ca05a1de6b3 100644 (file)
@@ -1695,8 +1695,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
        loff_t *ppos = &iocb->ki_pos;
        loff_t pos = *ppos;
 
-       /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
-       if (file->f_flags & O_DIRECT) {
+       if (io_is_direct(file)) {
                struct address_space *mapping = file->f_mapping;
                struct inode *inode = mapping->host;
                size_t count = iov_iter_count(iter);
@@ -1723,9 +1722,11 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
                 * we've already read everything we wanted to, or if
                 * there was a short read because we hit EOF, go ahead
                 * and return.  Otherwise fallthrough to buffered io for
-                * the rest of the read.
+                * the rest of the read.  Buffered reads will not work for
+                * DAX files, so don't bother trying.
                 */
-               if (retval < 0 || !iov_iter_count(iter) || *ppos >= size) {
+               if (retval < 0 || !iov_iter_count(iter) || *ppos >= size ||
+                   IS_DAX(inode)) {
                        file_accessed(file);
                        goto out;
                }
@@ -2582,18 +2583,20 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
        if (err)
                goto out;
 
-       /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
-       if (unlikely(file->f_flags & O_DIRECT)) {
+       if (io_is_direct(file)) {
                loff_t endbyte;
 
                written = generic_file_direct_write(iocb, from, pos);
-               if (written < 0 || written == count)
-                       goto out;
-
                /*
-                * direct-io write to a hole: fall through to buffered I/O
-                * for completing the rest of the request.
+                * If the write stopped short of completing, fall back to
+                * buffered writes.  Some filesystems do this for writes to
+                * holes, for example.  For DAX files, a buffered write will
+                * not succeed (even if it did, DAX does not handle dirty
+                * page-cache pages correctly).
                 */
+               if (written < 0 || written == count || IS_DAX(inode))
+                       goto out;
+
                pos += written;
                count -= written;
 
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
deleted file mode 100644 (file)
index c175f9f..0000000
+++ /dev/null
@@ -1,478 +0,0 @@
-/*
- *     linux/mm/filemap_xip.c
- *
- * Copyright (C) 2005 IBM Corporation
- * Author: Carsten Otte <cotte@de.ibm.com>
- *
- * derived from linux/mm/filemap.c - Copyright (C) Linus Torvalds
- *
- */
-
-#include <linux/fs.h>
-#include <linux/backing-dev.h>
-#include <linux/pagemap.h>
-#include <linux/export.h>
-#include <linux/uio.h>
-#include <linux/rmap.h>
-#include <linux/mmu_notifier.h>
-#include <linux/sched.h>
-#include <linux/seqlock.h>
-#include <linux/mutex.h>
-#include <linux/gfp.h>
-#include <asm/tlbflush.h>
-#include <asm/io.h>
-
-/*
- * We do use our own empty page to avoid interference with other users
- * of ZERO_PAGE(), such as /dev/zero
- */
-static DEFINE_MUTEX(xip_sparse_mutex);
-static seqcount_t xip_sparse_seq = SEQCNT_ZERO(xip_sparse_seq);
-static struct page *__xip_sparse_page;
-
-/* called under xip_sparse_mutex */
-static struct page *xip_sparse_page(void)
-{
-       if (!__xip_sparse_page) {
-               struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO);
-
-               if (page)
-                       __xip_sparse_page = page;
-       }
-       return __xip_sparse_page;
-}
-
-/*
- * This is a file read routine for execute in place files, and uses
- * the mapping->a_ops->get_xip_mem() function for the actual low-level
- * stuff.
- *
- * Note the struct file* is not used at all.  It may be NULL.
- */
-static ssize_t
-do_xip_mapping_read(struct address_space *mapping,
-                   struct file_ra_state *_ra,
-                   struct file *filp,
-                   char __user *buf,
-                   size_t len,
-                   loff_t *ppos)
-{
-       struct inode *inode = mapping->host;
-       pgoff_t index, end_index;
-       unsigned long offset;
-       loff_t isize, pos;
-       size_t copied = 0, error = 0;
-
-       BUG_ON(!mapping->a_ops->get_xip_mem);
-
-       pos = *ppos;
-       index = pos >> PAGE_CACHE_SHIFT;
-       offset = pos & ~PAGE_CACHE_MASK;
-
-       isize = i_size_read(inode);
-       if (!isize)
-               goto out;
-
-       end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
-       do {
-               unsigned long nr, left;
-               void *xip_mem;
-               unsigned long xip_pfn;
-               int zero = 0;
-
-               /* nr is the maximum number of bytes to copy from this page */
-               nr = PAGE_CACHE_SIZE;
-               if (index >= end_index) {
-                       if (index > end_index)
-                               goto out;
-                       nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
-                       if (nr <= offset) {
-                               goto out;
-                       }
-               }
-               nr = nr - offset;
-               if (nr > len - copied)
-                       nr = len - copied;
-
-               error = mapping->a_ops->get_xip_mem(mapping, index, 0,
-                                                       &xip_mem, &xip_pfn);
-               if (unlikely(error)) {
-                       if (error == -ENODATA) {
-                               /* sparse */
-                               zero = 1;
-                       } else
-                               goto out;
-               }
-
-               /* If users can be writing to this page using arbitrary
-                * virtual addresses, take care about potential aliasing
-                * before reading the page on the kernel side.
-                */
-               if (mapping_writably_mapped(mapping))
-                       /* address based flush */ ;
-
-               /*
-                * Ok, we have the mem, so now we can copy it to user space...
-                *
-                * The actor routine returns how many bytes were actually used..
-                * NOTE! This may not be the same as how much of a user buffer
-                * we filled up (we may be padding etc), so we can only update
-                * "pos" here (the actor routine has to update the user buffer
-                * pointers and the remaining count).
-                */
-               if (!zero)
-                       left = __copy_to_user(buf+copied, xip_mem+offset, nr);
-               else
-                       left = __clear_user(buf + copied, nr);
-
-               if (left) {
-                       error = -EFAULT;
-                       goto out;
-               }
-
-               copied += (nr - left);
-               offset += (nr - left);
-               index += offset >> PAGE_CACHE_SHIFT;
-               offset &= ~PAGE_CACHE_MASK;
-       } while (copied < len);
-
-out:
-       *ppos = pos + copied;
-       if (filp)
-               file_accessed(filp);
-
-       return (copied ? copied : error);
-}
-
-ssize_t
-xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
-{
-       if (!access_ok(VERIFY_WRITE, buf, len))
-               return -EFAULT;
-
-       return do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp,
-                           buf, len, ppos);
-}
-EXPORT_SYMBOL_GPL(xip_file_read);
-
-/*
- * __xip_unmap is invoked from xip_unmap and xip_write
- *
- * This function walks all vmas of the address_space and unmaps the
- * __xip_sparse_page when found at pgoff.
- */
-static void __xip_unmap(struct address_space * mapping, unsigned long pgoff)
-{
-       struct vm_area_struct *vma;
-       struct page *page;
-       unsigned count;
-       int locked = 0;
-
-       count = read_seqcount_begin(&xip_sparse_seq);
-
-       page = __xip_sparse_page;
-       if (!page)
-               return;
-
-retry:
-       i_mmap_lock_read(mapping);
-       vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
-               pte_t *pte, pteval;
-               spinlock_t *ptl;
-               struct mm_struct *mm = vma->vm_mm;
-               unsigned long address = vma->vm_start +
-                       ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
-
-               BUG_ON(address < vma->vm_start || address >= vma->vm_end);
-               pte = page_check_address(page, mm, address, &ptl, 1);
-               if (pte) {
-                       /* Nuke the page table entry. */
-                       flush_cache_page(vma, address, pte_pfn(*pte));
-                       pteval = ptep_clear_flush(vma, address, pte);
-                       page_remove_rmap(page);
-                       dec_mm_counter(mm, MM_FILEPAGES);
-                       BUG_ON(pte_dirty(pteval));
-                       pte_unmap_unlock(pte, ptl);
-                       /* must invalidate_page _before_ freeing the page */
-                       mmu_notifier_invalidate_page(mm, address);
-                       page_cache_release(page);
-               }
-       }
-       i_mmap_unlock_read(mapping);
-
-       if (locked) {
-               mutex_unlock(&xip_sparse_mutex);
-       } else if (read_seqcount_retry(&xip_sparse_seq, count)) {
-               mutex_lock(&xip_sparse_mutex);
-               locked = 1;
-               goto retry;
-       }
-}
-
-/*
- * xip_fault() is invoked via the vma operations vector for a
- * mapped memory region to read in file data during a page fault.
- *
- * This function is derived from filemap_fault, but used for execute in place
- */
-static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
-       struct file *file = vma->vm_file;
-       struct address_space *mapping = file->f_mapping;
-       struct inode *inode = mapping->host;
-       pgoff_t size;
-       void *xip_mem;
-       unsigned long xip_pfn;
-       struct page *page;
-       int error;
-
-       /* XXX: are VM_FAULT_ codes OK? */
-again:
-       size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-       if (vmf->pgoff >= size)
-               return VM_FAULT_SIGBUS;
-
-       error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0,
-                                               &xip_mem, &xip_pfn);
-       if (likely(!error))
-               goto found;
-       if (error != -ENODATA)
-               return VM_FAULT_OOM;
-
-       /* sparse block */
-       if ((vma->vm_flags & (VM_WRITE | VM_MAYWRITE)) &&
-           (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) &&
-           (!(mapping->host->i_sb->s_flags & MS_RDONLY))) {
-               int err;
-
-               /* maybe shared writable, allocate new block */
-               mutex_lock(&xip_sparse_mutex);
-               error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1,
-                                                       &xip_mem, &xip_pfn);
-               mutex_unlock(&xip_sparse_mutex);
-               if (error)
-                       return VM_FAULT_SIGBUS;
-               /* unmap sparse mappings at pgoff from all other vmas */
-               __xip_unmap(mapping, vmf->pgoff);
-
-found:
-               err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address,
-                                                       xip_pfn);
-               if (err == -ENOMEM)
-                       return VM_FAULT_OOM;
-               /*
-                * err == -EBUSY is fine, we've raced against another thread
-                * that faulted-in the same page
-                */
-               if (err != -EBUSY)
-                       BUG_ON(err);
-               return VM_FAULT_NOPAGE;
-       } else {
-               int err, ret = VM_FAULT_OOM;
-
-               mutex_lock(&xip_sparse_mutex);
-               write_seqcount_begin(&xip_sparse_seq);
-               error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0,
-                                                       &xip_mem, &xip_pfn);
-               if (unlikely(!error)) {
-                       write_seqcount_end(&xip_sparse_seq);
-                       mutex_unlock(&xip_sparse_mutex);
-                       goto again;
-               }
-               if (error != -ENODATA)
-                       goto out;
-               /* not shared and writable, use xip_sparse_page() */
-               page = xip_sparse_page();
-               if (!page)
-                       goto out;
-               err = vm_insert_page(vma, (unsigned long)vmf->virtual_address,
-                                                       page);
-               if (err == -ENOMEM)
-                       goto out;
-
-               ret = VM_FAULT_NOPAGE;
-out:
-               write_seqcount_end(&xip_sparse_seq);
-               mutex_unlock(&xip_sparse_mutex);
-
-               return ret;
-       }
-}
-
-static const struct vm_operations_struct xip_file_vm_ops = {
-       .fault  = xip_file_fault,
-       .page_mkwrite   = filemap_page_mkwrite,
-};
-
-int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
-{
-       BUG_ON(!file->f_mapping->a_ops->get_xip_mem);
-
-       file_accessed(file);
-       vma->vm_ops = &xip_file_vm_ops;
-       vma->vm_flags |= VM_MIXEDMAP;
-       return 0;
-}
-EXPORT_SYMBOL_GPL(xip_file_mmap);
-
-static ssize_t
-__xip_file_write(struct file *filp, const char __user *buf,
-                 size_t count, loff_t pos, loff_t *ppos)
-{
-       struct address_space * mapping = filp->f_mapping;
-       const struct address_space_operations *a_ops = mapping->a_ops;
-       struct inode    *inode = mapping->host;
-       long            status = 0;
-       size_t          bytes;
-       ssize_t         written = 0;
-
-       BUG_ON(!mapping->a_ops->get_xip_mem);
-
-       do {
-               unsigned long index;
-               unsigned long offset;
-               size_t copied;
-               void *xip_mem;
-               unsigned long xip_pfn;
-
-               offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
-               index = pos >> PAGE_CACHE_SHIFT;
-               bytes = PAGE_CACHE_SIZE - offset;
-               if (bytes > count)
-                       bytes = count;
-
-               status = a_ops->get_xip_mem(mapping, index, 0,
-                                               &xip_mem, &xip_pfn);
-               if (status == -ENODATA) {
-                       /* we allocate a new page unmap it */
-                       mutex_lock(&xip_sparse_mutex);
-                       status = a_ops->get_xip_mem(mapping, index, 1,
-                                                       &xip_mem, &xip_pfn);
-                       mutex_unlock(&xip_sparse_mutex);
-                       if (!status)
-                               /* unmap page at pgoff from all other vmas */
-                               __xip_unmap(mapping, index);
-               }
-
-               if (status)
-                       break;
-
-               copied = bytes -
-                       __copy_from_user_nocache(xip_mem + offset, buf, bytes);
-
-               if (likely(copied > 0)) {
-                       status = copied;
-
-                       if (status >= 0) {
-                               written += status;
-                               count -= status;
-                               pos += status;
-                               buf += status;
-                       }
-               }
-               if (unlikely(copied != bytes))
-                       if (status >= 0)
-                               status = -EFAULT;
-               if (status < 0)
-                       break;
-       } while (count);
-       *ppos = pos;
-       /*
-        * No need to use i_size_read() here, the i_size
-        * cannot change under us because we hold i_mutex.
-        */
-       if (pos > inode->i_size) {
-               i_size_write(inode, pos);
-               mark_inode_dirty(inode);
-       }
-
-       return written ? written : status;
-}
-
-ssize_t
-xip_file_write(struct file *filp, const char __user *buf, size_t len,
-              loff_t *ppos)
-{
-       struct address_space *mapping = filp->f_mapping;
-       struct inode *inode = mapping->host;
-       size_t count;
-       loff_t pos;
-       ssize_t ret;
-
-       mutex_lock(&inode->i_mutex);
-
-       if (!access_ok(VERIFY_READ, buf, len)) {
-               ret=-EFAULT;
-               goto out_up;
-       }
-
-       pos = *ppos;
-       count = len;
-
-       /* We can write back this queue in page reclaim */
-       current->backing_dev_info = inode_to_bdi(inode);
-
-       ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode));
-       if (ret)
-               goto out_backing;
-       if (count == 0)
-               goto out_backing;
-
-       ret = file_remove_suid(filp);
-       if (ret)
-               goto out_backing;
-
-       ret = file_update_time(filp);
-       if (ret)
-               goto out_backing;
-
-       ret = __xip_file_write (filp, buf, count, pos, ppos);
-
- out_backing:
-       current->backing_dev_info = NULL;
- out_up:
-       mutex_unlock(&inode->i_mutex);
-       return ret;
-}
-EXPORT_SYMBOL_GPL(xip_file_write);
-
-/*
- * truncate a page used for execute in place
- * functionality is analog to block_truncate_page but does use get_xip_mem
- * to get the page instead of page cache
- */
-int
-xip_truncate_page(struct address_space *mapping, loff_t from)
-{
-       pgoff_t index = from >> PAGE_CACHE_SHIFT;
-       unsigned offset = from & (PAGE_CACHE_SIZE-1);
-       unsigned blocksize;
-       unsigned length;
-       void *xip_mem;
-       unsigned long xip_pfn;
-       int err;
-
-       BUG_ON(!mapping->a_ops->get_xip_mem);
-
-       blocksize = 1 << mapping->host->i_blkbits;
-       length = offset & (blocksize - 1);
-
-       /* Block boundary? Nothing to do */
-       if (!length)
-               return 0;
-
-       length = blocksize - length;
-
-       err = mapping->a_ops->get_xip_mem(mapping, index, 0,
-                                               &xip_mem, &xip_pfn);
-       if (unlikely(err)) {
-               if (err == -ENODATA)
-                       /* Hole? No need to truncate */
-                       return 0;
-               else
-                       return err;
-       }
-       memset(xip_mem + offset, 0, length);
-       return 0;
-}
-EXPORT_SYMBOL_GPL(xip_truncate_page);
index 1077cbdc8b5207a6b407f3b4d97f4f5e99862065..d551475517bfd8867dca18ea23657216a0d2364a 100644 (file)
@@ -239,7 +239,7 @@ static long madvise_willneed(struct vm_area_struct *vma,
                return -EBADF;
 #endif
 
-       if (file->f_mapping->a_ops->get_xip_mem) {
+       if (IS_DAX(file_inode(file))) {
                /* no bad return value, but ignore advice */
                return 0;
        }
index 99275325f303681230f88372d4e4ef99aa576dcc..8068893697bbdbb5d64f9d43508658d601a6932e 100644 (file)
@@ -1965,6 +1965,7 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
        vmf.pgoff = page->index;
        vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
        vmf.page = page;
+       vmf.cow_page = NULL;
 
        ret = vma->vm_ops->page_mkwrite(vma, &vmf);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
@@ -2329,6 +2330,7 @@ void unmap_mapping_range(struct address_space *mapping,
                details.last_index = ULONG_MAX;
 
 
+       /* DAX uses i_mmap_lock to serialise file truncate vs page fault */
        i_mmap_lock_write(mapping);
        if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
                unmap_mapping_range_tree(&mapping->i_mmap, &details);
@@ -2638,7 +2640,8 @@ oom:
  * See filemap_fault() and __lock_page_retry().
  */
 static int __do_fault(struct vm_area_struct *vma, unsigned long address,
-               pgoff_t pgoff, unsigned int flags, struct page **page)
+                       pgoff_t pgoff, unsigned int flags,
+                       struct page *cow_page, struct page **page)
 {
        struct vm_fault vmf;
        int ret;
@@ -2647,10 +2650,13 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
        vmf.pgoff = pgoff;
        vmf.flags = flags;
        vmf.page = NULL;
+       vmf.cow_page = cow_page;
 
        ret = vma->vm_ops->fault(vma, &vmf);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
                return ret;
+       if (!vmf.page)
+               goto out;
 
        if (unlikely(PageHWPoison(vmf.page))) {
                if (ret & VM_FAULT_LOCKED)
@@ -2664,6 +2670,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
        else
                VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);
 
+ out:
        *page = vmf.page;
        return ret;
 }
@@ -2834,7 +2841,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                pte_unmap_unlock(pte, ptl);
        }
 
-       ret = __do_fault(vma, address, pgoff, flags, &fault_page);
+       ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
                return ret;
 
@@ -2874,26 +2881,43 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                return VM_FAULT_OOM;
        }
 
-       ret = __do_fault(vma, address, pgoff, flags, &fault_page);
+       ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
                goto uncharge_out;
 
-       copy_user_highpage(new_page, fault_page, address, vma);
+       if (fault_page)
+               copy_user_highpage(new_page, fault_page, address, vma);
        __SetPageUptodate(new_page);
 
        pte = pte_offset_map_lock(mm, pmd, address, &ptl);
        if (unlikely(!pte_same(*pte, orig_pte))) {
                pte_unmap_unlock(pte, ptl);
-               unlock_page(fault_page);
-               page_cache_release(fault_page);
+               if (fault_page) {
+                       unlock_page(fault_page);
+                       page_cache_release(fault_page);
+               } else {
+                       /*
+                        * The fault handler has no page to lock, so it holds
+                        * i_mmap_lock for read to protect against truncate.
+                        */
+                       i_mmap_unlock_read(vma->vm_file->f_mapping);
+               }
                goto uncharge_out;
        }
        do_set_pte(vma, address, new_page, pte, true, true);
        mem_cgroup_commit_charge(new_page, memcg, false);
        lru_cache_add_active_or_unevictable(new_page, vma);
        pte_unmap_unlock(pte, ptl);
-       unlock_page(fault_page);
-       page_cache_release(fault_page);
+       if (fault_page) {
+               unlock_page(fault_page);
+               page_cache_release(fault_page);
+       } else {
+               /*
+                * The fault handler has no page to lock, so it holds
+                * i_mmap_lock for read to protect against truncate.
+                */
+               i_mmap_unlock_read(vma->vm_file->f_mapping);
+       }
        return ret;
 uncharge_out:
        mem_cgroup_cancel_charge(new_page, memcg);
@@ -2912,7 +2936,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        int dirtied = 0;
        int ret, tmp;
 
-       ret = __do_fault(vma, address, pgoff, flags, &fault_page);
+       ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
                return ret;
 
index 6d672836e18722c38cce43149587db32882e6316..0db267d0adc9231de508b4cf9ca7068df8362384 100755 (executable)
@@ -28,7 +28,6 @@ If no config files are specified, .config and .config.old are used.
 Example usage:
  $ diffconfig .config config-with-some-changes
 -EXT2_FS_XATTR  n
--EXT2_FS_XIP  n
  CRAMFS  n -> y
  EXT2_FS  y -> n
  LOG_BUF_SHIFT  14 -> 16