Merge branch 'akpm' (patches from Andrew)
author    Linus Torvalds <torvalds@linux-foundation.org>
          Sat, 25 Feb 2017 18:29:09 +0000 (10:29 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Sat, 25 Feb 2017 18:29:09 +0000 (10:29 -0800)
Merge more updates from Andrew Morton:

 - almost all of the rest of MM

 - misc bits

 - KASAN updates

 - procfs

 - lib/ updates

 - checkpatch updates

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (124 commits)
  checkpatch: remove false unbalanced braces warning
  checkpatch: notice unbalanced else braces in a patch
  checkpatch: add another old address for the FSF
  checkpatch: update $logFunctions
  checkpatch: warn on logging continuations
  checkpatch: warn on embedded function names
  lib/lz4: remove back-compat wrappers
  fs/pstore: fs/squashfs: change usage of LZ4 to work with new LZ4 version
  crypto: change LZ4 modules to work with new LZ4 module version
  lib/decompress_unlz4: change module to work with new LZ4 module version
  lib: update LZ4 compressor module
  lib/test_sort.c: make it explicitly non-modular
  lib: add CONFIG_TEST_SORT to enable self-test of sort()
  rbtree: use designated initializers
  linux/kernel.h: fix DIV_ROUND_CLOSEST to support negative divisors
  lib/find_bit.c: micro-optimise find_next_*_bit
  lib: add module support to atomic64 tests
  lib: add module support to glob tests
  lib: add module support to crc32 tests
  kernel/ksysfs.c: add __ro_after_init to bin_attribute structure
  ...

97 files changed:
Documentation/00-INDEX
Documentation/admin-guide/md.rst
Documentation/md-cluster.txt [deleted file]
Documentation/md/md-cluster.txt [new file with mode: 0644]
Documentation/md/raid5-cache.txt [new file with mode: 0644]
Documentation/sparc/console.txt [new file with mode: 0644]
MAINTAINERS
arch/openrisc/Kconfig
arch/openrisc/TODO.openrisc
arch/openrisc/include/asm/Kbuild
arch/openrisc/include/asm/atomic.h [new file with mode: 0644]
arch/openrisc/include/asm/bitops.h
arch/openrisc/include/asm/bitops/atomic.h [new file with mode: 0644]
arch/openrisc/include/asm/cmpxchg.h [new file with mode: 0644]
arch/openrisc/include/asm/cpuinfo.h
arch/openrisc/include/asm/futex.h [new file with mode: 0644]
arch/openrisc/include/asm/spr_defs.h
arch/openrisc/include/asm/string.h [new file with mode: 0644]
arch/openrisc/kernel/.gitignore [new file with mode: 0644]
arch/openrisc/kernel/entry.S
arch/openrisc/kernel/head.S
arch/openrisc/kernel/or32_ksyms.c
arch/openrisc/kernel/process.c
arch/openrisc/kernel/ptrace.c
arch/openrisc/kernel/setup.c
arch/openrisc/kernel/traps.c
arch/openrisc/lib/Makefile
arch/openrisc/lib/memcpy.c [new file with mode: 0644]
arch/openrisc/lib/memset.S [new file with mode: 0644]
arch/openrisc/mm/ioremap.c
arch/sparc/include/asm/page_64.h
arch/sparc/include/asm/pgtable_64.h
arch/sparc/include/asm/setup.h
arch/sparc/include/asm/tlbflush_64.h
arch/sparc/include/asm/topology_64.h
arch/sparc/include/asm/uprobes.h
arch/sparc/kernel/smp_64.c
arch/sparc/kernel/tsb.S
arch/sparc/mm/hugetlbpage.c
arch/sparc/mm/init_64.c
arch/sparc/mm/srmmu.c
arch/sparc/mm/tlb.c
arch/sparc/mm/tsb.c
block/Kconfig.iosched
block/bio.c
block/blk-mq-sched.c
block/blk-mq-sched.h
block/blk-mq.c
block/elevator.c
block/genhd.c
block/sed-opal.c
drivers/block/cciss_scsi.c
drivers/block/nbd.c
drivers/block/sunvdc.c
drivers/md/dm-rq.c
drivers/md/faulty.c
drivers/md/linear.c
drivers/md/linear.h
drivers/md/md.c
drivers/md/md.h
drivers/md/multipath.c
drivers/md/raid0.c
drivers/md/raid1.c
drivers/md/raid1.h
drivers/md/raid10.c
drivers/md/raid5-cache.c
drivers/md/raid5.c
drivers/md/raid5.h
drivers/net/wireless/mac80211_hwsim.c
drivers/nvme/host/core.c
drivers/nvme/host/fabrics.c
drivers/nvme/host/fabrics.h
drivers/nvme/host/fc.c
drivers/nvme/host/nvme.h
drivers/nvme/host/pci.c
drivers/nvme/host/rdma.c
drivers/nvme/target/admin-cmd.c
drivers/nvme/target/core.c
drivers/nvme/target/discovery.c
drivers/nvme/target/fabrics-cmd.c
drivers/nvme/target/fc.c
drivers/nvme/target/loop.c
drivers/nvme/target/nvmet.h
drivers/nvme/target/rdma.c
drivers/scsi/scsi_lib.c
drivers/scsi/scsi_transport_sas.c
drivers/tty/serial/sunhv.c
fs/block_dev.c
include/asm-generic/atomic.h
include/linux/bio.h
include/linux/blk-mq.h
include/linux/nvme-rdma.h
include/linux/nvme.h
include/linux/sed-opal.h
kernel/panic.c
lib/radix-tree.c
scripts/checkstack.pl

index c8a8eb1a2b119c064f038559fa67f6511a31bce6..793acf999e9eac87057af3214ca1f98ad65b922f 100644 (file)
@@ -270,8 +270,8 @@ m68k/
        - directory with info about Linux on Motorola 68k architecture.
 mailbox.txt
        - How to write drivers for the common mailbox framework (IPC).
-md-cluster.txt
-       - info on shared-device RAID MD cluster.
+md/
+       - directory with info about Linux Software RAID
 media/
        - info on media drivers: uAPI, kAPI and driver documentation.
 memory-barriers.txt
index e449fb5f277c25b9b31800561f73a2d2e0d63593..1e61bf50595c84c936cfe8788bb37114a6f7d5f0 100644 (file)
@@ -725,3 +725,8 @@ These currently include:
       to 1.  Setting this to 0 disables bypass accounting and
       requires preread stripes to wait until all full-width stripe-
       writes are complete.  Valid values are 0 to stripe_cache_size.
+
+  journal_mode (currently raid5 only)
+      The cache mode for raid5. raid5 can include an extra disk for
+      caching. The mode can be "write-through" or "write-back". The
+      default is "write-through".
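
For illustration, a minimal userspace sketch of driving this attribute; the
device name /dev/md0 and the presence of a journal device are assumptions,
not part of this patch:

        /*
         * Sketch only: assumes the array is /dev/md0 and that it was created
         * with a journal device (mdadm --write-journal).
         */
        #include <stdio.h>

        int main(void)
        {
                const char *attr = "/sys/block/md0/md/journal_mode";
                FILE *f = fopen(attr, "w");

                if (!f) {
                        perror(attr);
                        return 1;
                }
                fputs("write-back\n", f);  /* or "write-through" to switch back */
                return fclose(f) ? 1 : 0;
        }
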
diff --git a/Documentation/md-cluster.txt b/Documentation/md-cluster.txt
deleted file mode 100644 (file)
index 3888327..0000000
+++ /dev/null
@@ -1,324 +0,0 @@
-The cluster MD is a shared-device RAID for a cluster.
-
-
-1. On-disk format
-
-Separate write-intent-bitmaps are used for each cluster node.
-The bitmaps record all writes that may have been started on that node,
-and may not yet have finished. The on-disk layout is:
-
-0                    4k                     8k                    12k
--------------------------------------------------------------------
-| idle                | md super            | bm super [0] + bits |
-| bm bits[0, contd]   | bm super[1] + bits  | bm bits[1, contd]   |
-| bm super[2] + bits  | bm bits [2, contd]  | bm super[3] + bits  |
-| bm bits [3, contd]  |                     |                     |
-
-During "normal" functioning we assume the filesystem ensures that only
-one node writes to any given block at a time, so a write request will
-
- - set the appropriate bit (if not already set)
- - commit the write to all mirrors
- - schedule the bit to be cleared after a timeout.
-
-Reads are just handled normally. It is up to the filesystem to ensure
-one node doesn't read from a location where another node (or the same
-node) is writing.
-
-
-2. DLM Locks for management
-
-There are three groups of locks for managing the device:
-
-2.1 Bitmap lock resource (bm_lockres)
-
- The bm_lockres protects individual node bitmaps. They are named in
- the form bitmap000 for node 1, bitmap001 for node 2 and so on. When a
- node joins the cluster, it acquires the lock in PW mode and it stays
- so during the lifetime the node is part of the cluster. The lock
- resource number is based on the slot number returned by the DLM
- subsystem. Since DLM starts node count from one and bitmap slots
- start from zero, one is subtracted from the DLM slot number to arrive
- at the bitmap slot number.
-
- The LVB of the bitmap lock for a particular node records the range
- of sectors that are being re-synced by that node.  No other
- node may write to those sectors.  This is used when a new nodes
- joins the cluster.
-
-2.2 Message passing locks
-
- Each node has to communicate with other nodes when starting or ending
- resync, and for metadata superblock updates.  This communication is
- managed through three locks: "token", "message", and "ack", together
- with the Lock Value Block (LVB) of one of the "message" lock.
-
-2.3 new-device management
-
- A single lock: "no-new-dev" is used to co-ordinate the addition of
- new devices - this must be synchronized across the array.
- Normally all nodes hold a concurrent-read lock on this device.
-
-3. Communication
-
- Messages can be broadcast to all nodes, and the sender waits for all
- other nodes to acknowledge the message before proceeding.  Only one
- message can be processed at a time.
-
-3.1 Message Types
-
- There are six types of messages which are passed:
-
- 3.1.1 METADATA_UPDATED: informs other nodes that the metadata has
-   been updated, and the node must re-read the md superblock. This is
-   performed synchronously. It is primarily used to signal device
-   failure.
-
- 3.1.2 RESYNCING: informs other nodes that a resync is initiated or
-   ended so that each node may suspend or resume the region.  Each
-   RESYNCING message identifies a range of the devices that the
-   sending node is about to resync. This over-rides any pervious
-   notification from that node: only one ranged can be resynced at a
-   time per-node.
-
- 3.1.3 NEWDISK: informs other nodes that a device is being added to
-   the array. Message contains an identifier for that device.  See
-   below for further details.
-
- 3.1.4 REMOVE: A failed or spare device is being removed from the
-   array. The slot-number of the device is included in the message.
-
- 3.1.5 RE_ADD: A failed device is being re-activated - the assumption
-   is that it has been determined to be working again.
-
- 3.1.6 BITMAP_NEEDS_SYNC: if a node is stopped locally but the bitmap
-   isn't clean, then another node is informed to take the ownership of
-   resync.
-
-3.2 Communication mechanism
-
- The DLM LVB is used to communicate within nodes of the cluster. There
- are three resources used for the purpose:
-
-  3.2.1 token: The resource which protects the entire communication
-   system. The node having the token resource is allowed to
-   communicate.
-
-  3.2.2 message: The lock resource which carries the data to
-   communicate.
-
-  3.2.3 ack: The resource, acquiring which means the message has been
-   acknowledged by all nodes in the cluster. The BAST of the resource
-   is used to inform the receiving node that a node wants to
-   communicate.
-
-The algorithm is:
-
- 1. receive status - all nodes have concurrent-reader lock on "ack".
-
-   sender                         receiver                 receiver
-   "ack":CR                       "ack":CR                 "ack":CR
-
- 2. sender get EX on "token"
-    sender get EX on "message"
-    sender                        receiver                 receiver
-    "token":EX                    "ack":CR                 "ack":CR
-    "message":EX
-    "ack":CR
-
-    Sender checks that it still needs to send a message. Messages
-    received or other events that happened while waiting for the
-    "token" may have made this message inappropriate or redundant.
-
- 3. sender writes LVB.
-    sender down-convert "message" from EX to CW
-    sender try to get EX of "ack"
-    [ wait until all receivers have *processed* the "message" ]
-
-                                     [ triggered by bast of "ack" ]
-                                     receiver get CR on "message"
-                                     receiver read LVB
-                                     receiver processes the message
-                                     [ wait finish ]
-                                     receiver releases "ack"
-                                     receiver tries to get PR on "message"
-
-   sender                         receiver                  receiver
-   "token":EX                     "message":CR              "message":CR
-   "message":CW
-   "ack":EX
-
- 4. triggered by grant of EX on "ack" (indicating all receivers
-    have processed message)
-    sender down-converts "ack" from EX to CR
-    sender releases "message"
-    sender releases "token"
-                               receiver upconvert to PR on "message"
-                               receiver get CR of "ack"
-                               receiver release "message"
-
-   sender                      receiver                   receiver
-   "ack":CR                    "ack":CR                   "ack":CR
-
-
-4. Handling Failures
-
-4.1 Node Failure
-
- When a node fails, the DLM informs the cluster with the slot
- number. The node starts a cluster recovery thread. The cluster
- recovery thread:
-
-       - acquires the bitmap<number> lock of the failed node
-       - opens the bitmap
-       - reads the bitmap of the failed node
-       - copies the set bitmap to local node
-       - cleans the bitmap of the failed node
-       - releases bitmap<number> lock of the failed node
-       - initiates resync of the bitmap on the current node
-               md_check_recovery is invoked within recover_bitmaps,
-               then md_check_recovery -> metadata_update_start/finish,
-               it will lock the communication by lock_comm.
-               Which means when one node is resyncing it blocks all
-               other nodes from writing anywhere on the array.
-
- The resync process is the regular md resync. However, in a clustered
- environment when a resync is performed, it needs to tell other nodes
- of the areas which are suspended. Before a resync starts, the node
- send out RESYNCING with the (lo,hi) range of the area which needs to
- be suspended. Each node maintains a suspend_list, which contains the
- list of ranges which are currently suspended. On receiving RESYNCING,
- the node adds the range to the suspend_list. Similarly, when the node
- performing resync finishes, it sends RESYNCING with an empty range to
- other nodes and other nodes remove the corresponding entry from the
- suspend_list.
-
- A helper function, ->area_resyncing() can be used to check if a
- particular I/O range should be suspended or not.
-
-4.2 Device Failure
-
- Device failures are handled and communicated with the metadata update
- routine.  When a node detects a device failure it does not allow
- any further writes to that device until the failure has been
- acknowledged by all other nodes.
-
-5. Adding a new Device
-
- For adding a new device, it is necessary that all nodes "see" the new
- device to be added. For this, the following algorithm is used:
-
-    1. Node 1 issues mdadm --manage /dev/mdX --add /dev/sdYY which issues
-       ioctl(ADD_NEW_DISK with disc.state set to MD_DISK_CLUSTER_ADD)
-    2. Node 1 sends a NEWDISK message with uuid and slot number
-    3. Other nodes issue kobject_uevent_env with uuid and slot number
-       (Steps 4,5 could be a udev rule)
-    4. In userspace, the node searches for the disk, perhaps
-       using blkid -t SUB_UUID=""
-    5. Other nodes issue either of the following depending on whether
-       the disk was found:
-       ioctl(ADD_NEW_DISK with disc.state set to MD_DISK_CANDIDATE and
-             disc.number set to slot number)
-       ioctl(CLUSTERED_DISK_NACK)
-    6. Other nodes drop lock on "no-new-devs" (CR) if device is found
-    7. Node 1 attempts EX lock on "no-new-dev"
-    8. If node 1 gets the lock, it sends METADATA_UPDATED after
-       unmarking the disk as SpareLocal
-    9. If not (get "no-new-dev" lock), it fails the operation and sends
-       METADATA_UPDATED.
-   10. Other nodes get the information whether a disk is added or not
-       by the following METADATA_UPDATED.
-
-6. Module interface.
-
- There are 17 call-backs which the md core can make to the cluster
- module.  Understanding these can give a good overview of the whole
- process.
-
-6.1 join(nodes) and leave()
-
- These are called when an array is started with a clustered bitmap,
- and when the array is stopped.  join() ensures the cluster is
- available and initializes the various resources.
- Only the first 'nodes' nodes in the cluster can use the array.
-
-6.2 slot_number()
-
- Reports the slot number advised by the cluster infrastructure.
- Range is from 0 to nodes-1.
-
-6.3 resync_info_update()
-
- This updates the resync range that is stored in the bitmap lock.
- The starting point is updated as the resync progresses.  The
- end point is always the end of the array.
- It does *not* send a RESYNCING message.
-
-6.4 resync_start(), resync_finish()
-
- These are called when resync/recovery/reshape starts or stops.
- They update the resyncing range in the bitmap lock and also
- send a RESYNCING message.  resync_start reports the whole
- array as resyncing, resync_finish reports none of it.
-
- resync_finish() also sends a BITMAP_NEEDS_SYNC message which
- allows some other node to take over.
-
-6.5 metadata_update_start(), metadata_update_finish(),
-    metadata_update_cancel().
-
- metadata_update_start is used to get exclusive access to
- the metadata.  If a change is still needed once that access is
- gained, metadata_update_finish() will send a METADATA_UPDATE
- message to all other nodes, otherwise metadata_update_cancel()
- can be used to release the lock.
-
-6.6 area_resyncing()
-
- This combines two elements of functionality.
-
- Firstly, it will check if any node is currently resyncing
- anything in a given range of sectors.  If any resync is found,
- then the caller will avoid writing or read-balancing in that
- range.
-
- Secondly, while node recovery is happening it reports that
- all areas are resyncing for READ requests.  This avoids races
- between the cluster-filesystem and the cluster-RAID handling
- a node failure.
-
-6.7 add_new_disk_start(), add_new_disk_finish(), new_disk_ack()
-
- These are used to manage the new-disk protocol described above.
- When a new device is added, add_new_disk_start() is called before
- it is bound to the array and, if that succeeds, add_new_disk_finish()
- is called the device is fully added.
-
- When a device is added in acknowledgement to a previous
- request, or when the device is declared "unavailable",
- new_disk_ack() is called.
-
-6.8 remove_disk()
-
- This is called when a spare or failed device is removed from
- the array.  It causes a REMOVE message to be send to other nodes.
-
-6.9 gather_bitmaps()
-
- This sends a RE_ADD message to all other nodes and then
- gathers bitmap information from all bitmaps.  This combined
- bitmap is then used to recovery the re-added device.
-
-6.10 lock_all_bitmaps() and unlock_all_bitmaps()
-
- These are called when change bitmap to none. If a node plans
- to clear the cluster raid's bitmap, it need to make sure no other
- nodes are using the raid which is achieved by lock all bitmap
- locks within the cluster, and also those locks are unlocked
- accordingly.
-
-7. Unsupported features
-
-There are somethings which are not supported by cluster MD yet.
-
-- update size and change array_sectors.
diff --git a/Documentation/md/md-cluster.txt b/Documentation/md/md-cluster.txt
new file mode 100644 (file)
index 0000000..3888327
--- /dev/null
@@ -0,0 +1,324 @@
+The cluster MD is a shared-device RAID for a cluster.
+
+
+1. On-disk format
+
+Separate write-intent-bitmaps are used for each cluster node.
+The bitmaps record all writes that may have been started on that node,
+and may not yet have finished. The on-disk layout is:
+
+0                    4k                     8k                    12k
+-------------------------------------------------------------------
+| idle                | md super            | bm super [0] + bits |
+| bm bits[0, contd]   | bm super[1] + bits  | bm bits[1, contd]   |
+| bm super[2] + bits  | bm bits [2, contd]  | bm super[3] + bits  |
+| bm bits [3, contd]  |                     |                     |
+
+During "normal" functioning we assume the filesystem ensures that only
+one node writes to any given block at a time, so a write request will
+
+ - set the appropriate bit (if not already set)
+ - commit the write to all mirrors
+ - schedule the bit to be cleared after a timeout.
+
+Reads are just handled normally. It is up to the filesystem to ensure
+one node doesn't read from a location where another node (or the same
+node) is writing.
+
+
+2. DLM Locks for management
+
+There are three groups of locks for managing the device:
+
+2.1 Bitmap lock resource (bm_lockres)
+
+ The bm_lockres protects individual node bitmaps. They are named in
+ the form bitmap000 for node 1, bitmap001 for node 2 and so on. When a
+ node joins the cluster, it acquires the lock in PW mode and it stays
+ so during the lifetime the node is part of the cluster. The lock
+ resource number is based on the slot number returned by the DLM
+ subsystem. Since DLM starts node count from one and bitmap slots
+ start from zero, one is subtracted from the DLM slot number to arrive
+ at the bitmap slot number.
+
+ The LVB of the bitmap lock for a particular node records the range
+ of sectors that are being re-synced by that node.  No other
+ node may write to those sectors.  This is used when a new node
+ joins the cluster.
+
+2.2 Message passing locks
+
+ Each node has to communicate with other nodes when starting or ending
+ resync, and for metadata superblock updates.  This communication is
+ managed through three locks: "token", "message", and "ack", together
+ with the Lock Value Block (LVB) of the "message" lock.
+
+2.3 new-device management
+
+ A single lock: "no-new-dev" is used to co-ordinate the addition of
+ new devices - this must be synchronized across the array.
+ Normally all nodes hold a concurrent-read lock on this device.
+
+3. Communication
+
+ Messages can be broadcast to all nodes, and the sender waits for all
+ other nodes to acknowledge the message before proceeding.  Only one
+ message can be processed at a time.
+
+3.1 Message Types
+
+ There are six types of messages which are passed:
+
+ 3.1.1 METADATA_UPDATED: informs other nodes that the metadata has
+   been updated, and the node must re-read the md superblock. This is
+   performed synchronously. It is primarily used to signal device
+   failure.
+
+ 3.1.2 RESYNCING: informs other nodes that a resync is initiated or
+   ended so that each node may suspend or resume the region.  Each
+   RESYNCING message identifies a range of the devices that the
+   sending node is about to resync. This overrides any previous
+   notification from that node: only one range can be resynced at a
+   time per node.
+
+ 3.1.3 NEWDISK: informs other nodes that a device is being added to
+   the array. Message contains an identifier for that device.  See
+   below for further details.
+
+ 3.1.4 REMOVE: A failed or spare device is being removed from the
+   array. The slot-number of the device is included in the message.
+
+ 3.1.5 RE_ADD: A failed device is being re-activated - the assumption
+   is that it has been determined to be working again.
+
+ 3.1.6 BITMAP_NEEDS_SYNC: if a node is stopped locally but the bitmap
+   isn't clean, then another node is informed to take ownership of the
+   resync.
+
+3.2 Communication mechanism
+
+ The DLM LVB is used to communicate between the nodes of the cluster. There
+ are three resources used for the purpose:
+
+  3.2.1 token: The resource which protects the entire communication
+   system. The node having the token resource is allowed to
+   communicate.
+
+  3.2.2 message: The lock resource which carries the data to
+   communicate.
+
+  3.2.3 ack: The resource whose acquisition means the message has been
+   acknowledged by all nodes in the cluster. The BAST of the resource
+   is used to inform the receiving node that a node wants to
+   communicate.
+
+The algorithm is:
+
+ 1. receive status - all nodes have concurrent-reader lock on "ack".
+
+   sender                         receiver                 receiver
+   "ack":CR                       "ack":CR                 "ack":CR
+
+ 2. sender get EX on "token"
+    sender get EX on "message"
+    sender                        receiver                 receiver
+    "token":EX                    "ack":CR                 "ack":CR
+    "message":EX
+    "ack":CR
+
+    Sender checks that it still needs to send a message. Messages
+    received or other events that happened while waiting for the
+    "token" may have made this message inappropriate or redundant.
+
+ 3. sender writes LVB.
+    sender down-convert "message" from EX to CW
+    sender try to get EX of "ack"
+    [ wait until all receivers have *processed* the "message" ]
+
+                                     [ triggered by bast of "ack" ]
+                                     receiver get CR on "message"
+                                     receiver read LVB
+                                     receiver processes the message
+                                     [ wait finish ]
+                                     receiver releases "ack"
+                                     receiver tries to get PR on "message"
+
+   sender                         receiver                  receiver
+   "token":EX                     "message":CR              "message":CR
+   "message":CW
+   "ack":EX
+
+ 4. triggered by grant of EX on "ack" (indicating all receivers
+    have processed message)
+    sender down-converts "ack" from EX to CR
+    sender releases "message"
+    sender releases "token"
+                               receiver upconvert to PR on "message"
+                               receiver get CR of "ack"
+                               receiver release "message"
+
+   sender                      receiver                   receiver
+   "ack":CR                    "ack":CR                   "ack":CR
+
+
+4. Handling Failures
+
+4.1 Node Failure
+
+ When a node fails, the DLM informs the cluster with the slot
+ number. The node starts a cluster recovery thread. The cluster
+ recovery thread:
+
+       - acquires the bitmap<number> lock of the failed node
+       - opens the bitmap
+       - reads the bitmap of the failed node
+       - copies the set bitmap to local node
+       - cleans the bitmap of the failed node
+       - releases bitmap<number> lock of the failed node
+       - initiates resync of the bitmap on the current node
+               md_check_recovery is invoked within recover_bitmaps,
+               then md_check_recovery -> metadata_update_start/finish,
+               it will lock the communication by lock_comm.
+               This means that while one node is resyncing it blocks all
+               other nodes from writing anywhere on the array.
+
+ The resync process is the regular md resync. However, in a clustered
+ environment when a resync is performed, it needs to tell other nodes
+ of the areas which are suspended. Before a resync starts, the node
+ sends out RESYNCING with the (lo,hi) range of the area which needs to
+ be suspended. Each node maintains a suspend_list, which contains the
+ list of ranges which are currently suspended. On receiving RESYNCING,
+ the node adds the range to the suspend_list. Similarly, when the node
+ performing resync finishes, it sends RESYNCING with an empty range to
+ other nodes and other nodes remove the corresponding entry from the
+ suspend_list.
+
+ A helper function, ->area_resyncing() can be used to check if a
+ particular I/O range should be suspended or not.
+
+4.2 Device Failure
+
+ Device failures are handled and communicated with the metadata update
+ routine.  When a node detects a device failure it does not allow
+ any further writes to that device until the failure has been
+ acknowledged by all other nodes.
+
+5. Adding a new Device
+
+ For adding a new device, it is necessary that all nodes "see" the new
+ device to be added. For this, the following algorithm is used:
+
+    1. Node 1 issues mdadm --manage /dev/mdX --add /dev/sdYY which issues
+       ioctl(ADD_NEW_DISK with disc.state set to MD_DISK_CLUSTER_ADD)
+    2. Node 1 sends a NEWDISK message with uuid and slot number
+    3. Other nodes issue kobject_uevent_env with uuid and slot number
+       (Steps 4,5 could be a udev rule)
+    4. In userspace, the node searches for the disk, perhaps
+       using blkid -t SUB_UUID=""
+    5. Other nodes issue either of the following depending on whether
+       the disk was found:
+       ioctl(ADD_NEW_DISK with disc.state set to MD_DISK_CANDIDATE and
+             disc.number set to slot number)
+       ioctl(CLUSTERED_DISK_NACK)
+    6. Other nodes drop lock on "no-new-devs" (CR) if device is found
+    7. Node 1 attempts EX lock on "no-new-dev"
+    8. If node 1 gets the lock, it sends METADATA_UPDATED after
+       unmarking the disk as SpareLocal
+    9. If it does not get the "no-new-dev" lock, it fails the operation and sends
+       METADATA_UPDATED.
+   10. Other nodes learn whether or not a disk was added from the
+       following METADATA_UPDATED message.
+
+6. Module interface.
+
+ There are 17 call-backs which the md core can make to the cluster
+ module.  Understanding these can give a good overview of the whole
+ process.
+
+6.1 join(nodes) and leave()
+
+ These are called when an array is started with a clustered bitmap,
+ and when the array is stopped.  join() ensures the cluster is
+ available and initializes the various resources.
+ Only the first 'nodes' nodes in the cluster can use the array.
+
+6.2 slot_number()
+
+ Reports the slot number advised by the cluster infrastructure.
+ Range is from 0 to nodes-1.
+
+6.3 resync_info_update()
+
+ This updates the resync range that is stored in the bitmap lock.
+ The starting point is updated as the resync progresses.  The
+ end point is always the end of the array.
+ It does *not* send a RESYNCING message.
+
+6.4 resync_start(), resync_finish()
+
+ These are called when resync/recovery/reshape starts or stops.
+ They update the resyncing range in the bitmap lock and also
+ send a RESYNCING message.  resync_start reports the whole
+ array as resyncing, resync_finish reports none of it.
+
+ resync_finish() also sends a BITMAP_NEEDS_SYNC message which
+ allows some other node to take over.
+
+6.5 metadata_update_start(), metadata_update_finish(),
+    metadata_update_cancel().
+
+ metadata_update_start is used to get exclusive access to
+ the metadata.  If a change is still needed once that access is
+ gained, metadata_update_finish() will send a METADATA_UPDATED
+ message to all other nodes, otherwise metadata_update_cancel()
+ can be used to release the lock.
+
+6.6 area_resyncing()
+
+ This combines two elements of functionality.
+
+ Firstly, it will check if any node is currently resyncing
+ anything in a given range of sectors.  If any resync is found,
+ then the caller will avoid writing or read-balancing in that
+ range.
+
+ Secondly, while node recovery is happening it reports that
+ all areas are resyncing for READ requests.  This avoids races
+ between the cluster-filesystem and the cluster-RAID handling
+ a node failure.
+
+6.7 add_new_disk_start(), add_new_disk_finish(), new_disk_ack()
+
+ These are used to manage the new-disk protocol described above.
+ When a new device is added, add_new_disk_start() is called before
+ it is bound to the array and, if that succeeds, add_new_disk_finish()
+ is called once the device is fully added.
+
+ When a device is added in acknowledgement of a previous
+ request, or when the device is declared "unavailable",
+ new_disk_ack() is called.
+
+6.8 remove_disk()
+
+ This is called when a spare or failed device is removed from
+ the array.  It causes a REMOVE message to be sent to other nodes.
+
+6.9 gather_bitmaps()
+
+ This sends a RE_ADD message to all other nodes and then
+ gathers bitmap information from all bitmaps.  This combined
+ bitmap is then used to recover the re-added device.
+
+6.10 lock_all_bitmaps() and unlock_all_bitmaps()
+
+ These are called when the bitmap is changed to none. If a node plans
+ to clear the cluster raid's bitmap, it needs to make sure no other
+ nodes are using the raid, which is achieved by locking all bitmap
+ locks within the cluster; those locks are then unlocked
+ accordingly.
+
+7. Unsupported features
+
+There are some things which are not supported by cluster MD yet.
+
+- update size and change array_sectors.
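
As a reading aid for section 6, here is a rough C sketch of what such a
17-entry call-back table could look like. The member names follow the text
above; the kernel's actual structure (struct md_cluster_operations) and its
argument types differ, so treat this purely as an illustration:

        /* Illustrative only - not the kernel's real structure or signatures. */
        struct cluster_callbacks {
                int  (*join)(void *mddev, int nodes);
                int  (*leave)(void *mddev);
                int  (*slot_number)(void *mddev);
                int  (*resync_info_update)(void *mddev, unsigned long long lo,
                                           unsigned long long hi);
                int  (*resync_start)(void *mddev);
                int  (*resync_finish)(void *mddev);
                int  (*metadata_update_start)(void *mddev);
                int  (*metadata_update_finish)(void *mddev);
                void (*metadata_update_cancel)(void *mddev);
                int  (*area_resyncing)(void *mddev, int direction,
                                       unsigned long long lo,
                                       unsigned long long hi);
                int  (*add_new_disk_start)(void *mddev, void *rdev);
                int  (*add_new_disk_finish)(void *mddev);
                void (*new_disk_ack)(void *mddev, int ack);
                int  (*remove_disk)(void *mddev, void *rdev);
                int  (*gather_bitmaps)(void *rdev);
                int  (*lock_all_bitmaps)(void *mddev);
                void (*unlock_all_bitmaps)(void *mddev);
        };
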
diff --git a/Documentation/md/raid5-cache.txt b/Documentation/md/raid5-cache.txt
new file mode 100644 (file)
index 0000000..2b210f2
--- /dev/null
@@ -0,0 +1,109 @@
+RAID5 cache
+
+A RAID 4/5/6 array can include an extra disk for caching data in addition to
+the normal RAID disks. The role of the RAID disks isn't changed by the cache
+disk; the cache disk caches data destined for the RAID disks. The cache can be
+in write-through (supported since 4.4) or write-back mode (supported since
+4.10). mdadm (since 3.4) has a new option '--write-journal' to create an array
+with a cache. Please refer to the mdadm manual for details. By default (when
+the RAID array starts), the cache is in write-through mode. A user can switch
+it to write-back mode by:
+
+echo "write-back" > /sys/block/md0/md/journal_mode
+
+And switch it back to write-through mode by:
+
+echo "write-through" > /sys/block/md0/md/journal_mode
+
+In both modes, all writes to the array will hit the cache disk first. This
+means the cache disk must be fast and able to sustain the full write load.
+
+-------------------------------------
+write-through mode:
+
+This mode mainly fixes the 'write hole' issue. For a RAID 4/5/6 array, an
+unclean shutdown can leave the data in some stripes in an inconsistent state,
+e.g. data and parity don't match. The reason is that a stripe write involves
+several RAID disks, and it's possible that the writes haven't hit all RAID
+disks yet at the time of the unclean shutdown. We call an array degraded if it
+has inconsistent data. MD tries to resync the array to bring it back to a
+normal state, but before the resync completes, any system crash risks real
+data corruption in the RAID array. This problem is called the 'write hole'.
+
+The write-through cache will cache all data on the cache disk first. After the
+data is safe on the cache disk, the data will be flushed onto the RAID disks.
+This two-step write guarantees that MD can recover correct data after an
+unclean shutdown, even if the array is degraded. Thus the cache can close the
+'write hole'.
+
+In write-through mode, MD reports IO completion to the upper layer (usually a
+filesystem) after the data is safe on the RAID disks, so a cache disk failure
+doesn't cause data loss. Of course, a cache disk failure means the array is
+exposed to the 'write hole' again.
+
+In write-through mode, the cache disk isn't required to be big; several
+hundred megabytes are enough.
+
+--------------------------------------
+write-back mode:
+
+Write-back mode fixes the 'write hole' issue too, since all write data is
+cached on the cache disk. But the main goal of the 'write-back' cache is to
+speed up writes. If a write crosses all RAID disks of a stripe, we call it a
+full-stripe write. For non-full-stripe writes, MD must read old data before
+the new parity can be calculated. These synchronous reads hurt write
+throughput. Writes which are sequential but not dispatched at the same time
+suffer from this overhead too. The write-back cache aggregates the data and
+flushes it to the RAID disks only after it becomes a full-stripe write. This
+completely avoids the overhead, so it's very helpful for some workloads. A
+typical example is a workload that does sequential writes followed by fsync.
+
+In write-back mode, MD reports IO completion to the upper layer (usually a
+filesystem) right after the data hits the cache disk. The data is flushed to
+the RAID disks later, after specific conditions are met, so a cache disk
+failure will cause data loss.
+
+In write-back mode, MD also caches data in memory. The memory cache includes
+the same data stored on the cache disk, so a power loss doesn't cause data
+loss. The memory cache size has a performance impact on the array; it's
+recommended to make it big. A user can configure the size by:
+
+echo "2048" > /sys/block/md0/md/stripe_cache_size
+
+A cache disk that is too small will make write aggregation less efficient in
+this mode, depending on the workload. It's recommended to use a cache disk of
+at least several gigabytes in write-back mode.
+
+--------------------------------------
+The implementation:
+
+The write-through and write-back caches use the same disk format. The cache
+disk is organized as a simple write log. The log consists of 'meta data' and
+'data' pairs. The meta data describes the data; it also includes a checksum
+and a sequence ID for recovery identification. Data can be IO data or parity
+data, and the data is checksummed too. The checksum is stored in the meta data
+ahead of the data. The checksum is an optimization because MD can write meta
+data and data freely without worrying about the order. The MD superblock has a
+field pointing to the valid meta data at the log head.
+
+The log implementation is pretty straightforward. The difficult part is the
+order in which MD writes data to the cache disk and the RAID disks.
+Specifically, in write-through mode, MD calculates the parity for the IO data,
+writes both the IO data and the parity to the log, writes the data and parity
+to the RAID disks after they have settled down in the log, and finally the IO
+is finished. Reads just read from the RAID disks as usual.
+
+In write-back mode, MD writes the IO data to the log and reports IO
+completion. The data is also fully cached in memory at that time, which means
+reads must query the memory cache. If some conditions are met, MD will flush
+the data to the RAID disks: MD calculates the parity for the data and writes
+the parity into the log; after this is finished, MD writes both data and
+parity to the RAID disks and can then release the memory cache. The flush
+conditions could be: the stripe becomes a full-stripe write, free cache disk
+space is low, or free in-kernel memory cache space is low.
+
+After an unclean shutdown, MD does recovery. MD reads all meta data and data
+from the log. The sequence ID and checksum help detect corrupted meta data and
+data. If MD finds a stripe with data and valid parities (1 parity for raid4/5
+and 2 for raid6), MD writes the data and parities to the RAID disks. If the
+parities are incomplete, they are discarded. If part of the data is corrupted,
+it is discarded too. MD then loads the valid data and writes it to the RAID
+disks in the normal way.
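
To make the recovery description concrete, here is a hypothetical C sketch of
the recovery pass; every type and helper name below is invented for
illustration and does not appear in the kernel sources:

        /*
         * Hypothetical sketch of the recovery pass described above.  Every
         * type and helper here is invented for illustration; none of them
         * exist in the kernel sources.
         */
        #include <stdbool.h>
        #include <stddef.h>

        struct stripe;  /* the data and parity blocks recovered for one stripe */

        /* Walk the log in sequence-ID order, skip entries whose checksum or
         * sequence ID is bad, and group the surviving data/parity by stripe;
         * returns how many stripes were collected into 'stripes' (at most 'max'). */
        size_t scan_log(struct stripe **stripes, size_t max);
        /* True when the stripe has its data plus 1 (raid4/5) or 2 (raid6)
         * valid parity blocks. */
        bool stripe_parity_complete(const struct stripe *s);
        void stripe_write_to_raid_disks(struct stripe *s);
        void stripe_discard(struct stripe *s);

        void recover_after_unclean_shutdown(struct stripe **stripes, size_t max)
        {
                size_t i, n = scan_log(stripes, max);

                for (i = 0; i < n; i++) {
                        if (stripe_parity_complete(stripes[i]))
                                stripe_write_to_raid_disks(stripes[i]);
                        else
                                stripe_discard(stripes[i]);
                }
        }
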
diff --git a/Documentation/sparc/console.txt b/Documentation/sparc/console.txt
new file mode 100644 (file)
index 0000000..5aa735a
--- /dev/null
@@ -0,0 +1,9 @@
+Steps for sending 'break' on sunhv console:
+===========================================
+
+On Baremetal:
+   1. press   Esc + 'B'
+
+On LDOM:
+   1. press    Ctrl + ']'
+   2. telnet> send  break
index 4b03c4701030202c7ed91e2db62885e9d915e8ea..8f05facab3b5fcbb6f2e367128319a9ead670cf4 100644 (file)
@@ -9315,6 +9315,7 @@ OPENRISC ARCHITECTURE
 M:     Jonas Bonn <jonas@southpole.se>
 M:     Stefan Kristiansson <stefan.kristiansson@saunalahti.fi>
 M:     Stafford Horne <shorne@gmail.com>
+T:     git git://github.com/openrisc/linux.git
 L:     openrisc@lists.librecores.org
 W:     http://openrisc.io
 S:     Maintained
index 8d22015fde3e053dc95b5f82aaa94d6c8ee3d2ce..1e95920b07377417415c8622201d8e04bfd1f1a7 100644 (file)
@@ -12,6 +12,7 @@ config OPENRISC
        select HAVE_MEMBLOCK
        select GPIOLIB
         select HAVE_ARCH_TRACEHOOK
+       select SPARSE_IRQ
        select GENERIC_IRQ_CHIP
        select GENERIC_IRQ_PROBE
        select GENERIC_IRQ_SHOW
index 0eb04c8240f95fb6715703a439280d6c358e077e..c43d4e1d14eb9c576b1438d19e479d147a642c97 100644 (file)
@@ -10,4 +10,3 @@ that are due for investigation shortly, i.e. our TODO list:
    or1k and this change is slowly trickling through the stack.  For the time
    being, or32 is equivalent to or1k.
 
--- Implement optimized version of memcpy and memset
index ef8d1ccc3e450eaaaaaa8b199e7bf8b7a5b5436d..fb241757f7f0a7d6c2ebf8ca79f497efca8af994 100644 (file)
@@ -1,7 +1,6 @@
 
 header-y += ucontext.h
 
-generic-y += atomic.h
 generic-y += auxvec.h
 generic-y += barrier.h
 generic-y += bitsperlong.h
@@ -10,8 +9,6 @@ generic-y += bugs.h
 generic-y += cacheflush.h
 generic-y += checksum.h
 generic-y += clkdev.h
-generic-y += cmpxchg-local.h
-generic-y += cmpxchg.h
 generic-y += current.h
 generic-y += device.h
 generic-y += div64.h
@@ -22,12 +19,12 @@ generic-y += exec.h
 generic-y += fb.h
 generic-y += fcntl.h
 generic-y += ftrace.h
-generic-y += futex.h
 generic-y += hardirq.h
 generic-y += hw_irq.h
 generic-y += ioctl.h
 generic-y += ioctls.h
 generic-y += ipcbuf.h
+generic-y += irq.h
 generic-y += irq_regs.h
 generic-y += irq_work.h
 generic-y += kdebug.h
diff --git a/arch/openrisc/include/asm/atomic.h b/arch/openrisc/include/asm/atomic.h
new file mode 100644 (file)
index 0000000..146e166
--- /dev/null
@@ -0,0 +1,126 @@
+/*
+ * Copyright (C) 2014 Stefan Kristiansson <stefan.kristiansson@saunalahti.fi>
+ *
+ * This file is licensed under the terms of the GNU General Public License
+ * version 2.  This program is licensed "as is" without any warranty of any
+ * kind, whether express or implied.
+ */
+
+#ifndef __ASM_OPENRISC_ATOMIC_H
+#define __ASM_OPENRISC_ATOMIC_H
+
+#include <linux/types.h>
+
+/* Atomically perform op with v->counter and i */
+#define ATOMIC_OP(op)                                                  \
+static inline void atomic_##op(int i, atomic_t *v)                     \
+{                                                                      \
+       int tmp;                                                        \
+                                                                       \
+       __asm__ __volatile__(                                           \
+               "1:     l.lwa   %0,0(%1)        \n"                     \
+               "       l." #op " %0,%0,%2      \n"                     \
+               "       l.swa   0(%1),%0        \n"                     \
+               "       l.bnf   1b              \n"                     \
+               "        l.nop                  \n"                     \
+               : "=&r"(tmp)                                            \
+               : "r"(&v->counter), "r"(i)                              \
+               : "cc", "memory");                                      \
+}
+
+/* Atomically perform op with v->counter and i, return the result */
+#define ATOMIC_OP_RETURN(op)                                           \
+static inline int atomic_##op##_return(int i, atomic_t *v)             \
+{                                                                      \
+       int tmp;                                                        \
+                                                                       \
+       __asm__ __volatile__(                                           \
+               "1:     l.lwa   %0,0(%1)        \n"                     \
+               "       l." #op " %0,%0,%2      \n"                     \
+               "       l.swa   0(%1),%0        \n"                     \
+               "       l.bnf   1b              \n"                     \
+               "        l.nop                  \n"                     \
+               : "=&r"(tmp)                                            \
+               : "r"(&v->counter), "r"(i)                              \
+               : "cc", "memory");                                      \
+                                                                       \
+       return tmp;                                                     \
+}
+
+/* Atomically perform op with v->counter and i, return orig v->counter */
+#define ATOMIC_FETCH_OP(op)                                            \
+static inline int atomic_fetch_##op(int i, atomic_t *v)                        \
+{                                                                      \
+       int tmp, old;                                                   \
+                                                                       \
+       __asm__ __volatile__(                                           \
+               "1:     l.lwa   %0,0(%2)        \n"                     \
+               "       l." #op " %1,%0,%3      \n"                     \
+               "       l.swa   0(%2),%1        \n"                     \
+               "       l.bnf   1b              \n"                     \
+               "        l.nop                  \n"                     \
+               : "=&r"(old), "=&r"(tmp)                                \
+               : "r"(&v->counter), "r"(i)                              \
+               : "cc", "memory");                                      \
+                                                                       \
+       return old;                                                     \
+}
+
+ATOMIC_OP_RETURN(add)
+ATOMIC_OP_RETURN(sub)
+
+ATOMIC_FETCH_OP(add)
+ATOMIC_FETCH_OP(sub)
+ATOMIC_FETCH_OP(and)
+ATOMIC_FETCH_OP(or)
+ATOMIC_FETCH_OP(xor)
+
+ATOMIC_OP(and)
+ATOMIC_OP(or)
+ATOMIC_OP(xor)
+
+#undef ATOMIC_FETCH_OP
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
+
+#define atomic_add_return      atomic_add_return
+#define atomic_sub_return      atomic_sub_return
+#define atomic_fetch_add       atomic_fetch_add
+#define atomic_fetch_sub       atomic_fetch_sub
+#define atomic_fetch_and       atomic_fetch_and
+#define atomic_fetch_or                atomic_fetch_or
+#define atomic_fetch_xor       atomic_fetch_xor
+#define atomic_and     atomic_and
+#define atomic_or      atomic_or
+#define atomic_xor     atomic_xor
+
+/*
+ * Atomically add a to v->counter as long as v is not already u.
+ * Returns the original value at v->counter.
+ *
+ * This is often used through atomic_inc_not_zero()
+ */
+static inline int __atomic_add_unless(atomic_t *v, int a, int u)
+{
+       int old, tmp;
+
+       __asm__ __volatile__(
+               "1:     l.lwa %0, 0(%2)         \n"
+               "       l.sfeq %0, %4           \n"
+               "       l.bf 2f                 \n"
+               "        l.add %1, %0, %3       \n"
+               "       l.swa 0(%2), %1         \n"
+               "       l.bnf 1b                \n"
+               "        l.nop                  \n"
+               "2:                             \n"
+               : "=&r"(old), "=&r" (tmp)
+               : "r"(&v->counter), "r"(a), "r"(u)
+               : "cc", "memory");
+
+       return old;
+}
+#define __atomic_add_unless    __atomic_add_unless
+
+#include <asm-generic/atomic.h>
+
+#endif /* __ASM_OPENRISC_ATOMIC_H */
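
For context (not part of this patch), the generic atomic layer builds
atomic_add_unless()/atomic_inc_not_zero() on top of the __atomic_add_unless()
primitive defined above, roughly as in this simplified sketch:

        /* Simplified from the generic atomic layer of this era; shown only to
         * illustrate how the primitive above is consumed. */
        static inline int atomic_add_unless(atomic_t *v, int a, int u)
        {
                return __atomic_add_unless(v, a, u) != u;
        }

        #define atomic_inc_not_zero(v)  atomic_add_unless((v), 1, 0)
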
index 3003cdad561bdac6fa75031f2275373fbd9bec4d..689f56819d53b3f4815e46a1b44044485ea2f00c 100644 (file)
@@ -45,7 +45,7 @@
 #include <asm-generic/bitops/hweight.h>
 #include <asm-generic/bitops/lock.h>
 
-#include <asm-generic/bitops/atomic.h>
+#include <asm/bitops/atomic.h>
 #include <asm-generic/bitops/non-atomic.h>
 #include <asm-generic/bitops/le.h>
 #include <asm-generic/bitops/ext2-atomic.h>
diff --git a/arch/openrisc/include/asm/bitops/atomic.h b/arch/openrisc/include/asm/bitops/atomic.h
new file mode 100644 (file)
index 0000000..35fb85f
--- /dev/null
@@ -0,0 +1,123 @@
+/*
+ * Copyright (C) 2014 Stefan Kristiansson <stefan.kristiansson@saunalahti.fi>
+ *
+ * This file is licensed under the terms of the GNU General Public License
+ * version 2.  This program is licensed "as is" without any warranty of any
+ * kind, whether express or implied.
+ */
+
+#ifndef __ASM_OPENRISC_BITOPS_ATOMIC_H
+#define __ASM_OPENRISC_BITOPS_ATOMIC_H
+
+static inline void set_bit(int nr, volatile unsigned long *addr)
+{
+       unsigned long mask = BIT_MASK(nr);
+       unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+       unsigned long tmp;
+
+       __asm__ __volatile__(
+               "1:     l.lwa   %0,0(%1)        \n"
+               "       l.or    %0,%0,%2        \n"
+               "       l.swa   0(%1),%0        \n"
+               "       l.bnf   1b              \n"
+               "        l.nop                  \n"
+               : "=&r"(tmp)
+               : "r"(p), "r"(mask)
+               : "cc", "memory");
+}
+
+static inline void clear_bit(int nr, volatile unsigned long *addr)
+{
+       unsigned long mask = BIT_MASK(nr);
+       unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+       unsigned long tmp;
+
+       __asm__ __volatile__(
+               "1:     l.lwa   %0,0(%1)        \n"
+               "       l.and   %0,%0,%2        \n"
+               "       l.swa   0(%1),%0        \n"
+               "       l.bnf   1b              \n"
+               "        l.nop                  \n"
+               : "=&r"(tmp)
+               : "r"(p), "r"(~mask)
+               : "cc", "memory");
+}
+
+static inline void change_bit(int nr, volatile unsigned long *addr)
+{
+       unsigned long mask = BIT_MASK(nr);
+       unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+       unsigned long tmp;
+
+       __asm__ __volatile__(
+               "1:     l.lwa   %0,0(%1)        \n"
+               "       l.xor   %0,%0,%2        \n"
+               "       l.swa   0(%1),%0        \n"
+               "       l.bnf   1b              \n"
+               "        l.nop                  \n"
+               : "=&r"(tmp)
+               : "r"(p), "r"(mask)
+               : "cc", "memory");
+}
+
+static inline int test_and_set_bit(int nr, volatile unsigned long *addr)
+{
+       unsigned long mask = BIT_MASK(nr);
+       unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+       unsigned long old;
+       unsigned long tmp;
+
+       __asm__ __volatile__(
+               "1:     l.lwa   %0,0(%2)        \n"
+               "       l.or    %1,%0,%3        \n"
+               "       l.swa   0(%2),%1        \n"
+               "       l.bnf   1b              \n"
+               "        l.nop                  \n"
+               : "=&r"(old), "=&r"(tmp)
+               : "r"(p), "r"(mask)
+               : "cc", "memory");
+
+       return (old & mask) != 0;
+}
+
+static inline int test_and_clear_bit(int nr, volatile unsigned long *addr)
+{
+       unsigned long mask = BIT_MASK(nr);
+       unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+       unsigned long old;
+       unsigned long tmp;
+
+       __asm__ __volatile__(
+               "1:     l.lwa   %0,0(%2)        \n"
+               "       l.and   %1,%0,%3        \n"
+               "       l.swa   0(%2),%1        \n"
+               "       l.bnf   1b              \n"
+               "        l.nop                  \n"
+               : "=&r"(old), "=&r"(tmp)
+               : "r"(p), "r"(~mask)
+               : "cc", "memory");
+
+       return (old & mask) != 0;
+}
+
+static inline int test_and_change_bit(int nr, volatile unsigned long *addr)
+{
+       unsigned long mask = BIT_MASK(nr);
+       unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+       unsigned long old;
+       unsigned long tmp;
+
+       __asm__ __volatile__(
+               "1:     l.lwa   %0,0(%2)        \n"
+               "       l.xor   %1,%0,%3        \n"
+               "       l.swa   0(%2),%1        \n"
+               "       l.bnf   1b              \n"
+               "        l.nop                  \n"
+               : "=&r"(old), "=&r"(tmp)
+               : "r"(p), "r"(mask)
+               : "cc", "memory");
+
+       return (old & mask) != 0;
+}
+
+#endif /* __ASM_OPENRISC_BITOPS_ATOMIC_H */
diff --git a/arch/openrisc/include/asm/cmpxchg.h b/arch/openrisc/include/asm/cmpxchg.h
new file mode 100644 (file)
index 0000000..5fcb9ac
--- /dev/null
@@ -0,0 +1,83 @@
+/*
+ * Copyright (C) 2014 Stefan Kristiansson <stefan.kristiansson@saunalahti.fi>
+ *
+ * This file is licensed under the terms of the GNU General Public License
+ * version 2.  This program is licensed "as is" without any warranty of any
+ * kind, whether express or implied.
+ */
+
+#ifndef __ASM_OPENRISC_CMPXCHG_H
+#define __ASM_OPENRISC_CMPXCHG_H
+
+#include  <linux/types.h>
+
+/*
+ * This function doesn't exist, so you'll get a linker error
+ * if something tries to do an invalid cmpxchg().
+ */
+extern void __cmpxchg_called_with_bad_pointer(void);
+
+#define __HAVE_ARCH_CMPXCHG 1
+
+static inline unsigned long
+__cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, int size)
+{
+       if (size != 4) {
+               __cmpxchg_called_with_bad_pointer();
+               return old;
+       }
+
+       __asm__ __volatile__(
+               "1:     l.lwa %0, 0(%1)         \n"
+               "       l.sfeq %0, %2           \n"
+               "       l.bnf 2f                \n"
+               "        l.nop                  \n"
+               "       l.swa 0(%1), %3         \n"
+               "       l.bnf 1b                \n"
+               "        l.nop                  \n"
+               "2:                             \n"
+               : "=&r"(old)
+               : "r"(ptr), "r"(old), "r"(new)
+               : "cc", "memory");
+
+       return old;
+}
+
+#define cmpxchg(ptr, o, n)                                             \
+       ({                                                              \
+               (__typeof__(*(ptr))) __cmpxchg((ptr),                   \
+                                              (unsigned long)(o),      \
+                                              (unsigned long)(n),      \
+                                              sizeof(*(ptr)));         \
+       })
+
+/*
+ * This function doesn't exist, so you'll get a linker error if
+ * something tries to do an invalidly-sized xchg().
+ */
+extern void __xchg_called_with_bad_pointer(void);
+
+static inline unsigned long __xchg(unsigned long val, volatile void *ptr,
+                                  int size)
+{
+       if (size != 4) {
+               __xchg_called_with_bad_pointer();
+               return val;
+       }
+
+       __asm__ __volatile__(
+               "1:     l.lwa %0, 0(%1)         \n"
+               "       l.swa 0(%1), %2         \n"
+               "       l.bnf 1b                \n"
+               "        l.nop                  \n"
+               : "=&r"(val)
+               : "r"(ptr), "r"(val)
+               : "cc", "memory");
+
+       return val;
+}
+
+#define xchg(ptr, with) \
+       ((typeof(*(ptr)))__xchg((unsigned long)(with), (ptr), sizeof(*(ptr))))
+
+#endif /* __ASM_OPENRISC_CMPXCHG_H */
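
As a usage note (not part of this patch), the cmpxchg() defined above is
typically consumed in a compare-and-swap retry loop; a hypothetical example:

        /* Hypothetical lock-free counter built on the cmpxchg() above. */
        static inline unsigned long counter_add(volatile unsigned long *ctr,
                                                unsigned long delta)
        {
                unsigned long old, new;

                do {
                        old = *ctr;
                        new = old + delta;
                } while (cmpxchg(ctr, old, new) != old);

                return new;
        }
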
index 917318b6a970f533fbd4c8d576f42d63327e7b00..ec10679d6429d5f67bf43f77e990c6eecf0a004f 100644 (file)
@@ -24,9 +24,11 @@ struct cpuinfo {
 
        u32 icache_size;
        u32 icache_block_size;
+       u32 icache_ways;
 
        u32 dcache_size;
        u32 dcache_block_size;
+       u32 dcache_ways;
 };
 
 extern struct cpuinfo cpuinfo;
diff --git a/arch/openrisc/include/asm/futex.h b/arch/openrisc/include/asm/futex.h
new file mode 100644 (file)
index 0000000..7780873
--- /dev/null
@@ -0,0 +1,135 @@
+#ifndef __ASM_OPENRISC_FUTEX_H
+#define __ASM_OPENRISC_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <linux/uaccess.h>
+#include <asm/errno.h>
+
+#define __futex_atomic_op(insn, ret, oldval, uaddr, oparg) \
+({                                                             \
+       __asm__ __volatile__ (                                  \
+               "1:     l.lwa   %0, %2                  \n"     \
+                       insn                            "\n"    \
+               "2:     l.swa   %2, %1                  \n"     \
+               "       l.bnf   1b                      \n"     \
+               "        l.ori  %1, r0, 0               \n"     \
+               "3:                                     \n"     \
+               ".section .fixup,\"ax\"                 \n"     \
+               "4:     l.j     3b                      \n"     \
+               "        l.addi %1, r0, %3              \n"     \
+               ".previous                              \n"     \
+               ".section __ex_table,\"a\"              \n"     \
+               ".word  1b,4b,2b,4b                     \n"     \
+               ".previous                              \n"     \
+               : "=&r" (oldval), "=&r" (ret), "+m" (*uaddr)    \
+               : "i" (-EFAULT), "r" (oparg)                    \
+               : "cc", "memory"                                \
+               );                                              \
+})
+
+static inline int
+futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+{
+       int op = (encoded_op >> 28) & 7;
+       int cmp = (encoded_op >> 24) & 15;
+       int oparg = (encoded_op << 8) >> 20;
+       int cmparg = (encoded_op << 20) >> 20;
+       int oldval = 0, ret;
+
+       if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+               oparg = 1 << oparg;
+
+       if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
+               return -EFAULT;
+
+       pagefault_disable();
+
+       switch (op) {
+       case FUTEX_OP_SET:
+               __futex_atomic_op("l.or %1,%4,%4", ret, oldval, uaddr, oparg);
+               break;
+       case FUTEX_OP_ADD:
+               __futex_atomic_op("l.add %1,%0,%4", ret, oldval, uaddr, oparg);
+               break;
+       case FUTEX_OP_OR:
+               __futex_atomic_op("l.or %1,%0,%4", ret, oldval, uaddr, oparg);
+               break;
+       case FUTEX_OP_ANDN:
+               __futex_atomic_op("l.and %1,%0,%4", ret, oldval, uaddr, ~oparg);
+               break;
+       case FUTEX_OP_XOR:
+               __futex_atomic_op("l.xor %1,%0,%4", ret, oldval, uaddr, oparg);
+               break;
+       default:
+               ret = -ENOSYS;
+       }
+
+       pagefault_enable();
+
+       if (!ret) {
+               switch (cmp) {
+               case FUTEX_OP_CMP_EQ:
+                       ret = (oldval == cmparg);
+                       break;
+               case FUTEX_OP_CMP_NE:
+                       ret = (oldval != cmparg);
+                       break;
+               case FUTEX_OP_CMP_LT:
+                       ret = (oldval < cmparg);
+                       break;
+               case FUTEX_OP_CMP_GE:
+                       ret = (oldval >= cmparg);
+                       break;
+               case FUTEX_OP_CMP_LE:
+                       ret = (oldval <= cmparg);
+                       break;
+               case FUTEX_OP_CMP_GT:
+                       ret = (oldval > cmparg);
+                       break;
+               default:
+                       ret = -ENOSYS;
+               }
+       }
+       return ret;
+}
+
+static inline int
+futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
+                             u32 oldval, u32 newval)
+{
+       int ret = 0;
+       u32 prev;
+
+       if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
+               return -EFAULT;
+
+       __asm__ __volatile__ (                          \
+               "1:     l.lwa   %1, %2          \n"     \
+               "       l.sfeq  %1, %3          \n"     \
+               "       l.bnf   3f              \n"     \
+               "        l.nop                  \n"     \
+               "2:     l.swa   %2, %4          \n"     \
+               "       l.bnf   1b              \n"     \
+               "        l.nop                  \n"     \
+               "3:                             \n"     \
+               ".section .fixup,\"ax\"         \n"     \
+               "4:     l.j     3b              \n"     \
+               "        l.addi %0, r0, %5      \n"     \
+               ".previous                      \n"     \
+               ".section __ex_table,\"a\"      \n"     \
+               ".word  1b,4b,2b,4b             \n"     \
+               ".previous                      \n"     \
+               : "+r" (ret), "=&r" (prev), "+m" (*uaddr) \
+               : "r" (oldval), "r" (newval), "i" (-EFAULT) \
+               : "cc", "memory"                        \
+               );
+
+       *uval = prev;
+       return ret;
+}
+
+#endif /* __KERNEL__ */
+
+#endif /* __ASM_OPENRISC_FUTEX_H */
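
For reference, a sketch of the encoded_op layout that futex_atomic_op_inuser() above decodes; the field positions follow directly from the shifts in the function, and the example operation is hypothetical.

#include <linux/futex.h>	/* FUTEX_OP_ADD, FUTEX_OP_CMP_EQ */

/*
 * encoded_op bit layout assumed by the decode above:
 *   [31:28] op   [27:24] cmp   [23:12] oparg   [11:0] cmparg
 *
 * e.g. "atomically add 1, then report whether the old value was 0":
 */
unsigned int encoded_op = (FUTEX_OP_ADD    << 28) |
			  (FUTEX_OP_CMP_EQ << 24) |
			  ((1 & 0xfff)     << 12) |
			   (0 & 0xfff);
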
index 5dbc668865c48f24825bf4a98bc38b6eab70387c..367dac70326af01731e8d841fa8fd5ce49c36222 100644 (file)
 #define SPR_UPR_MP        0x00000020  /* MAC present */
 #define SPR_UPR_DUP       0x00000040  /* Debug unit present */
 #define SPR_UPR_PCUP      0x00000080  /* Performance counters unit present */
-#define SPR_UPR_PMP       0x00000100  /* Power management present */
-#define SPR_UPR_PICP      0x00000200  /* PIC present */
+#define SPR_UPR_PICP      0x00000100  /* PIC present */
+#define SPR_UPR_PMP       0x00000200  /* Power management present */
 #define SPR_UPR_TTP       0x00000400  /* Tick timer present */
 #define SPR_UPR_RES       0x00fe0000  /* Reserved */
 #define SPR_UPR_CUP       0xff000000  /* Context units present */
diff --git a/arch/openrisc/include/asm/string.h b/arch/openrisc/include/asm/string.h
new file mode 100644 (file)
index 0000000..64939cc
--- /dev/null
@@ -0,0 +1,10 @@
+#ifndef __ASM_OPENRISC_STRING_H
+#define __ASM_OPENRISC_STRING_H
+
+#define __HAVE_ARCH_MEMSET
+extern void *memset(void *s, int c, __kernel_size_t n);
+
+#define __HAVE_ARCH_MEMCPY
+extern void *memcpy(void *dest, __const void *src, __kernel_size_t n);
+
+#endif /* __ASM_OPENRISC_STRING_H */
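
These __HAVE_ARCH_* defines matter because the generic lib/string.c implementations are compiled under guards of the same name; roughly (a sketch of the generic-side fallback, not copied verbatim):

#ifndef __HAVE_ARCH_MEMSET
/* generic byte-at-a-time fallback, used only when no arch version exists */
void *memset(void *s, int c, size_t count)
{
	char *xs = s;

	while (count--)
		*xs++ = c;
	return s;
}
#endif
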
diff --git a/arch/openrisc/kernel/.gitignore b/arch/openrisc/kernel/.gitignore
new file mode 100644 (file)
index 0000000..c5f676c
--- /dev/null
@@ -0,0 +1 @@
+vmlinux.lds
index aac0bde3330c4daa7366aca30a6f8aa810a4b3aa..bc6500860f4d91c11fa2d420bc49f35d83e03228 100644 (file)
@@ -173,6 +173,11 @@ handler:                                                   ;\
        l.j     _ret_from_exception                             ;\
         l.nop
 
+/* clobbers 'reg' */
+#define CLEAR_LWA_FLAG(reg)            \
+       l.movhi reg,hi(lwa_flag)        ;\
+       l.ori   reg,reg,lo(lwa_flag)    ;\
+       l.sw    0(reg),r0
 /*
  * NOTE: one should never assume that SPR_EPC, SPR_ESR, SPR_EEAR
  *       contain the same values as when exception we're handling
@@ -193,6 +198,7 @@ EXCEPTION_ENTRY(_tng_kernel_start)
 /* ---[ 0x200: BUS exception ]------------------------------------------- */
 
 EXCEPTION_ENTRY(_bus_fault_handler)
+       CLEAR_LWA_FLAG(r3)
        /* r4: EA of fault (set by EXCEPTION_HANDLE) */
        l.jal   do_bus_fault
         l.addi  r3,r1,0 /* pt_regs */
@@ -202,11 +208,13 @@ EXCEPTION_ENTRY(_bus_fault_handler)
 
 /* ---[ 0x300: Data Page Fault exception ]------------------------------- */
 EXCEPTION_ENTRY(_dtlb_miss_page_fault_handler)
+       CLEAR_LWA_FLAG(r3)
        l.and   r5,r5,r0
        l.j     1f
         l.nop
 
 EXCEPTION_ENTRY(_data_page_fault_handler)
+       CLEAR_LWA_FLAG(r3)
        /* set up parameters for do_page_fault */
        l.ori   r5,r0,0x300                // exception vector
 1:
@@ -220,7 +228,7 @@ EXCEPTION_ENTRY(_data_page_fault_handler)
         * DTLB miss handler in the CONFIG_GUARD_PROTECTED_CORE part
         */
 #ifdef CONFIG_OPENRISC_NO_SPR_SR_DSX
-       l.lwz   r6,PT_PC(r3)                  // address of an offending insn
+       l.lwz   r6,PT_PC(r3)               // address of an offending insn
        l.lwz   r6,0(r6)                   // instruction that caused pf
 
        l.srli  r6,r6,26                   // check opcode for jump insn
@@ -236,57 +244,57 @@ EXCEPTION_ENTRY(_data_page_fault_handler)
        l.bf    8f
        l.sfeqi r6,0x12                    // l.jalr
        l.bf    8f
-
-       l.nop
+        l.nop
 
        l.j     9f
-       l.nop
-8:
+        l.nop
 
-       l.lwz   r6,PT_PC(r3)                  // address of an offending insn
+8: // offending insn is in delay slot
+       l.lwz   r6,PT_PC(r3)               // address of an offending insn
        l.addi  r6,r6,4
        l.lwz   r6,0(r6)                   // instruction that caused pf
        l.srli  r6,r6,26                   // get opcode
-9:
+9: // offending instruction opcode loaded in r6
 
 #else
 
-       l.mfspr r6,r0,SPR_SR               // SR
-//     l.lwz   r6,PT_SR(r3)               // ESR
-       l.andi  r6,r6,SPR_SR_DSX           // check for delay slot exception
-       l.sfeqi r6,0x1                     // exception happened in delay slot
-       l.bnf   7f
-       l.lwz   r6,PT_PC(r3)               // address of an offending insn
+       l.lwz   r6,PT_SR(r3)               // SR
+       l.andi  r6,r6,SPR_SR_DSX           // check for delay slot exception
+       l.sfne  r6,r0                      // exception happened in delay slot
+       l.bnf   7f
+        l.lwz  r6,PT_PC(r3)               // address of an offending insn
 
-       l.addi  r6,r6,4                    // offending insn is in delay slot
+       l.addi  r6,r6,4                    // offending insn is in delay slot
 7:
        l.lwz   r6,0(r6)                   // instruction that caused pf
        l.srli  r6,r6,26                   // check opcode for write access
 #endif
 
-       l.sfgeui r6,0x33                   // check opcode for write access
+       l.sfgeui r6,0x33                   // check opcode for write access
        l.bnf   1f
        l.sfleui r6,0x37
        l.bnf   1f
        l.ori   r6,r0,0x1                  // write access
        l.j     2f
-       l.nop
+        l.nop
 1:     l.ori   r6,r0,0x0                  // !write access
 2:
 
        /* call fault.c handler in or32/mm/fault.c */
        l.jal   do_page_fault
-       l.nop
+        l.nop
        l.j     _ret_from_exception
-       l.nop
+        l.nop
 
 /* ---[ 0x400: Insn Page Fault exception ]------------------------------- */
 EXCEPTION_ENTRY(_itlb_miss_page_fault_handler)
+       CLEAR_LWA_FLAG(r3)
        l.and   r5,r5,r0
        l.j     1f
         l.nop
 
 EXCEPTION_ENTRY(_insn_page_fault_handler)
+       CLEAR_LWA_FLAG(r3)
        /* set up parameters for do_page_fault */
        l.ori   r5,r0,0x400                // exception vector
 1:
@@ -296,14 +304,15 @@ EXCEPTION_ENTRY(_insn_page_fault_handler)
 
        /* call fault.c handler in or32/mm/fault.c */
        l.jal   do_page_fault
-       l.nop
+        l.nop
        l.j     _ret_from_exception
-       l.nop
+        l.nop
 
 
 /* ---[ 0x500: Timer exception ]----------------------------------------- */
 
 EXCEPTION_ENTRY(_timer_handler)
+       CLEAR_LWA_FLAG(r3)
        l.jal   timer_interrupt
         l.addi r3,r1,0 /* pt_regs */
 
@@ -313,6 +322,7 @@ EXCEPTION_ENTRY(_timer_handler)
 /* ---[ 0x600: Aligment exception ]-------------------------------------- */
 
 EXCEPTION_ENTRY(_alignment_handler)
+       CLEAR_LWA_FLAG(r3)
        /* r4: EA of fault (set by EXCEPTION_HANDLE) */
        l.jal   do_unaligned_access
         l.addi  r3,r1,0 /* pt_regs */
@@ -509,6 +519,7 @@ EXCEPTION_ENTRY(_external_irq_handler)
 //     l.sw    PT_SR(r1),r4
 1:
 #endif
+       CLEAR_LWA_FLAG(r3)
        l.addi  r3,r1,0
        l.movhi r8,hi(do_IRQ)
        l.ori   r8,r8,lo(do_IRQ)
@@ -556,8 +567,12 @@ ENTRY(_sys_call_handler)
         * they should be clobbered, otherwise
         */
        l.sw    PT_GPR3(r1),r3
-       /* r4 already saved */
-       /* r4 holds the EEAR address of the fault, load the original r4 */
+       /*
+        * r4 already saved
+        * r4 holds the EEAR address of the fault, use it as scratch reg and
+        * then load the original r4
+        */
+       CLEAR_LWA_FLAG(r4)
        l.lwz   r4,PT_GPR4(r1)
        l.sw    PT_GPR5(r1),r5
        l.sw    PT_GPR6(r1),r6
@@ -776,6 +791,7 @@ UNHANDLED_EXCEPTION(_vector_0xd00,0xd00)
 /* ---[ 0xe00: Trap exception ]------------------------------------------ */
 
 EXCEPTION_ENTRY(_trap_handler)
+       CLEAR_LWA_FLAG(r3)
        /* r4: EA of fault (set by EXCEPTION_HANDLE) */
        l.jal   do_trap
         l.addi  r3,r1,0 /* pt_regs */
index f14793306b03f35681afebddf873e794148b40d9..d01b82eace3e67bfaac96aff712747e56247fdd0 100644 (file)
@@ -24,6 +24,7 @@
 #include <asm/page.h>
 #include <asm/mmu.h>
 #include <asm/pgtable.h>
+#include <asm/thread_info.h>
 #include <asm/cache.h>
 #include <asm/spr_defs.h>
 #include <asm/asm-offsets.h>
@@ -34,7 +35,7 @@
        l.add   rd,rd,rs
 
 #define CLEAR_GPR(gpr)                         \
-       l.or    gpr,r0,r0
+       l.movhi gpr,0x0
 
 #define LOAD_SYMBOL_2_GPR(gpr,symbol)          \
        l.movhi gpr,hi(symbol)                  ;\
@@ -442,6 +443,9 @@ _dispatch_do_ipage_fault:
        __HEAD
        .global _start
 _start:
+       /* Init r0 to zero as per spec */
+       CLEAR_GPR(r0)
+
        /* save kernel parameters */
        l.or    r25,r0,r3       /* pointer to fdt */
 
@@ -486,7 +490,8 @@ _start:
        /*
         * set up initial ksp and current
         */
-       LOAD_SYMBOL_2_GPR(r1,init_thread_union+0x2000)  // setup kernel stack
+       /* setup kernel stack */
+       LOAD_SYMBOL_2_GPR(r1,init_thread_union + THREAD_SIZE)
        LOAD_SYMBOL_2_GPR(r10,init_thread_union)        // setup current
        tophys  (r31,r10)
        l.sw    TI_KSP(r31), r1
@@ -520,22 +525,8 @@ enable_dc:
         l.nop
 
 flush_tlb:
-       /*
-        *  I N V A L I D A T E   T L B   e n t r i e s
-        */
-       LOAD_SYMBOL_2_GPR(r5,SPR_DTLBMR_BASE(0))
-       LOAD_SYMBOL_2_GPR(r6,SPR_ITLBMR_BASE(0))
-       l.addi  r7,r0,128 /* Maximum number of sets */
-1:
-       l.mtspr r5,r0,0x0
-       l.mtspr r6,r0,0x0
-
-       l.addi  r5,r5,1
-       l.addi  r6,r6,1
-       l.sfeq  r7,r0
-       l.bnf   1b
-        l.addi r7,r7,-1
-
+       l.jal   _flush_tlb
+        l.nop
 
 /* The MMU needs to be enabled before or32_early_setup is called */
 
@@ -627,6 +618,26 @@ jump_start_kernel:
        l.jr    r30
         l.nop
 
+_flush_tlb:
+       /*
+        *  I N V A L I D A T E   T L B   e n t r i e s
+        */
+       LOAD_SYMBOL_2_GPR(r5,SPR_DTLBMR_BASE(0))
+       LOAD_SYMBOL_2_GPR(r6,SPR_ITLBMR_BASE(0))
+       l.addi  r7,r0,128 /* Maximum number of sets */
+1:
+       l.mtspr r5,r0,0x0
+       l.mtspr r6,r0,0x0
+
+       l.addi  r5,r5,1
+       l.addi  r6,r6,1
+       l.sfeq  r7,r0
+       l.bnf   1b
+        l.addi r7,r7,-1
+
+       l.jr    r9
+        l.nop
+
 /* ========================================[ cache ]=== */
 
        /* aligment here so we don't change memory offsets with
@@ -971,8 +982,6 @@ ENTRY(dtlb_miss_handler)
        EXCEPTION_STORE_GPR2
        EXCEPTION_STORE_GPR3
        EXCEPTION_STORE_GPR4
-       EXCEPTION_STORE_GPR5
-       EXCEPTION_STORE_GPR6
        /*
         * get EA of the miss
         */
@@ -980,91 +989,70 @@ ENTRY(dtlb_miss_handler)
        /*
         * pmd = (pmd_t *)(current_pgd + pgd_index(daddr));
         */
-       GET_CURRENT_PGD(r3,r5)          // r3 is current_pgd, r5 is temp
+       GET_CURRENT_PGD(r3,r4)          // r3 is current_pgd, r4 is temp
        l.srli  r4,r2,0x18              // >> PAGE_SHIFT + (PAGE_SHIFT - 2)
        l.slli  r4,r4,0x2               // to get address << 2
-       l.add   r5,r4,r3                // r4 is pgd_index(daddr)
+       l.add   r3,r4,r3                // r4 is pgd_index(daddr)
        /*
         * if (pmd_none(*pmd))
         *   goto pmd_none:
         */
-       tophys  (r4,r5)
+       tophys  (r4,r3)
        l.lwz   r3,0x0(r4)              // get *pmd value
        l.sfne  r3,r0
        l.bnf   d_pmd_none
-        l.andi r3,r3,~PAGE_MASK //0x1fff               // ~PAGE_MASK
-       /*
-        * if (pmd_bad(*pmd))
-        *   pmd_clear(pmd)
-        *   goto pmd_bad:
-        */
-//     l.sfeq  r3,r0                   // check *pmd value
-//     l.bf    d_pmd_good
-       l.addi  r3,r0,0xffffe000        // PAGE_MASK
-//     l.j     d_pmd_bad
-//     l.sw    0x0(r4),r0              // clear pmd
+        l.addi r3,r0,0xffffe000        // PAGE_MASK
+
 d_pmd_good:
        /*
         * pte = *pte_offset(pmd, daddr);
         */
        l.lwz   r4,0x0(r4)              // get **pmd value
        l.and   r4,r4,r3                // & PAGE_MASK
-       l.srli  r5,r2,0xd               // >> PAGE_SHIFT, r2 == EEAR
-       l.andi  r3,r5,0x7ff             // (1UL << PAGE_SHIFT - 2) - 1
+       l.srli  r2,r2,0xd               // >> PAGE_SHIFT, r2 == EEAR
+       l.andi  r3,r2,0x7ff             // (1UL << PAGE_SHIFT - 2) - 1
        l.slli  r3,r3,0x2               // to get address << 2
        l.add   r3,r3,r4
-       l.lwz   r2,0x0(r3)              // this is pte at last
+       l.lwz   r3,0x0(r3)              // this is pte at last
        /*
         * if (!pte_present(pte))
         */
-       l.andi  r4,r2,0x1
+       l.andi  r4,r3,0x1
        l.sfne  r4,r0                   // is pte present
        l.bnf   d_pte_not_present
-       l.addi  r3,r0,0xffffe3fa        // PAGE_MASK | DTLB_UP_CONVERT_MASK
+       l.addi  r4,r0,0xffffe3fa        // PAGE_MASK | DTLB_UP_CONVERT_MASK
        /*
         * fill DTLB TR register
         */
-       l.and   r4,r2,r3                // apply the mask
+       l.and   r4,r3,r4                // apply the mask
        // Determine number of DMMU sets
-       l.mfspr r6, r0, SPR_DMMUCFGR
-       l.andi  r6, r6, SPR_DMMUCFGR_NTS
-       l.srli  r6, r6, SPR_DMMUCFGR_NTS_OFF
+       l.mfspr r2, r0, SPR_DMMUCFGR
+       l.andi  r2, r2, SPR_DMMUCFGR_NTS
+       l.srli  r2, r2, SPR_DMMUCFGR_NTS_OFF
        l.ori   r3, r0, 0x1
-       l.sll   r3, r3, r6      // r3 = number DMMU sets DMMUCFGR
-       l.addi  r6, r3, -1      // r6 = nsets mask
-       l.and   r5, r5, r6      // calc offset:  & (NUM_TLB_ENTRIES-1)
+       l.sll   r3, r3, r2      // r3 = number DMMU sets DMMUCFGR
+       l.addi  r2, r3, -1      // r2 = nsets mask
+       l.mfspr r3, r0, SPR_EEAR_BASE
+       l.srli  r3, r3, 0xd     // >> PAGE_SHIFT
+       l.and   r2, r3, r2      // calc offset:  & (NUM_TLB_ENTRIES-1)
                                                           //NUM_TLB_ENTRIES
-       l.mtspr r5,r4,SPR_DTLBTR_BASE(0)
+       l.mtspr r2,r4,SPR_DTLBTR_BASE(0)
        /*
         * fill DTLB MR register
         */
-       l.mfspr r2,r0,SPR_EEAR_BASE
-       l.addi  r3,r0,0xffffe000        // PAGE_MASK
-       l.and   r4,r2,r3                // apply PAGE_MASK to EA (__PHX__ do we really need this?)
-       l.ori   r4,r4,0x1               // set hardware valid bit: DTBL_MR entry
-       l.mtspr r5,r4,SPR_DTLBMR_BASE(0)
+       l.slli  r3, r3, 0xd             /* << PAGE_SHIFT => EA & PAGE_MASK */
+       l.ori   r4,r3,0x1               // set hardware valid bit: DTBL_MR entry
+       l.mtspr r2,r4,SPR_DTLBMR_BASE(0)
 
        EXCEPTION_LOAD_GPR2
        EXCEPTION_LOAD_GPR3
        EXCEPTION_LOAD_GPR4
-       EXCEPTION_LOAD_GPR5
-       EXCEPTION_LOAD_GPR6
-       l.rfe
-d_pmd_bad:
-       l.nop   1
-       EXCEPTION_LOAD_GPR2
-       EXCEPTION_LOAD_GPR3
-       EXCEPTION_LOAD_GPR4
-       EXCEPTION_LOAD_GPR5
-       EXCEPTION_LOAD_GPR6
        l.rfe
 d_pmd_none:
 d_pte_not_present:
        EXCEPTION_LOAD_GPR2
        EXCEPTION_LOAD_GPR3
        EXCEPTION_LOAD_GPR4
-       EXCEPTION_LOAD_GPR5
-       EXCEPTION_LOAD_GPR6
        EXCEPTION_HANDLE(_dtlb_miss_page_fault_handler)
 
 /* ==============================================[ ITLB miss handler ]=== */
@@ -1072,8 +1060,6 @@ ENTRY(itlb_miss_handler)
        EXCEPTION_STORE_GPR2
        EXCEPTION_STORE_GPR3
        EXCEPTION_STORE_GPR4
-       EXCEPTION_STORE_GPR5
-       EXCEPTION_STORE_GPR6
        /*
         * get EA of the miss
         */
@@ -1083,30 +1069,19 @@ ENTRY(itlb_miss_handler)
         * pmd = (pmd_t *)(current_pgd + pgd_index(daddr));
         *
         */
-       GET_CURRENT_PGD(r3,r5)          // r3 is current_pgd, r5 is temp
+       GET_CURRENT_PGD(r3,r4)          // r3 is current_pgd, r4 is temp
        l.srli  r4,r2,0x18              // >> PAGE_SHIFT + (PAGE_SHIFT - 2)
        l.slli  r4,r4,0x2               // to get address << 2
-       l.add   r5,r4,r3                // r4 is pgd_index(daddr)
+       l.add   r3,r4,r3                // r4 is pgd_index(daddr)
        /*
         * if (pmd_none(*pmd))
         *   goto pmd_none:
         */
-       tophys  (r4,r5)
+       tophys  (r4,r3)
        l.lwz   r3,0x0(r4)              // get *pmd value
        l.sfne  r3,r0
        l.bnf   i_pmd_none
-       l.andi  r3,r3,0x1fff            // ~PAGE_MASK
-       /*
-        * if (pmd_bad(*pmd))
-        *   pmd_clear(pmd)
-        *   goto pmd_bad:
-        */
-
-//     l.sfeq  r3,r0                   // check *pmd value
-//     l.bf    i_pmd_good
-       l.addi  r3,r0,0xffffe000        // PAGE_MASK
-//     l.j     i_pmd_bad
-//     l.sw    0x0(r4),r0              // clear pmd
+        l.addi r3,r0,0xffffe000        // PAGE_MASK
 
 i_pmd_good:
        /*
@@ -1115,35 +1090,36 @@ i_pmd_good:
         */
        l.lwz   r4,0x0(r4)              // get **pmd value
        l.and   r4,r4,r3                // & PAGE_MASK
-       l.srli  r5,r2,0xd               // >> PAGE_SHIFT, r2 == EEAR
-       l.andi  r3,r5,0x7ff             // (1UL << PAGE_SHIFT - 2) - 1
+       l.srli  r2,r2,0xd               // >> PAGE_SHIFT, r2 == EEAR
+       l.andi  r3,r2,0x7ff             // (1UL << PAGE_SHIFT - 2) - 1
        l.slli  r3,r3,0x2               // to get address << 2
        l.add   r3,r3,r4
-       l.lwz   r2,0x0(r3)              // this is pte at last
+       l.lwz   r3,0x0(r3)              // this is pte at last
        /*
         * if (!pte_present(pte))
         *
         */
-       l.andi  r4,r2,0x1
+       l.andi  r4,r3,0x1
        l.sfne  r4,r0                   // is pte present
        l.bnf   i_pte_not_present
-       l.addi  r3,r0,0xffffe03a        // PAGE_MASK | ITLB_UP_CONVERT_MASK
+        l.addi r4,r0,0xffffe03a        // PAGE_MASK | ITLB_UP_CONVERT_MASK
        /*
         * fill ITLB TR register
         */
-       l.and   r4,r2,r3                // apply the mask
-       l.andi  r3,r2,0x7c0             // _PAGE_EXEC | _PAGE_SRE | _PAGE_SWE |  _PAGE_URE | _PAGE_UWE
-//     l.andi  r3,r2,0x400             // _PAGE_EXEC
+       l.and   r4,r3,r4                // apply the mask
+       l.andi  r3,r3,0x7c0             // _PAGE_EXEC | _PAGE_SRE | _PAGE_SWE |  _PAGE_URE | _PAGE_UWE
        l.sfeq  r3,r0
        l.bf    itlb_tr_fill //_workaround
        // Determine number of IMMU sets
-       l.mfspr r6, r0, SPR_IMMUCFGR
-       l.andi  r6, r6, SPR_IMMUCFGR_NTS
-       l.srli  r6, r6, SPR_IMMUCFGR_NTS_OFF
+       l.mfspr r2, r0, SPR_IMMUCFGR
+       l.andi  r2, r2, SPR_IMMUCFGR_NTS
+       l.srli  r2, r2, SPR_IMMUCFGR_NTS_OFF
        l.ori   r3, r0, 0x1
-       l.sll   r3, r3, r6      // r3 = number IMMU sets IMMUCFGR
-       l.addi  r6, r3, -1      // r6 = nsets mask
-       l.and   r5, r5, r6      // calc offset:  & (NUM_TLB_ENTRIES-1)
+       l.sll   r3, r3, r2      // r3 = number IMMU sets IMMUCFGR
+       l.addi  r2, r3, -1      // r2 = nsets mask
+       l.mfspr r3, r0, SPR_EEAR_BASE
+       l.srli  r3, r3, 0xd     // >> PAGE_SHIFT
+       l.and   r2, r3, r2      // calc offset:  & (NUM_TLB_ENTRIES-1)
 
 /*
  * __PHX__ :: fixme
@@ -1155,38 +1131,24 @@ i_pmd_good:
 itlb_tr_fill_workaround:
        l.ori   r4,r4,0xc0              // | (SPR_ITLBTR_UXE | ITLBTR_SXE)
 itlb_tr_fill:
-       l.mtspr r5,r4,SPR_ITLBTR_BASE(0)
+       l.mtspr r2,r4,SPR_ITLBTR_BASE(0)
        /*
         * fill DTLB MR register
         */
-       l.mfspr r2,r0,SPR_EEAR_BASE
-       l.addi  r3,r0,0xffffe000        // PAGE_MASK
-       l.and   r4,r2,r3                // apply PAGE_MASK to EA (__PHX__ do we really need this?)
-       l.ori   r4,r4,0x1               // set hardware valid bit: DTBL_MR entry
-       l.mtspr r5,r4,SPR_ITLBMR_BASE(0)
+       l.slli  r3, r3, 0xd             /* << PAGE_SHIFT => EA & PAGE_MASK */
+       l.ori   r4,r3,0x1               // set hardware valid bit: ITBL_MR entry
+       l.mtspr r2,r4,SPR_ITLBMR_BASE(0)
 
        EXCEPTION_LOAD_GPR2
        EXCEPTION_LOAD_GPR3
        EXCEPTION_LOAD_GPR4
-       EXCEPTION_LOAD_GPR5
-       EXCEPTION_LOAD_GPR6
        l.rfe
 
-i_pmd_bad:
-       l.nop   1
-       EXCEPTION_LOAD_GPR2
-       EXCEPTION_LOAD_GPR3
-       EXCEPTION_LOAD_GPR4
-       EXCEPTION_LOAD_GPR5
-       EXCEPTION_LOAD_GPR6
-       l.rfe
 i_pmd_none:
 i_pte_not_present:
        EXCEPTION_LOAD_GPR2
        EXCEPTION_LOAD_GPR3
        EXCEPTION_LOAD_GPR4
-       EXCEPTION_LOAD_GPR5
-       EXCEPTION_LOAD_GPR6
        EXCEPTION_HANDLE(_itlb_miss_page_fault_handler)
 
 /* ==============================================[ boot tlb handlers ]=== */
@@ -1571,12 +1533,7 @@ ENTRY(_early_uart_init)
        l.jr    r9
        l.nop
 
-_string_copying_linux:
-       .string "\n\n\n\n\n\rCopying Linux... \0"
-
-_string_ok_booting:
-       .string "Ok, booting the kernel.\n\r\0"
-
+       .section .rodata
 _string_unhandled_exception:
        .string "\n\rRunarunaround: Unhandled exception 0x\0"
 
@@ -1586,11 +1543,6 @@ _string_epc_prefix:
 _string_nl:
        .string "\n\r\0"
 
-       .global _string_esr_irq_bug
-_string_esr_irq_bug:
-       .string "\n\rESR external interrupt bug, for details look into entry.S\n\r\0"
-
-
 
 /* ========================================[ page aligned structures ]=== */
 
index 86e31cf1de1d1d3c43e671b9337c9917219fa6e9..5c4695d13542fc003054995b728ac468e18bd94c 100644 (file)
@@ -44,3 +44,4 @@ DECLARE_EXPORT(__ashldi3);
 DECLARE_EXPORT(__lshrdi3);
 
 EXPORT_SYMBOL(__copy_tofrom_user);
+EXPORT_SYMBOL(memset);
index d7990df9025a6e1f77c1e98668a625700c323bea..6e9d1cb519f245777ada914e1a9e009ed5d7980a 100644 (file)
@@ -75,6 +75,17 @@ void machine_power_off(void)
        __asm__("l.nop 1");
 }
 
+/*
+ * Send the doze signal to the cpu if available.
+ * Make sure that all interrupts are enabled
+ */
+void arch_cpu_idle(void)
+{
+       local_irq_enable();
+       if (mfspr(SPR_UPR) & SPR_UPR_PMP)
+               mtspr(SPR_PMR, mfspr(SPR_PMR) | SPR_PMR_DME);
+}
+
 void (*pm_power_off) (void) = machine_power_off;
 
 /*
@@ -226,6 +237,7 @@ int dump_fpu(struct pt_regs *regs, elf_fpregset_t * fpu)
 
 extern struct thread_info *_switch(struct thread_info *old_ti,
                                   struct thread_info *new_ti);
+extern int lwa_flag;
 
 struct task_struct *__switch_to(struct task_struct *old,
                                struct task_struct *new)
@@ -243,6 +255,8 @@ struct task_struct *__switch_to(struct task_struct *old,
        new_ti = new->stack;
        old_ti = old->stack;
 
+       lwa_flag = 0;
+
        current_thread_info_set[smp_processor_id()] = new_ti;
        last = (_switch(old_ti, new_ti))->task;
 
index 4f59fa4e34e5f2c795c3d1e150520513dd78d7d3..228288887d74facd56c89f1f71e102e940ae540e 100644 (file)
@@ -16,7 +16,6 @@
  *      2 of the License, or (at your option) any later version.
  */
 
-#include <stddef.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/string.h>
index cb797a3beb47740717528a042c5205dd265fc375..dbf5ee95a0d5f2ba8e2e6f453234afa954e04f45 100644 (file)
@@ -117,13 +117,15 @@ static void print_cpuinfo(void)
        if (upr & SPR_UPR_DCP)
                printk(KERN_INFO
                       "-- dcache: %4d bytes total, %2d bytes/line, %d way(s)\n",
-                      cpuinfo.dcache_size, cpuinfo.dcache_block_size, 1);
+                      cpuinfo.dcache_size, cpuinfo.dcache_block_size,
+                      cpuinfo.dcache_ways);
        else
                printk(KERN_INFO "-- dcache disabled\n");
        if (upr & SPR_UPR_ICP)
                printk(KERN_INFO
                       "-- icache: %4d bytes total, %2d bytes/line, %d way(s)\n",
-                      cpuinfo.icache_size, cpuinfo.icache_block_size, 1);
+                      cpuinfo.icache_size, cpuinfo.icache_block_size,
+                      cpuinfo.icache_ways);
        else
                printk(KERN_INFO "-- icache disabled\n");
 
@@ -155,25 +157,25 @@ void __init setup_cpuinfo(void)
 {
        struct device_node *cpu;
        unsigned long iccfgr, dccfgr;
-       unsigned long cache_set_size, cache_ways;
+       unsigned long cache_set_size;
 
        cpu = of_find_compatible_node(NULL, NULL, "opencores,or1200-rtlsvn481");
        if (!cpu)
                panic("No compatible CPU found in device tree...\n");
 
        iccfgr = mfspr(SPR_ICCFGR);
-       cache_ways = 1 << (iccfgr & SPR_ICCFGR_NCW);
+       cpuinfo.icache_ways = 1 << (iccfgr & SPR_ICCFGR_NCW);
        cache_set_size = 1 << ((iccfgr & SPR_ICCFGR_NCS) >> 3);
        cpuinfo.icache_block_size = 16 << ((iccfgr & SPR_ICCFGR_CBS) >> 7);
        cpuinfo.icache_size =
-           cache_set_size * cache_ways * cpuinfo.icache_block_size;
+           cache_set_size * cpuinfo.icache_ways * cpuinfo.icache_block_size;
 
        dccfgr = mfspr(SPR_DCCFGR);
-       cache_ways = 1 << (dccfgr & SPR_DCCFGR_NCW);
+       cpuinfo.dcache_ways = 1 << (dccfgr & SPR_DCCFGR_NCW);
        cache_set_size = 1 << ((dccfgr & SPR_DCCFGR_NCS) >> 3);
        cpuinfo.dcache_block_size = 16 << ((dccfgr & SPR_DCCFGR_CBS) >> 7);
        cpuinfo.dcache_size =
-           cache_set_size * cache_ways * cpuinfo.dcache_block_size;
+           cache_set_size * cpuinfo.dcache_ways * cpuinfo.dcache_block_size;
 
        if (of_property_read_u32(cpu, "clock-frequency",
                                 &cpuinfo.clock_frequency)) {
@@ -308,30 +310,33 @@ static int show_cpuinfo(struct seq_file *m, void *v)
        revision = vr & SPR_VR_REV;
 
        seq_printf(m,
-                  "cpu\t\t: OpenRISC-%x\n"
-                  "revision\t: %d\n"
-                  "frequency\t: %ld\n"
-                  "dcache size\t: %d bytes\n"
-                  "dcache block size\t: %d bytes\n"
-                  "icache size\t: %d bytes\n"
-                  "icache block size\t: %d bytes\n"
-                  "immu\t\t: %d entries, %lu ways\n"
-                  "dmmu\t\t: %d entries, %lu ways\n"
-                  "bogomips\t: %lu.%02lu\n",
-                  version,
-                  revision,
-                  loops_per_jiffy * HZ,
-                  cpuinfo.dcache_size,
-                  cpuinfo.dcache_block_size,
-                  cpuinfo.icache_size,
-                  cpuinfo.icache_block_size,
-                  1 << ((mfspr(SPR_DMMUCFGR) & SPR_DMMUCFGR_NTS) >> 2),
-                  1 + (mfspr(SPR_DMMUCFGR) & SPR_DMMUCFGR_NTW),
-                  1 << ((mfspr(SPR_IMMUCFGR) & SPR_IMMUCFGR_NTS) >> 2),
-                  1 + (mfspr(SPR_IMMUCFGR) & SPR_IMMUCFGR_NTW),
-                  (loops_per_jiffy * HZ) / 500000,
-                  ((loops_per_jiffy * HZ) / 5000) % 100);
-
+                 "cpu\t\t: OpenRISC-%x\n"
+                 "revision\t: %d\n"
+                 "frequency\t: %ld\n"
+                 "dcache size\t: %d bytes\n"
+                 "dcache block size\t: %d bytes\n"
+                 "dcache ways\t: %d\n"
+                 "icache size\t: %d bytes\n"
+                 "icache block size\t: %d bytes\n"
+                 "icache ways\t: %d\n"
+                 "immu\t\t: %d entries, %lu ways\n"
+                 "dmmu\t\t: %d entries, %lu ways\n"
+                 "bogomips\t: %lu.%02lu\n",
+                 version,
+                 revision,
+                 loops_per_jiffy * HZ,
+                 cpuinfo.dcache_size,
+                 cpuinfo.dcache_block_size,
+                 cpuinfo.dcache_ways,
+                 cpuinfo.icache_size,
+                 cpuinfo.icache_block_size,
+                 cpuinfo.icache_ways,
+                 1 << ((mfspr(SPR_DMMUCFGR) & SPR_DMMUCFGR_NTS) >> 2),
+                 1 + (mfspr(SPR_DMMUCFGR) & SPR_DMMUCFGR_NTW),
+                 1 << ((mfspr(SPR_IMMUCFGR) & SPR_IMMUCFGR_NTS) >> 2),
+                 1 + (mfspr(SPR_IMMUCFGR) & SPR_IMMUCFGR_NTW),
+                 (loops_per_jiffy * HZ) / 500000,
+                 ((loops_per_jiffy * HZ) / 5000) % 100);
        return 0;
 }
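
A worked example of the cache-geometry arithmetic in setup_cpuinfo() above, using a hypothetical ICCFGR value (1 way, 256 sets, 16-byte lines):

	unsigned long ways = 1 << 0;		/* SPR_ICCFGR_NCW field = 0 -> 1 way */
	unsigned long sets = 1 << 8;		/* SPR_ICCFGR_NCS field = 8 -> 256 sets */
	unsigned long line = 16 << 0;		/* SPR_ICCFGR_CBS field = 0 -> 16-byte blocks */
	unsigned long size = sets * ways * line;	/* 4096 bytes of icache */
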
 
index d29c41bfbffaab232cf21d781aed6893fde12c06..7e81ad258bca39bf20867060cf012bd24bed346e 100644 (file)
@@ -40,6 +40,8 @@
 extern char _etext, _stext;
 
 int kstack_depth_to_print = 0x180;
+int lwa_flag;
+unsigned long __user *lwa_addr;
 
 static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
 {
@@ -334,10 +336,191 @@ asmlinkage void do_bus_fault(struct pt_regs *regs, unsigned long address)
        }
 }
 
+static inline int in_delay_slot(struct pt_regs *regs)
+{
+#ifdef CONFIG_OPENRISC_NO_SPR_SR_DSX
+       /* No delay slot flag, do the old way */
+       unsigned int op, insn;
+
+       insn = *((unsigned int *)regs->pc);
+       op = insn >> 26;
+       switch (op) {
+       case 0x00: /* l.j */
+       case 0x01: /* l.jal */
+       case 0x03: /* l.bnf */
+       case 0x04: /* l.bf */
+       case 0x11: /* l.jr */
+       case 0x12: /* l.jalr */
+               return 1;
+       default:
+               return 0;
+       }
+#else
+       return regs->sr & SPR_SR_DSX;
+#endif
+}
+
+static inline void adjust_pc(struct pt_regs *regs, unsigned long address)
+{
+       int displacement;
+       unsigned int rb, op, jmp;
+
+       if (unlikely(in_delay_slot(regs))) {
+               /* In delay slot, instruction at pc is a branch, simulate it */
+               jmp = *((unsigned int *)regs->pc);
+
+               displacement = sign_extend32(((jmp) & 0x3ffffff) << 2, 27);
+               rb = (jmp & 0x0000ffff) >> 11;
+               op = jmp >> 26;
+
+               switch (op) {
+               case 0x00: /* l.j */
+                       regs->pc += displacement;
+                       return;
+               case 0x01: /* l.jal */
+                       regs->pc += displacement;
+                       regs->gpr[9] = regs->pc + 8;
+                       return;
+               case 0x03: /* l.bnf */
+                       if (regs->sr & SPR_SR_F)
+                               regs->pc += 8;
+                       else
+                               regs->pc += displacement;
+                       return;
+               case 0x04: /* l.bf */
+                       if (regs->sr & SPR_SR_F)
+                               regs->pc += displacement;
+                       else
+                               regs->pc += 8;
+                       return;
+               case 0x11: /* l.jr */
+                       regs->pc = regs->gpr[rb];
+                       return;
+               case 0x12: /* l.jalr */
+                       regs->pc = regs->gpr[rb];
+                       regs->gpr[9] = regs->pc + 8;
+                       return;
+               default:
+                       break;
+               }
+       } else {
+               regs->pc += 4;
+       }
+}
+
+static inline void simulate_lwa(struct pt_regs *regs, unsigned long address,
+                               unsigned int insn)
+{
+       unsigned int ra, rd;
+       unsigned long value;
+       unsigned long orig_pc;
+       long imm;
+
+       const struct exception_table_entry *entry;
+
+       orig_pc = regs->pc;
+       adjust_pc(regs, address);
+
+       ra = (insn >> 16) & 0x1f;
+       rd = (insn >> 21) & 0x1f;
+       imm = (short)insn;
+       lwa_addr = (unsigned long __user *)(regs->gpr[ra] + imm);
+
+       if ((unsigned long)lwa_addr & 0x3) {
+               do_unaligned_access(regs, address);
+               return;
+       }
+
+       if (get_user(value, lwa_addr)) {
+               if (user_mode(regs)) {
+                       force_sig(SIGSEGV, current);
+                       return;
+               }
+
+               if ((entry = search_exception_tables(orig_pc))) {
+                       regs->pc = entry->fixup;
+                       return;
+               }
+
+               /* kernel access in kernel space, load it directly */
+               value = *((unsigned long *)lwa_addr);
+       }
+
+       lwa_flag = 1;
+       regs->gpr[rd] = value;
+}
+
+static inline void simulate_swa(struct pt_regs *regs, unsigned long address,
+                               unsigned int insn)
+{
+       unsigned long __user *vaddr;
+       unsigned long orig_pc;
+       unsigned int ra, rb;
+       long imm;
+
+       const struct exception_table_entry *entry;
+
+       orig_pc = regs->pc;
+       adjust_pc(regs, address);
+
+       ra = (insn >> 16) & 0x1f;
+       rb = (insn >> 11) & 0x1f;
+       imm = (short)(((insn & 0x2200000) >> 10) | (insn & 0x7ff));
+       vaddr = (unsigned long __user *)(regs->gpr[ra] + imm);
+
+       if (!lwa_flag || vaddr != lwa_addr) {
+               regs->sr &= ~SPR_SR_F;
+               return;
+       }
+
+       if ((unsigned long)vaddr & 0x3) {
+               do_unaligned_access(regs, address);
+               return;
+       }
+
+       if (put_user(regs->gpr[rb], vaddr)) {
+               if (user_mode(regs)) {
+                       force_sig(SIGSEGV, current);
+                       return;
+               }
+
+               if ((entry = search_exception_tables(orig_pc))) {
+                       regs->pc = entry->fixup;
+                       return;
+               }
+
+               /* kernel access in kernel space, store it directly */
+               *((unsigned long *)vaddr) = regs->gpr[rb];
+       }
+
+       lwa_flag = 0;
+       regs->sr |= SPR_SR_F;
+}
+
+#define INSN_LWA       0x1b
+#define INSN_SWA       0x33
+
 asmlinkage void do_illegal_instruction(struct pt_regs *regs,
                                       unsigned long address)
 {
        siginfo_t info;
+       unsigned int op;
+       unsigned int insn = *((unsigned int *)address);
+
+       op = insn >> 26;
+
+       switch (op) {
+       case INSN_LWA:
+               simulate_lwa(regs, address, insn);
+               return;
+
+       case INSN_SWA:
+               simulate_swa(regs, address, insn);
+               return;
+
+       default:
+               break;
+       }
 
        if (user_mode(regs)) {
                /* Send a SIGILL */
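
To show where this trap path comes from: user space (and libgcc's atomics) issues l.lwa/l.swa pairs much like the kernel's own __xchg(); on cores that lack the instructions, the opcodes fault as illegal and the simulate_lwa()/simulate_swa() handlers above replay them. A minimal sketch of such a caller, assuming GCC inline asm and mirroring the __xchg() sequence:

static inline unsigned long atomic_swap(volatile unsigned long *p,
					unsigned long val)
{
	__asm__ __volatile__(
		"1:	l.lwa	%0, 0(%1)	\n"	/* load-linked */
		"	l.swa	0(%1), %2	\n"	/* store-conditional */
		"	l.bnf	1b		\n"	/* retry if reservation lost */
		"	 l.nop			\n"
		: "=&r"(val)
		: "r"(p), "r"(val)
		: "cc", "memory");
	return val;
}
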
index 966f65dbc6f013ed02fb59233535e1b432cfabf2..17d9d37f32d2ec3cd7086548f0d4190010897a99 100644 (file)
@@ -2,4 +2,4 @@
 # Makefile for or32 specific library files..
 #
 
-obj-y  = string.o delay.o
+obj-y  := delay.o string.o memset.o memcpy.o
diff --git a/arch/openrisc/lib/memcpy.c b/arch/openrisc/lib/memcpy.c
new file mode 100644 (file)
index 0000000..669887a
--- /dev/null
@@ -0,0 +1,124 @@
+/*
+ * arch/openrisc/lib/memcpy.c
+ *
+ * Optimized memory copy routines for openrisc.  These are mostly copied
+ * from other sources but slightly extended based on ideas discussed in
+ * #openrisc.
+ *
+ * The word unroll implementation is an extension to the arm byte
+ * unrolled implementation, but using word copies (if things are
+ * properly aligned)
+ *
+ * The great arm loop unroll algorithm can be found at:
+ *  arch/arm/boot/compressed/string.c
+ */
+
+#include <linux/export.h>
+
+#include <linux/string.h>
+
+#ifdef CONFIG_OR1K_1200
+/*
+ * Do memcpy with word copies and loop unrolling. This gives the
+ * best performance on the OR1200 and MOR1KX architectures
+ */
+void *memcpy(void *dest, __const void *src, __kernel_size_t n)
+{
+       int i = 0;
+       unsigned char *d, *s;
+       uint32_t *dest_w = (uint32_t *)dest, *src_w = (uint32_t *)src;
+
+       /* If both source and dest are word aligned copy words */
+       if (!((unsigned int)dest_w & 3) && !((unsigned int)src_w & 3)) {
+               /* Copy 32 bytes per loop */
+               for (i = n >> 5; i > 0; i--) {
+                       *dest_w++ = *src_w++;
+                       *dest_w++ = *src_w++;
+                       *dest_w++ = *src_w++;
+                       *dest_w++ = *src_w++;
+                       *dest_w++ = *src_w++;
+                       *dest_w++ = *src_w++;
+                       *dest_w++ = *src_w++;
+                       *dest_w++ = *src_w++;
+               }
+
+               if (n & 1 << 4) {
+                       *dest_w++ = *src_w++;
+                       *dest_w++ = *src_w++;
+                       *dest_w++ = *src_w++;
+                       *dest_w++ = *src_w++;
+               }
+
+               if (n & 1 << 3) {
+                       *dest_w++ = *src_w++;
+                       *dest_w++ = *src_w++;
+               }
+
+               if (n & 1 << 2)
+                       *dest_w++ = *src_w++;
+
+               d = (unsigned char *)dest_w;
+               s = (unsigned char *)src_w;
+
+       } else {
+               d = (unsigned char *)dest_w;
+               s = (unsigned char *)src_w;
+
+               for (i = n >> 3; i > 0; i--) {
+                       *d++ = *s++;
+                       *d++ = *s++;
+                       *d++ = *s++;
+                       *d++ = *s++;
+                       *d++ = *s++;
+                       *d++ = *s++;
+                       *d++ = *s++;
+                       *d++ = *s++;
+               }
+
+               if (n & 1 << 2) {
+                       *d++ = *s++;
+                       *d++ = *s++;
+                       *d++ = *s++;
+                       *d++ = *s++;
+               }
+       }
+
+       if (n & 1 << 1) {
+               *d++ = *s++;
+               *d++ = *s++;
+       }
+
+       if (n & 1)
+               *d++ = *s++;
+
+       return dest;
+}
+#else
+/*
+ * Use word copies but no loop unrolling as we cannot assume there
+ * will be benefits on the architecture
+ */
+void *memcpy(void *dest, __const void *src, __kernel_size_t n)
+{
+       unsigned char *d = (unsigned char *)dest, *s = (unsigned char *)src;
+       uint32_t *dest_w = (uint32_t *)dest, *src_w = (uint32_t *)src;
+
+       /* If both source and dest are word aligned copy words */
+       if (!((unsigned int)dest_w & 3) && !((unsigned int)src_w & 3)) {
+               for (; n >= 4; n -= 4)
+                       *dest_w++ = *src_w++;
+       }
+
+       d = (unsigned char *)dest_w;
+       s = (unsigned char *)src_w;
+
+       /* For remaining or if not aligned, copy bytes */
+       for (; n >= 1; n -= 1)
+               *d++ = *s++;
+
+       return dest;
+
+}
+#endif
+
+EXPORT_SYMBOL(memcpy);
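
A worked example of the bit-test dispatch in the unrolled path above, for a hypothetical aligned copy of n = 53 bytes:

/*
 * n = 53:  n >> 5 = 1   -> one unrolled pass     (32 bytes)
 *          n & 16 != 0  -> four word copies      (16 bytes)
 *          n & 8  == 0  -> skipped
 *          n & 4  != 0  -> one word copy         ( 4 bytes)
 *          n & 2  == 0  -> skipped
 *          n & 1  != 0  -> one trailing byte     ( 1 byte)
 *                                         total:   53 bytes
 */
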
diff --git a/arch/openrisc/lib/memset.S b/arch/openrisc/lib/memset.S
new file mode 100644 (file)
index 0000000..92cc2ea
--- /dev/null
@@ -0,0 +1,98 @@
+/*
+ * OpenRISC memset.S
+ *
+ * Hand-optimized assembler version of memset for OpenRISC.
+ * Algorithm inspired by several other arch-specific memset routines
+ * in the kernel tree
+ *
+ * Copyright (C) 2015 Olof Kindgren <olof.kindgren@gmail.com>
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+       .global memset
+       .type   memset, @function
+memset:
+       /* arguments:
+        * r3 = *s
+        * r4 = c
+        * r5 = n
+        * r13, r15, r17, r19 used as temp regs
+       */
+
+       /* Exit if n == 0 */
+       l.sfeqi         r5, 0
+       l.bf            4f
+
+       /* Truncate c to char */
+       l.andi          r13, r4, 0xff
+
+       /* Skip word extension if c is 0 */
+       l.sfeqi         r13, 0
+       l.bf            1f
+       /* Check for at least two whole words (8 bytes) */
+        l.sfleui       r5, 7
+
+       /* Extend char c to 32-bit word cccc in r13 */
+       l.slli          r15, r13, 16  // r13 = 000c, r15 = 0c00
+       l.or            r13, r13, r15 // r13 = 0c0c, r15 = 0c00
+       l.slli          r15, r13, 8   // r13 = 0c0c, r15 = c0c0
+       l.or            r13, r13, r15 // r13 = cccc, r15 = c0c0
+
+1:     l.addi          r19, r3, 0 // Set r19 = src
+       /* Jump to byte copy loop if less than two words */
+       l.bf            3f
+        l.or           r17, r5, r0 // Set r17 = n
+
+       /* Mask out two LSBs to check alignment */
+       l.andi          r15, r3, 0x3
+
+       /* lsb == 00, jump to word copy loop */
+       l.sfeqi         r15, 0
+       l.bf            2f
+        l.addi         r19, r3, 0 // Set r19 = src
+
+       /* lsb == 01,10 or 11 */
+       l.sb            0(r3), r13   // *src = c
+       l.addi          r17, r17, -1 // Decrease n
+
+       l.sfeqi         r15, 3
+       l.bf            2f
+        l.addi         r19, r3, 1  // src += 1
+
+       /* lsb == 01 or 10 */
+       l.sb            1(r3), r13   // *(src+1) = c
+       l.addi          r17, r17, -1 // Decrease n
+
+       l.sfeqi         r15, 2
+       l.bf            2f
+        l.addi         r19, r3, 2  // src += 2
+
+       /* lsb == 01 */
+       l.sb            2(r3), r13   // *(src+2) = c
+       l.addi          r17, r17, -1 // Decrease n
+       l.addi          r19, r3, 3   // src += 3
+
+       /* Word copy loop */
+2:     l.sw            0(r19), r13  // *src = cccc
+       l.addi          r17, r17, -4 // Decrease n
+       l.sfgeui        r17, 4
+       l.bf            2b
+        l.addi         r19, r19, 4  // Increase src
+
+       /* When n > 0, copy the remaining bytes, otherwise jump to exit */
+       l.sfeqi         r17, 0
+       l.bf            4f
+
+       /* Byte copy loop */
+3:     l.addi          r17, r17, -1 // Decrease n
+       l.sb            0(r19), r13  // *src = c (low byte of r13)
+       l.sfnei         r17, 0
+       l.bf            3b
+        l.addi         r19, r19, 1  // Increase src
+
+4:     l.jr            r9
+        l.ori          r11, r3, 0
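
A C-level sketch of the byte-to-word expansion performed by the l.slli/l.or pairs above (the same 0xcccccccc pattern that the word-store loop at label 2 writes):

	uint32_t c = 0xcc;	/* example fill byte, already masked to 8 bits */
	uint32_t w = c;		/* 0x000000cc */
	w |= w << 16;		/* 0x00cc00cc */
	w |= w << 8;		/* 0xcccccccc */
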
index 8705a46218f9273263a0678db7647b1925edf4b7..2175e4bfd9fc0a28e80df5dca135493ec3728720 100644 (file)
@@ -80,6 +80,7 @@ __ioremap(phys_addr_t addr, unsigned long size, pgprot_t prot)
 
        return (void __iomem *)(offset + (char *)v);
 }
+EXPORT_SYMBOL(__ioremap);
 
 void iounmap(void *addr)
 {
@@ -106,6 +107,7 @@ void iounmap(void *addr)
 
        return vfree((void *)(PAGE_MASK & (unsigned long)addr));
 }
+EXPORT_SYMBOL(iounmap);
 
 /**
  * OK, this one's a bit tricky... ioremap can get called before memory is
index c1263fc390db6d2f1672c9380d5a63d9b881d066..f294dd42fc7d3833ccc5b92fa6077e3b7d869d26 100644 (file)
@@ -17,7 +17,8 @@
 
 #define HPAGE_SHIFT            23
 #define REAL_HPAGE_SHIFT       22
-
+#define HPAGE_256MB_SHIFT      28
+#define HPAGE_64K_SHIFT                16
 #define REAL_HPAGE_SIZE                (_AC(1,UL) << REAL_HPAGE_SHIFT)
 
 #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
@@ -26,6 +27,7 @@
 #define HUGETLB_PAGE_ORDER     (HPAGE_SHIFT - PAGE_SHIFT)
 #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
 #define REAL_HPAGE_PER_HPAGE   (_AC(1,UL) << (HPAGE_SHIFT - REAL_HPAGE_SHIFT))
+#define HUGE_MAX_HSTATE                3
 #endif
 
 #ifndef __ASSEMBLY__
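
The new shifts line up with the three huge page sizes this series supports, which is why HUGE_MAX_HSTATE is 3; worked out from the defines above:

/*
 *   1UL << HPAGE_64K_SHIFT   = 1 << 16 =  64 KB
 *   1UL << HPAGE_SHIFT       = 1 << 23 =   8 MB  (backed by two 4 MB REAL_HPAGE TTEs)
 *   1UL << HPAGE_256MB_SHIFT = 1 << 28 = 256 MB
 * -> HUGE_MAX_HSTATE == 3
 */
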
index 314b66851348200f03970e7df655127ea87decad..7932a4a378176cfc697ba56758ad605f67bc16fb 100644 (file)
@@ -375,7 +375,10 @@ static inline pgprot_t pgprot_noncached(pgprot_t prot)
 #define pgprot_noncached pgprot_noncached
 
 #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
-static inline unsigned long __pte_huge_mask(void)
+extern pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
+                               struct page *page, int writable);
+#define arch_make_huge_pte arch_make_huge_pte
+static inline unsigned long __pte_default_huge_mask(void)
 {
        unsigned long mask;
 
@@ -395,12 +398,14 @@ static inline unsigned long __pte_huge_mask(void)
 
 static inline pte_t pte_mkhuge(pte_t pte)
 {
-       return __pte(pte_val(pte) | _PAGE_PMD_HUGE | __pte_huge_mask());
+       return __pte(pte_val(pte) | __pte_default_huge_mask());
 }
 
-static inline bool is_hugetlb_pte(pte_t pte)
+static inline bool is_default_hugetlb_pte(pte_t pte)
 {
-       return !!(pte_val(pte) & __pte_huge_mask());
+       unsigned long mask = __pte_default_huge_mask();
+
+       return (pte_val(pte) & mask) == mask;
 }
 
 static inline bool is_hugetlb_pmd(pmd_t pmd)
@@ -875,10 +880,12 @@ static inline unsigned long pud_pfn(pud_t pud)
 
 /* Actual page table PTE updates.  */
 void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr,
-                  pte_t *ptep, pte_t orig, int fullmm);
+                  pte_t *ptep, pte_t orig, int fullmm,
+                  unsigned int hugepage_shift);
 
 static void maybe_tlb_batch_add(struct mm_struct *mm, unsigned long vaddr,
-                               pte_t *ptep, pte_t orig, int fullmm)
+                               pte_t *ptep, pte_t orig, int fullmm,
+                               unsigned int hugepage_shift)
 {
        /* It is more efficient to let flush_tlb_kernel_range()
         * handle init_mm tlb flushes.
@@ -887,7 +894,7 @@ static void maybe_tlb_batch_add(struct mm_struct *mm, unsigned long vaddr,
         *             and SUN4V pte layout, so this inline test is fine.
         */
        if (likely(mm != &init_mm) && pte_accessible(mm, orig))
-               tlb_batch_add(mm, vaddr, ptep, orig, fullmm);
+               tlb_batch_add(mm, vaddr, ptep, orig, fullmm, hugepage_shift);
 }
 
 #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
@@ -906,7 +913,7 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
        pte_t orig = *ptep;
 
        *ptep = pte;
-       maybe_tlb_batch_add(mm, addr, ptep, orig, fullmm);
+       maybe_tlb_batch_add(mm, addr, ptep, orig, fullmm, PAGE_SHIFT);
 }
 
 #define set_pte_at(mm,addr,ptep,pte)   \
index 29d64b1758ed2a0ceb795a07c10205e6f923e6c6..478bf6bb4598b345dd7f590beee86f43893e7645 100644 (file)
@@ -59,8 +59,11 @@ extern atomic_t dcpage_flushes;
 extern atomic_t dcpage_flushes_xcall;
 
 extern int sysctl_tsb_ratio;
-#endif
 
+#ifdef CONFIG_SERIAL_SUNHV
+void sunhv_migrate_hvcons_irq(int cpu);
+#endif
+#endif
 void sun_do_break(void);
 extern int stop_a_enabled;
 extern int scons_pwroff;
index a8e192e907003dd855f9bb232dede7ae5eb069f3..54be88a6774c5cc64fd10aa381999a21bb071ec4 100644 (file)
@@ -8,7 +8,7 @@
 #define TLB_BATCH_NR   192
 
 struct tlb_batch {
-       bool huge;
+       unsigned int hugepage_shift;
        struct mm_struct *mm;
        unsigned long tlb_nr;
        unsigned long active;
@@ -17,7 +17,8 @@ struct tlb_batch {
 
 void flush_tsb_kernel_range(unsigned long start, unsigned long end);
 void flush_tsb_user(struct tlb_batch *tb);
-void flush_tsb_user_page(struct mm_struct *mm, unsigned long vaddr, bool huge);
+void flush_tsb_user_page(struct mm_struct *mm, unsigned long vaddr,
+                        unsigned int hugepage_shift);
 
 /* TLB flush operations. */
 
index 225543000122777847f686d418c62e6e6b92d1a5..ad5293f89680f4c3dd4405e413e2a2b4966bcd1b 100644 (file)
@@ -4,7 +4,6 @@
 #ifdef CONFIG_NUMA
 
 #include <asm/mmzone.h>
-#include <asm/cpudata.h>
 
 static inline int cpu_to_node(int cpu)
 {
@@ -42,6 +41,9 @@ int __node_distance(int, int);
 #endif /* !(CONFIG_NUMA) */
 
 #ifdef CONFIG_SMP
+
+#include <asm/cpudata.h>
+
 #define topology_physical_package_id(cpu)      (cpu_data(cpu).proc_id)
 #define topology_core_id(cpu)                  (cpu_data(cpu).core_id)
 #define topology_core_cpumask(cpu)             (&cpu_core_sib_map[cpu])
index f87aae5a908e668a9d458a3c45724448b26bb72c..36196c17aff8ed886a4c1f6e11248ea06f8341b7 100644 (file)
@@ -42,8 +42,8 @@ struct arch_uprobe {
 };
 
 struct arch_uprobe_task {
-       u32 saved_tpc;
-       u32 saved_tnpc;
+       u64 saved_tpc;
+       u64 saved_tnpc;
 };
 
 struct task_struct;
index 0ce347f8e4ccf30ae94bb52ec0d4dca4642fa5c6..90a02cb64e20237dc0facbee3d28ecb2599d71fc 100644 (file)
@@ -1443,6 +1443,7 @@ void __irq_entry smp_receive_signal_client(int irq, struct pt_regs *regs)
 
 static void stop_this_cpu(void *dummy)
 {
+       set_cpu_online(smp_processor_id(), false);
        prom_stopself();
 }
 
@@ -1451,9 +1452,15 @@ void smp_send_stop(void)
        int cpu;
 
        if (tlb_type == hypervisor) {
+               int this_cpu = smp_processor_id();
+#ifdef CONFIG_SERIAL_SUNHV
+               sunhv_migrate_hvcons_irq(this_cpu);
+#endif
                for_each_online_cpu(cpu) {
-                       if (cpu == smp_processor_id())
+                       if (cpu == this_cpu)
                                continue;
+
+                       set_cpu_online(cpu, false);
 #ifdef CONFIG_SUN_LDOMS
                        if (ldom_domaining_enabled) {
                                unsigned long hv_err;
index d568c8207af72ffbd15aae8e5f41f77401ba5397..10689cfd0ad40e6b12ae6b148f99ac2f5c7deb64 100644 (file)
@@ -117,26 +117,11 @@ tsb_miss_page_table_walk_sun4v_fastpath:
        /* Valid PTE is now in %g5.  */
 
 #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
-661:   sethi           %uhi(_PAGE_SZALL_4U), %g7
+       sethi           %uhi(_PAGE_PMD_HUGE), %g7
        sllx            %g7, 32, %g7
-       .section        .sun4v_2insn_patch, "ax"
-       .word           661b
-       mov             _PAGE_SZALL_4V, %g7
-       nop
-       .previous
-
-       and             %g5, %g7, %g2
-
-661:   sethi           %uhi(_PAGE_SZHUGE_4U), %g7
-       sllx            %g7, 32, %g7
-       .section        .sun4v_2insn_patch, "ax"
-       .word           661b
-       mov             _PAGE_SZHUGE_4V, %g7
-       nop
-       .previous
 
-       cmp             %g2, %g7
-       bne,pt          %xcc, 60f
+       andcc           %g5, %g7, %g0
+       be,pt           %xcc, 60f
         nop
 
        /* It is a huge page, use huge page TSB entry address we
index 988acc8b1b80a387d9119782f53f1d41dbe53c4e..e98a3f2e8f0f4839c30a61c8bf583d0753a6219c 100644 (file)
@@ -28,6 +28,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *filp,
                                                        unsigned long pgoff,
                                                        unsigned long flags)
 {
+       struct hstate *h = hstate_file(filp);
        unsigned long task_size = TASK_SIZE;
        struct vm_unmapped_area_info info;
 
@@ -38,7 +39,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *filp,
        info.length = len;
        info.low_limit = TASK_UNMAPPED_BASE;
        info.high_limit = min(task_size, VA_EXCLUDE_START);
-       info.align_mask = PAGE_MASK & ~HPAGE_MASK;
+       info.align_mask = PAGE_MASK & ~huge_page_mask(h);
        info.align_offset = 0;
        addr = vm_unmapped_area(&info);
 
@@ -58,6 +59,7 @@ hugetlb_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
                                  const unsigned long pgoff,
                                  const unsigned long flags)
 {
+       struct hstate *h = hstate_file(filp);
        struct mm_struct *mm = current->mm;
        unsigned long addr = addr0;
        struct vm_unmapped_area_info info;
@@ -69,7 +71,7 @@ hugetlb_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
        info.length = len;
        info.low_limit = PAGE_SIZE;
        info.high_limit = mm->mmap_base;
-       info.align_mask = PAGE_MASK & ~HPAGE_MASK;
+       info.align_mask = PAGE_MASK & ~huge_page_mask(h);
        info.align_offset = 0;
        addr = vm_unmapped_area(&info);
 
@@ -94,6 +96,7 @@ unsigned long
 hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
                unsigned long len, unsigned long pgoff, unsigned long flags)
 {
+       struct hstate *h = hstate_file(file);
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long task_size = TASK_SIZE;
@@ -101,7 +104,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
        if (test_thread_flag(TIF_32BIT))
                task_size = STACK_TOP32;
 
-       if (len & ~HPAGE_MASK)
+       if (len & ~huge_page_mask(h))
                return -EINVAL;
        if (len > task_size)
                return -ENOMEM;
@@ -113,7 +116,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
        }
 
        if (addr) {
-               addr = ALIGN(addr, HPAGE_SIZE);
+               addr = ALIGN(addr, huge_page_size(h));
                vma = find_vma(mm, addr);
                if (task_size - len >= addr &&
                    (!vma || addr + len <= vma->vm_start))
@@ -127,17 +130,141 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
                                pgoff, flags);
 }
 
+static pte_t sun4u_hugepage_shift_to_tte(pte_t entry, unsigned int shift)
+{
+       return entry;
+}
+
+static pte_t sun4v_hugepage_shift_to_tte(pte_t entry, unsigned int shift)
+{
+       unsigned long hugepage_size = _PAGE_SZ4MB_4V;
+
+       pte_val(entry) = pte_val(entry) & ~_PAGE_SZALL_4V;
+
+       switch (shift) {
+       case HPAGE_256MB_SHIFT:
+               hugepage_size = _PAGE_SZ256MB_4V;
+               pte_val(entry) |= _PAGE_PMD_HUGE;
+               break;
+       case HPAGE_SHIFT:
+               pte_val(entry) |= _PAGE_PMD_HUGE;
+               break;
+       case HPAGE_64K_SHIFT:
+               hugepage_size = _PAGE_SZ64K_4V;
+               break;
+       default:
+               WARN_ONCE(1, "unsupported hugepage shift=%u\n", shift);
+       }
+
+       pte_val(entry) = pte_val(entry) | hugepage_size;
+       return entry;
+}
+
+static pte_t hugepage_shift_to_tte(pte_t entry, unsigned int shift)
+{
+       if (tlb_type == hypervisor)
+               return sun4v_hugepage_shift_to_tte(entry, shift);
+       else
+               return sun4u_hugepage_shift_to_tte(entry, shift);
+}
+
+pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
+                        struct page *page, int writeable)
+{
+       unsigned int shift = huge_page_shift(hstate_vma(vma));
+
+       return hugepage_shift_to_tte(entry, shift);
+}
+
+static unsigned int sun4v_huge_tte_to_shift(pte_t entry)
+{
+       unsigned long tte_szbits = pte_val(entry) & _PAGE_SZALL_4V;
+       unsigned int shift;
+
+       switch (tte_szbits) {
+       case _PAGE_SZ256MB_4V:
+               shift = HPAGE_256MB_SHIFT;
+               break;
+       case _PAGE_SZ4MB_4V:
+               shift = REAL_HPAGE_SHIFT;
+               break;
+       case _PAGE_SZ64K_4V:
+               shift = HPAGE_64K_SHIFT;
+               break;
+       default:
+               shift = PAGE_SHIFT;
+               break;
+       }
+       return shift;
+}
+
+static unsigned int sun4u_huge_tte_to_shift(pte_t entry)
+{
+       unsigned long tte_szbits = pte_val(entry) & _PAGE_SZALL_4U;
+       unsigned int shift;
+
+       switch (tte_szbits) {
+       case _PAGE_SZ256MB_4U:
+               shift = HPAGE_256MB_SHIFT;
+               break;
+       case _PAGE_SZ4MB_4U:
+               shift = REAL_HPAGE_SHIFT;
+               break;
+       case _PAGE_SZ64K_4U:
+               shift = HPAGE_64K_SHIFT;
+               break;
+       default:
+               shift = PAGE_SHIFT;
+               break;
+       }
+       return shift;
+}
+
+static unsigned int huge_tte_to_shift(pte_t entry)
+{
+       unsigned long shift;
+
+       if (tlb_type == hypervisor)
+               shift = sun4v_huge_tte_to_shift(entry);
+       else
+               shift = sun4u_huge_tte_to_shift(entry);
+
+       if (shift == PAGE_SHIFT)
+       WARN_ONCE(1, "huge_tte_to_shift: invalid hugepage tte=0x%lx\n",
+                         pte_val(entry));
+
+       return shift;
+}
+
+static unsigned long huge_tte_to_size(pte_t pte)
+{
+       unsigned long size = 1UL << huge_tte_to_shift(pte);
+
+       if (size == REAL_HPAGE_SIZE)
+               size = HPAGE_SIZE;
+       return size;
+}
+
 pte_t *huge_pte_alloc(struct mm_struct *mm,
                        unsigned long addr, unsigned long sz)
 {
        pgd_t *pgd;
        pud_t *pud;
+       pmd_t *pmd;
        pte_t *pte = NULL;
 
        pgd = pgd_offset(mm, addr);
        pud = pud_alloc(mm, pgd, addr);
-       if (pud)
-               pte = (pte_t *)pmd_alloc(mm, pud, addr);
+       if (pud) {
+               pmd = pmd_alloc(mm, pud, addr);
+               if (!pmd)
+                       return NULL;
+
+               if (sz >= PMD_SIZE)
+                       pte = (pte_t *)pmd;
+               else
+                       pte = pte_alloc_map(mm, pmd, addr);
+       }
 
        return pte;
 }
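
huge_tte_to_shift() maps the TTE size-field bits back to a page-size shift, and huge_tte_to_size() then collapses the 4MB real-page shift to HPAGE_SIZE, because one 8MB software hugepage is backed by two 4MB hardware TTEs. A stand-alone sketch of that collapse, using locally defined stand-ins for the sparc64 shift values (assumed here: 64K=16, REAL_HPAGE=22, HPAGE=23, 256MB=28):

#include <stdio.h>

/* Illustrative stand-ins for the sparc64 constants, not taken from headers. */
#define XHPAGE_64K_SHIFT   16U
#define XREAL_HPAGE_SHIFT  22U
#define XHPAGE_SHIFT       23U
#define XHPAGE_256MB_SHIFT 28U

static unsigned long tte_shift_to_size(unsigned int shift)
{
	unsigned long size = 1UL << shift;

	/* One 4MB real TTE backs half of an 8MB HPAGE_SIZE page. */
	if (shift == XREAL_HPAGE_SHIFT)
		size = 1UL << XHPAGE_SHIFT;
	return size;
}

int main(void)
{
	unsigned int shifts[] = { XHPAGE_64K_SHIFT, XREAL_HPAGE_SHIFT,
				  XHPAGE_256MB_SHIFT };

	for (int i = 0; i < 3; i++)
		printf("tte shift %u -> size %lu KB\n", shifts[i],
		       tte_shift_to_size(shifts[i]) >> 10);
	return 0;
}
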
@@ -146,49 +273,83 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
        pgd_t *pgd;
        pud_t *pud;
+       pmd_t *pmd;
        pte_t *pte = NULL;
 
        pgd = pgd_offset(mm, addr);
        if (!pgd_none(*pgd)) {
                pud = pud_offset(pgd, addr);
-               if (!pud_none(*pud))
-                       pte = (pte_t *)pmd_offset(pud, addr);
+               if (!pud_none(*pud)) {
+                       pmd = pmd_offset(pud, addr);
+                       if (!pmd_none(*pmd)) {
+                               if (is_hugetlb_pmd(*pmd))
+                                       pte = (pte_t *)pmd;
+                               else
+                                       pte = pte_offset_map(pmd, addr);
+                       }
+               }
        }
+
        return pte;
 }
 
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
                     pte_t *ptep, pte_t entry)
 {
+       unsigned int i, nptes, orig_shift, shift;
+       unsigned long size;
        pte_t orig;
 
+       size = huge_tte_to_size(entry);
+       shift = size >= HPAGE_SIZE ? PMD_SHIFT : PAGE_SHIFT;
+       nptes = size >> shift;
+
        if (!pte_present(*ptep) && pte_present(entry))
-               mm->context.hugetlb_pte_count++;
+               mm->context.hugetlb_pte_count += nptes;
 
-       addr &= HPAGE_MASK;
+       addr &= ~(size - 1);
        orig = *ptep;
-       *ptep = entry;
+       orig_shift = pte_none(orig) ? PAGE_SHIFT : huge_tte_to_shift(orig);
+
+       for (i = 0; i < nptes; i++)
+               ptep[i] = __pte(pte_val(entry) + (i << shift));
 
-       /* Issue TLB flush at REAL_HPAGE_SIZE boundaries */
-       maybe_tlb_batch_add(mm, addr, ptep, orig, 0);
-       maybe_tlb_batch_add(mm, addr + REAL_HPAGE_SIZE, ptep, orig, 0);
+       maybe_tlb_batch_add(mm, addr, ptep, orig, 0, orig_shift);
+       /* An HPAGE_SIZE'ed page is composed of two REAL_HPAGE_SIZE'ed pages */
+       if (size == HPAGE_SIZE)
+               maybe_tlb_batch_add(mm, addr + REAL_HPAGE_SIZE, ptep, orig, 0,
+                                   orig_shift);
 }
 
 pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
                              pte_t *ptep)
 {
+       unsigned int i, nptes, hugepage_shift;
+       unsigned long size;
        pte_t entry;
 
        entry = *ptep;
+       size = huge_tte_to_size(entry);
+       if (size >= HPAGE_SIZE)
+               nptes = size >> PMD_SHIFT;
+       else
+               nptes = size >> PAGE_SHIFT;
+
+       hugepage_shift = pte_none(entry) ? PAGE_SHIFT :
+               huge_tte_to_shift(entry);
+
        if (pte_present(entry))
-               mm->context.hugetlb_pte_count--;
+               mm->context.hugetlb_pte_count -= nptes;
 
-       addr &= HPAGE_MASK;
-       *ptep = __pte(0UL);
+       addr &= ~(size - 1);
+       for (i = 0; i < nptes; i++)
+               ptep[i] = __pte(0UL);
 
-       /* Issue TLB flush at REAL_HPAGE_SIZE boundaries */
-       maybe_tlb_batch_add(mm, addr, ptep, entry, 0);
-       maybe_tlb_batch_add(mm, addr + REAL_HPAGE_SIZE, ptep, entry, 0);
+       maybe_tlb_batch_add(mm, addr, ptep, entry, 0, hugepage_shift);
+       /* An HPAGE_SIZE'ed page is composed of two REAL_HPAGE_SIZE'ed pages */
+       if (size == HPAGE_SIZE)
+               maybe_tlb_batch_add(mm, addr + REAL_HPAGE_SIZE, ptep, entry, 0,
+                                   hugepage_shift);
 
        return entry;
 }
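
With multiple hugepage sizes in play, set_huge_pte_at() and huge_ptep_get_and_clear() write and clear nptes consecutive entries: sizes below HPAGE_SIZE are backed by PAGE_SHIFT-sized PTEs, while HPAGE_SIZE and larger use PMD_SHIFT-sized entries. A minimal sketch of the nptes arithmetic, assuming the usual sparc64 values PAGE_SHIFT=13, PMD_SHIFT=23 and HPAGE_SIZE=8MB:

#include <stdio.h>

#define XPAGE_SHIFT 13U
#define XPMD_SHIFT  23U
#define XHPAGE_SIZE (1UL << XPMD_SHIFT)	/* assumed 8MB */

static unsigned long nptes_for(unsigned long size)
{
	unsigned int shift = size >= XHPAGE_SIZE ? XPMD_SHIFT : XPAGE_SHIFT;

	return size >> shift;
}

int main(void)
{
	unsigned long sizes[] = { 1UL << 16, 1UL << 23, 1UL << 28 };

	/* Expected: 8 PTE-level entries, 1 PMD-level entry, 32 PMD-level entries. */
	for (int i = 0; i < 3; i++)
		printf("%lu KB hugepage -> %lu entries\n", sizes[i] >> 10,
		       nptes_for(sizes[i]));
	return 0;
}

The second maybe_tlb_batch_add() call remains specific to the 8MB case, the only size stitched together from two REAL_HPAGE_SIZE TTEs.
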
index 5d2f91511c60ce6099bc4e2ce3d383004febaffc..ccd4553289899ee2e585e409189206f67c7e77f2 100644 (file)
@@ -324,6 +324,50 @@ static void __update_mmu_tsb_insert(struct mm_struct *mm, unsigned long tsb_inde
        tsb_insert(tsb, tag, tte);
 }
 
+#ifdef CONFIG_HUGETLB_PAGE
+static int __init setup_hugepagesz(char *string)
+{
+       unsigned long long hugepage_size;
+       unsigned int hugepage_shift;
+       unsigned short hv_pgsz_idx;
+       unsigned int hv_pgsz_mask;
+       int rc = 0;
+
+       hugepage_size = memparse(string, &string);
+       hugepage_shift = ilog2(hugepage_size);
+
+       switch (hugepage_shift) {
+       case HPAGE_256MB_SHIFT:
+               hv_pgsz_mask = HV_PGSZ_MASK_256MB;
+               hv_pgsz_idx = HV_PGSZ_IDX_256MB;
+               break;
+       case HPAGE_SHIFT:
+               hv_pgsz_mask = HV_PGSZ_MASK_4MB;
+               hv_pgsz_idx = HV_PGSZ_IDX_4MB;
+               break;
+       case HPAGE_64K_SHIFT:
+               hv_pgsz_mask = HV_PGSZ_MASK_64K;
+               hv_pgsz_idx = HV_PGSZ_IDX_64K;
+               break;
+       default:
+               hv_pgsz_mask = 0;
+       }
+
+       if ((hv_pgsz_mask & cpu_pgsz_mask) == 0U) {
+               pr_warn("hugepagesz=%llu not supported by MMU.\n",
+                       hugepage_size);
+               goto out;
+       }
+
+       hugetlb_add_hstate(hugepage_shift - PAGE_SHIFT);
+       rc = 1;
+
+out:
+       return rc;
+}
+__setup("hugepagesz=", setup_hugepagesz);
+#endif /* CONFIG_HUGETLB_PAGE */
+
 void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep)
 {
        struct mm_struct *mm;
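
setup_hugepagesz() reduces a "hugepagesz=" boot argument to a page-size shift, refuses shifts that cpu_pgsz_mask says the MMU cannot handle, and registers an hstate for the rest. A rough user-space sketch of the size-to-shift reduction; the suffix parser below is only a stand-in for the kernel's memparse(), and the whole thing is illustrative:

#include <stdio.h>
#include <stdlib.h>

static unsigned long long parse_size(const char *s)
{
	char *end;
	unsigned long long v = strtoull(s, &end, 0);

	switch (*end) {
	case 'K': case 'k': v <<= 10; break;
	case 'M': case 'm': v <<= 20; break;
	case 'G': case 'g': v <<= 30; break;
	}
	return v;
}

int main(void)
{
	const char *args[] = { "64K", "8M", "256M" };

	for (int i = 0; i < 3; i++) {
		unsigned long long sz = parse_size(args[i]);
		unsigned int shift = 63 - __builtin_clzll(sz);	/* ilog2() */

		printf("hugepagesz=%s -> shift %u\n", args[i], shift);
	}
	return 0;
}

Under the shift values assumed above, 64K, 8M and 256M land on HPAGE_64K_SHIFT, HPAGE_SHIFT and HPAGE_256MB_SHIFT; anything else falls into the default case and is rejected.
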
@@ -347,7 +391,7 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *
 
 #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
        if ((mm->context.hugetlb_pte_count || mm->context.thp_pte_count) &&
-           is_hugetlb_pte(pte)) {
+           is_hugetlb_pmd(__pmd(pte_val(pte)))) {
                /* We are fabricating 8MB pages using 4MB real hw pages.  */
                pte_val(pte) |= (address & (1UL << REAL_HPAGE_SHIFT));
                __update_mmu_tsb_insert(mm, MM_TSB_HUGE, REAL_HPAGE_SHIFT,
@@ -785,13 +829,23 @@ static void __init find_ramdisk(unsigned long phys_base)
 
 struct node_mem_mask {
        unsigned long mask;
-       unsigned long val;
+       unsigned long match;
 };
 static struct node_mem_mask node_masks[MAX_NUMNODES];
 static int num_node_masks;
 
 #ifdef CONFIG_NEED_MULTIPLE_NODES
 
+struct mdesc_mlgroup {
+       u64     node;
+       u64     latency;
+       u64     match;
+       u64     mask;
+};
+
+static struct mdesc_mlgroup *mlgroups;
+static int num_mlgroups;
+
 int numa_cpu_lookup_table[NR_CPUS];
 cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
 
@@ -802,78 +856,129 @@ struct mdesc_mblock {
 };
 static struct mdesc_mblock *mblocks;
 static int num_mblocks;
-static int find_numa_node_for_addr(unsigned long pa,
-                                  struct node_mem_mask *pnode_mask);
 
-static unsigned long __init ra_to_pa(unsigned long addr)
+static struct mdesc_mblock * __init addr_to_mblock(unsigned long addr)
 {
+       struct mdesc_mblock *m = NULL;
        int i;
 
        for (i = 0; i < num_mblocks; i++) {
-               struct mdesc_mblock *m = &mblocks[i];
+               m = &mblocks[i];
 
                if (addr >= m->base &&
                    addr < (m->base + m->size)) {
-                       addr += m->offset;
                        break;
                }
        }
-       return addr;
+
+       return m;
 }
 
-static int __init find_node(unsigned long addr)
+static u64 __init memblock_nid_range_sun4u(u64 start, u64 end, int *nid)
 {
-       static bool search_mdesc = true;
-       static struct node_mem_mask last_mem_mask = { ~0UL, ~0UL };
-       static int last_index;
-       int i;
+       int prev_nid, new_nid;
 
-       addr = ra_to_pa(addr);
-       for (i = 0; i < num_node_masks; i++) {
-               struct node_mem_mask *p = &node_masks[i];
+       prev_nid = -1;
+       for ( ; start < end; start += PAGE_SIZE) {
+               for (new_nid = 0; new_nid < num_node_masks; new_nid++) {
+                       struct node_mem_mask *p = &node_masks[new_nid];
 
-               if ((addr & p->mask) == p->val)
-                       return i;
-       }
-       /* The following condition has been observed on LDOM guests because
-        * node_masks only contains the best latency mask and value.
-        * LDOM guest's mdesc can contain a single latency group to
-        * cover multiple address range. Print warning message only if the
-        * address cannot be found in node_masks nor mdesc.
-        */
-       if ((search_mdesc) &&
-           ((addr & last_mem_mask.mask) != last_mem_mask.val)) {
-               /* find the available node in the mdesc */
-               last_index = find_numa_node_for_addr(addr, &last_mem_mask);
-               numadbg("find_node: latency group for address 0x%lx is %d\n",
-                       addr, last_index);
-               if ((last_index < 0) || (last_index >= num_node_masks)) {
-                       /* WARN_ONCE() and use default group 0 */
-                       WARN_ONCE(1, "find_node: A physical address doesn't match a NUMA node rule. Some physical memory will be owned by node 0.");
-                       search_mdesc = false;
-                       last_index = 0;
+                       if ((start & p->mask) == p->match) {
+                               if (prev_nid == -1)
+                                       prev_nid = new_nid;
+                               break;
+                       }
                }
+
+               if (new_nid == num_node_masks) {
+                       prev_nid = 0;
+                       WARN_ONCE(1, "addr[%Lx] doesn't match a NUMA node rule. Some memory will be owned by node 0.",
+                                 start);
+                       break;
+               }
+
+               if (prev_nid != new_nid)
+                       break;
        }
+       *nid = prev_nid;
 
-       return last_index;
+       return start > end ? end : start;
 }
 
 static u64 __init memblock_nid_range(u64 start, u64 end, int *nid)
 {
-       *nid = find_node(start);
-       start += PAGE_SIZE;
-       while (start < end) {
-               int n = find_node(start);
+       u64 ret_end, pa_start, m_mask, m_match, m_end;
+       struct mdesc_mblock *mblock;
+       int _nid, i;
+
+       if (tlb_type != hypervisor)
+               return memblock_nid_range_sun4u(start, end, nid);
+
+       mblock = addr_to_mblock(start);
+       if (!mblock) {
+               WARN_ONCE(1, "memblock_nid_range: Can't find mblock addr[%Lx]",
+                         start);
+
+               _nid = 0;
+               ret_end = end;
+               goto done;
+       }
+
+       pa_start = start + mblock->offset;
+       m_match = 0;
+       m_mask = 0;
+
+       for (_nid = 0; _nid < num_node_masks; _nid++) {
+               struct node_mem_mask *const m = &node_masks[_nid];
 
-               if (n != *nid)
+               if ((pa_start & m->mask) == m->match) {
+                       m_match = m->match;
+                       m_mask = m->mask;
                        break;
-               start += PAGE_SIZE;
+               }
        }
 
-       if (start > end)
-               start = end;
+       if (num_node_masks == _nid) {
+               /* We could not find a NUMA group, so default to 0, but
+                * still search for a latency group so we can calculate
+                * the correct end address to return.
+                */
+               _nid = 0;
+
+               for (i = 0; i < num_mlgroups; i++) {
+                       struct mdesc_mlgroup *const m = &mlgroups[i];
+
+                       if ((pa_start & m->mask) == m->match) {
+                               m_match = m->match;
+                               m_mask = m->mask;
+                               break;
+                       }
+               }
+
+               if (i == num_mlgroups) {
+                       WARN_ONCE(1, "memblock_nid_range: Can't find latency group addr[%Lx]",
+                                 start);
+
+                       ret_end = end;
+                       goto done;
+               }
+       }
 
-       return start;
+       /*
+        * Each latency group has a match and a mask, and each memory block
+        * has an offset.  An address belongs to a latency group if it
+        * satisfies:  ((addr + offset) & mask) == match
+        * Checking every single page against a particular latency group is
+        * slow, however, so as an optimization we calculate the end value
+        * using bit arithmetic.
+        */
+       m_end = m_match + (1ul << __ffs(m_mask)) - mblock->offset;
+       m_end += pa_start & ~((1ul << fls64(m_mask)) - 1);
+       ret_end = m_end > end ? end : m_end;
+
+done:
+       *nid = _nid;
+       return ret_end;
 }
 #endif
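
The rewritten memblock_nid_range() resolves a physical range to a node with the latency-group rule from the comment above: an address belongs to the group for which ((addr + offset) & mask) == match. A small worked example with made-up offset/mask/match values, where two address bits select the node:

#include <stdio.h>

struct group {
	unsigned long long match;
	unsigned long long mask;
};

int main(void)
{
	const unsigned long long offset = 0x0;	/* stands in for mblock->offset */
	const struct group groups[] = {		/* one entry per node, invented values */
		{ 0x000000000ULL, 0x300000000ULL },
		{ 0x100000000ULL, 0x300000000ULL },
		{ 0x200000000ULL, 0x300000000ULL },
	};
	unsigned long long addr = 0x140000000ULL;

	for (int nid = 0; nid < 3; nid++)
		if (((addr + offset) & groups[nid].mask) == groups[nid].match)
			printf("addr 0x%llx -> node %d\n", addr, nid);
	return 0;
}

The end-address computation that follows the comment exists so the caller need not re-test this rule one PAGE_SIZE step at a time; from the same match/mask/offset triple it derives the last address that can still satisfy the rule and clamps the result to the requested end.
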
 
@@ -914,7 +1019,8 @@ static void init_node_masks_nonnuma(void)
 
        numadbg("Initializing tables for non-numa.\n");
 
-       node_masks[0].mask = node_masks[0].val = 0;
+       node_masks[0].mask = 0;
+       node_masks[0].match = 0;
        num_node_masks = 1;
 
 #ifdef CONFIG_NEED_MULTIPLE_NODES
@@ -932,15 +1038,6 @@ EXPORT_SYMBOL(numa_cpu_lookup_table);
 EXPORT_SYMBOL(numa_cpumask_lookup_table);
 EXPORT_SYMBOL(node_data);
 
-struct mdesc_mlgroup {
-       u64     node;
-       u64     latency;
-       u64     match;
-       u64     mask;
-};
-static struct mdesc_mlgroup *mlgroups;
-static int num_mlgroups;
-
 static int scan_pio_for_cfg_handle(struct mdesc_handle *md, u64 pio,
                                   u32 cfg_handle)
 {
@@ -1029,6 +1126,10 @@ int of_node_to_nid(struct device_node *dp)
 static void __init add_node_ranges(void)
 {
        struct memblock_region *reg;
+       unsigned long prev_max;
+
+memblock_resized:
+       prev_max = memblock.memory.max;
 
        for_each_memblock(memory, reg) {
                unsigned long size = reg->size;
@@ -1048,6 +1149,8 @@ static void __init add_node_ranges(void)
 
                        memblock_set_node(start, this_end - start,
                                          &memblock.memory, nid);
+                       if (memblock.memory.max != prev_max)
+                               goto memblock_resized;
                        start = this_end;
                }
        }
@@ -1182,41 +1285,6 @@ int __node_distance(int from, int to)
        return numa_latency[from][to];
 }
 
-static int find_numa_node_for_addr(unsigned long pa,
-                                  struct node_mem_mask *pnode_mask)
-{
-       struct mdesc_handle *md = mdesc_grab();
-       u64 node, arc;
-       int i = 0;
-
-       node = mdesc_node_by_name(md, MDESC_NODE_NULL, "latency-groups");
-       if (node == MDESC_NODE_NULL)
-               goto out;
-
-       mdesc_for_each_node_by_name(md, node, "group") {
-               mdesc_for_each_arc(arc, md, node, MDESC_ARC_TYPE_FWD) {
-                       u64 target = mdesc_arc_target(md, arc);
-                       struct mdesc_mlgroup *m = find_mlgroup(target);
-
-                       if (!m)
-                               continue;
-                       if ((pa & m->mask) == m->match) {
-                               if (pnode_mask) {
-                                       pnode_mask->mask = m->mask;
-                                       pnode_mask->val = m->match;
-                               }
-                               mdesc_release(md);
-                               return i;
-                       }
-               }
-               i++;
-       }
-
-out:
-       mdesc_release(md);
-       return -1;
-}
-
 static int __init find_best_numa_node_for_mlgroup(struct mdesc_mlgroup *grp)
 {
        int i;
@@ -1224,7 +1292,7 @@ static int __init find_best_numa_node_for_mlgroup(struct mdesc_mlgroup *grp)
        for (i = 0; i < MAX_NUMNODES; i++) {
                struct node_mem_mask *n = &node_masks[i];
 
-               if ((grp->mask == n->mask) && (grp->match == n->val))
+               if ((grp->mask == n->mask) && (grp->match == n->match))
                        break;
        }
        return i;
@@ -1279,10 +1347,10 @@ static int __init numa_attach_mlgroup(struct mdesc_handle *md, u64 grp,
        n = &node_masks[num_node_masks++];
 
        n->mask = candidate->mask;
-       n->val = candidate->match;
+       n->match = candidate->match;
 
-       numadbg("NUMA NODE[%d]: mask[%lx] val[%lx] (latency[%llx])\n",
-               index, n->mask, n->val, candidate->latency);
+       numadbg("NUMA NODE[%d]: mask[%lx] match[%lx] (latency[%llx])\n",
+               index, n->mask, n->match, candidate->latency);
 
        return 0;
 }
@@ -1379,7 +1447,7 @@ static int __init numa_parse_jbus(void)
                numa_cpu_lookup_table[cpu] = index;
                cpumask_copy(&numa_cpumask_lookup_table[index], cpumask_of(cpu));
                node_masks[index].mask = ~((1UL << 36UL) - 1UL);
-               node_masks[index].val = cpu << 36UL;
+               node_masks[index].match = cpu << 36UL;
 
                index++;
        }
index c7f2a5295b3a54599b68e4932c1c68cf2bdab99f..def82f6d626f774772807427c6fe6e67fd343bf1 100644 (file)
@@ -1444,7 +1444,7 @@ static void poke_viking(void)
        srmmu_set_mmureg(mreg);
 }
 
-static struct sparc32_cachetlb_ops viking_ops = {
+static struct sparc32_cachetlb_ops viking_ops __ro_after_init = {
        .cache_all      = viking_flush_cache_all,
        .cache_mm       = viking_flush_cache_mm,
        .cache_page     = viking_flush_cache_page,
@@ -1475,7 +1475,7 @@ static struct sparc32_cachetlb_ops viking_ops = {
  * flushes going at once will require SMP locking anyways so there's
  * no real value in trying any harder than this.
  */
-static struct sparc32_cachetlb_ops viking_sun4d_smp_ops = {
+static struct sparc32_cachetlb_ops viking_sun4d_smp_ops __ro_after_init = {
        .cache_all      = viking_flush_cache_all,
        .cache_mm       = viking_flush_cache_mm,
        .cache_page     = viking_flush_cache_page,
@@ -1759,7 +1759,7 @@ static void smp_flush_sig_insns(struct mm_struct *mm, unsigned long insn_addr)
        local_ops->sig_insns(mm, insn_addr);
 }
 
-static struct sparc32_cachetlb_ops smp_cachetlb_ops = {
+static struct sparc32_cachetlb_ops smp_cachetlb_ops __ro_after_init = {
        .cache_all      = smp_flush_cache_all,
        .cache_mm       = smp_flush_cache_mm,
        .cache_page     = smp_flush_cache_page,
index c56a195c90719fc3eb1400ad14e6b3ae27bb8417..afda3bbf78542a0297849d65fe7470a5e73716f1 100644 (file)
@@ -67,7 +67,7 @@ void arch_leave_lazy_mmu_mode(void)
 }
 
 static void tlb_batch_add_one(struct mm_struct *mm, unsigned long vaddr,
-                             bool exec, bool huge)
+                             bool exec, unsigned int hugepage_shift)
 {
        struct tlb_batch *tb = &get_cpu_var(tlb_batch);
        unsigned long nr;
@@ -84,19 +84,19 @@ static void tlb_batch_add_one(struct mm_struct *mm, unsigned long vaddr,
        }
 
        if (!tb->active) {
-               flush_tsb_user_page(mm, vaddr, huge);
+               flush_tsb_user_page(mm, vaddr, hugepage_shift);
                global_flush_tlb_page(mm, vaddr);
                goto out;
        }
 
        if (nr == 0) {
                tb->mm = mm;
-               tb->huge = huge;
+               tb->hugepage_shift = hugepage_shift;
        }
 
-       if (tb->huge != huge) {
+       if (tb->hugepage_shift != hugepage_shift) {
                flush_tlb_pending();
-               tb->huge = huge;
+               tb->hugepage_shift = hugepage_shift;
                nr = 0;
        }
 
@@ -110,10 +110,9 @@ out:
 }
 
 void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr,
-                  pte_t *ptep, pte_t orig, int fullmm)
+                  pte_t *ptep, pte_t orig, int fullmm,
+                  unsigned int hugepage_shift)
 {
-       bool huge = is_hugetlb_pte(orig);
-
        if (tlb_type != hypervisor &&
            pte_dirty(orig)) {
                unsigned long paddr, pfn = pte_pfn(orig);
@@ -139,7 +138,7 @@ void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr,
 
 no_cache_flush:
        if (!fullmm)
-               tlb_batch_add_one(mm, vaddr, pte_exec(orig), huge);
+               tlb_batch_add_one(mm, vaddr, pte_exec(orig), hugepage_shift);
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
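
Carrying hugepage_shift instead of a boolean keeps each TLB batch homogeneous in page size: the first queued entry records the shift, and an entry with a different shift forces the pending batch to be flushed first. A toy model of that rule, purely illustrative:

#include <stdio.h>

#define BATCH_MAX 8

struct batch {
	unsigned int shift;
	unsigned int nr;
	unsigned long vaddrs[BATCH_MAX];
};

static void flush(struct batch *b)
{
	if (b->nr)
		printf("flush %u entries (shift %u)\n", b->nr, b->shift);
	b->nr = 0;
}

static void batch_add(struct batch *b, unsigned long vaddr, unsigned int shift)
{
	if (b->nr && b->shift != shift)
		flush(b);		/* mirrors flush_tlb_pending() */
	if (b->nr == 0)
		b->shift = shift;
	b->vaddrs[b->nr++] = vaddr;
	if (b->nr == BATCH_MAX)
		flush(b);
}

int main(void)
{
	struct batch b = { 0 };

	batch_add(&b, 0x1000, 13);
	batch_add(&b, 0x3000, 13);
	batch_add(&b, 0x800000, 23);	/* different shift: previous entries flushed */
	flush(&b);
	return 0;
}
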
index e20fbbafb0b04af0fa85b21188cd6c851c132e6e..23479c3d39f0221a98c86cf3923a040b3b3be658 100644 (file)
@@ -86,6 +86,33 @@ static void __flush_tsb_one(struct tlb_batch *tb, unsigned long hash_shift,
                __flush_tsb_one_entry(tsb, tb->vaddrs[i], hash_shift, nentries);
 }
 
+#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
+static void __flush_huge_tsb_one_entry(unsigned long tsb, unsigned long v,
+                                      unsigned long hash_shift,
+                                      unsigned long nentries,
+                                      unsigned int hugepage_shift)
+{
+       unsigned int hpage_entries;
+       unsigned int i;
+
+       hpage_entries = 1 << (hugepage_shift - hash_shift);
+       for (i = 0; i < hpage_entries; i++)
+               __flush_tsb_one_entry(tsb, v + (i << hash_shift), hash_shift,
+                                     nentries);
+}
+
+static void __flush_huge_tsb_one(struct tlb_batch *tb, unsigned long hash_shift,
+                                unsigned long tsb, unsigned long nentries,
+                                unsigned int hugepage_shift)
+{
+       unsigned long i;
+
+       for (i = 0; i < tb->tlb_nr; i++)
+               __flush_huge_tsb_one_entry(tsb, tb->vaddrs[i], hash_shift,
+                                          nentries, hugepage_shift);
+}
+#endif
+
 void flush_tsb_user(struct tlb_batch *tb)
 {
        struct mm_struct *mm = tb->mm;
@@ -93,45 +120,61 @@ void flush_tsb_user(struct tlb_batch *tb)
 
        spin_lock_irqsave(&mm->context.lock, flags);
 
-       if (!tb->huge) {
+       if (tb->hugepage_shift < HPAGE_SHIFT) {
                base = (unsigned long) mm->context.tsb_block[MM_TSB_BASE].tsb;
                nentries = mm->context.tsb_block[MM_TSB_BASE].tsb_nentries;
                if (tlb_type == cheetah_plus || tlb_type == hypervisor)
                        base = __pa(base);
-               __flush_tsb_one(tb, PAGE_SHIFT, base, nentries);
+               if (tb->hugepage_shift == PAGE_SHIFT)
+                       __flush_tsb_one(tb, PAGE_SHIFT, base, nentries);
+#if defined(CONFIG_HUGETLB_PAGE)
+               else
+                       __flush_huge_tsb_one(tb, PAGE_SHIFT, base, nentries,
+                                            tb->hugepage_shift);
+#endif
        }
 #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
-       if (tb->huge && mm->context.tsb_block[MM_TSB_HUGE].tsb) {
+       else if (mm->context.tsb_block[MM_TSB_HUGE].tsb) {
                base = (unsigned long) mm->context.tsb_block[MM_TSB_HUGE].tsb;
                nentries = mm->context.tsb_block[MM_TSB_HUGE].tsb_nentries;
                if (tlb_type == cheetah_plus || tlb_type == hypervisor)
                        base = __pa(base);
-               __flush_tsb_one(tb, REAL_HPAGE_SHIFT, base, nentries);
+               __flush_huge_tsb_one(tb, REAL_HPAGE_SHIFT, base, nentries,
+                                    tb->hugepage_shift);
        }
 #endif
        spin_unlock_irqrestore(&mm->context.lock, flags);
 }
 
-void flush_tsb_user_page(struct mm_struct *mm, unsigned long vaddr, bool huge)
+void flush_tsb_user_page(struct mm_struct *mm, unsigned long vaddr,
+                        unsigned int hugepage_shift)
 {
        unsigned long nentries, base, flags;
 
        spin_lock_irqsave(&mm->context.lock, flags);
 
-       if (!huge) {
+       if (hugepage_shift < HPAGE_SHIFT) {
                base = (unsigned long) mm->context.tsb_block[MM_TSB_BASE].tsb;
                nentries = mm->context.tsb_block[MM_TSB_BASE].tsb_nentries;
                if (tlb_type == cheetah_plus || tlb_type == hypervisor)
                        base = __pa(base);
-               __flush_tsb_one_entry(base, vaddr, PAGE_SHIFT, nentries);
+               if (hugepage_shift == PAGE_SHIFT)
+                       __flush_tsb_one_entry(base, vaddr, PAGE_SHIFT,
+                                             nentries);
+#if defined(CONFIG_HUGETLB_PAGE)
+               else
+                       __flush_huge_tsb_one_entry(base, vaddr, PAGE_SHIFT,
+                                                  nentries, hugepage_shift);
+#endif
        }
 #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
-       if (huge && mm->context.tsb_block[MM_TSB_HUGE].tsb) {
+       else if (mm->context.tsb_block[MM_TSB_HUGE].tsb) {
                base = (unsigned long) mm->context.tsb_block[MM_TSB_HUGE].tsb;
                nentries = mm->context.tsb_block[MM_TSB_HUGE].tsb_nentries;
                if (tlb_type == cheetah_plus || tlb_type == hypervisor)
                        base = __pa(base);
-               __flush_tsb_one_entry(base, vaddr, REAL_HPAGE_SHIFT, nentries);
+               __flush_huge_tsb_one_entry(base, vaddr, REAL_HPAGE_SHIFT,
+                                          nentries, hugepage_shift);
        }
 #endif
        spin_unlock_irqrestore(&mm->context.lock, flags);
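
__flush_huge_tsb_one_entry() sweeps every TSB slot a huge page can hash into, which is 1 << (hugepage_shift - hash_shift) consecutive entries. Assuming the sparc64 shifts used earlier (13 for the base-page TSB hash, 22 for the huge TSB hash):

#include <stdio.h>

static unsigned int tsb_slots(unsigned int hugepage_shift,
			      unsigned int hash_shift)
{
	return 1U << (hugepage_shift - hash_shift);
}

int main(void)
{
	printf("64K page in the base TSB:  %u slots\n", tsb_slots(16, 13));	/* 8 */
	printf("256M page in the huge TSB: %u slots\n", tsb_slots(28, 22));	/* 64 */
	return 0;
}
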
index 0715ce93daef42001407f690912a1b2a437e5a6e..58fc8684788d1f9fe7894c5afefa46c05692bf27 100644 (file)
@@ -69,50 +69,6 @@ config MQ_IOSCHED_DEADLINE
        ---help---
          MQ version of the deadline IO scheduler.
 
-config MQ_IOSCHED_NONE
-       bool
-       default y
-
-choice
-       prompt "Default single-queue blk-mq I/O scheduler"
-       default DEFAULT_SQ_NONE
-       help
-         Select the I/O scheduler which will be used by default for blk-mq
-         managed block devices with a single queue.
-
-       config DEFAULT_SQ_DEADLINE
-               bool "MQ Deadline" if MQ_IOSCHED_DEADLINE=y
-
-       config DEFAULT_SQ_NONE
-               bool "None"
-
-endchoice
-
-config DEFAULT_SQ_IOSCHED
-       string
-       default "mq-deadline" if DEFAULT_SQ_DEADLINE
-       default "none" if DEFAULT_SQ_NONE
-
-choice
-       prompt "Default multi-queue blk-mq I/O scheduler"
-       default DEFAULT_MQ_NONE
-       help
-         Select the I/O scheduler which will be used by default for blk-mq
-         managed block devices with multiple queues.
-
-       config DEFAULT_MQ_DEADLINE
-               bool "MQ Deadline" if MQ_IOSCHED_DEADLINE=y
-
-       config DEFAULT_MQ_NONE
-               bool "None"
-
-endchoice
-
-config DEFAULT_MQ_IOSCHED
-       string
-       default "mq-deadline" if DEFAULT_MQ_DEADLINE
-       default "none" if DEFAULT_MQ_NONE
-
 endmenu
 
 endif
index 4b564d0c3e29a4c15becf545a996794b296ae622..5eec5e08417f6ff1989e3e2a07b31c62901953d5 100644 (file)
@@ -625,21 +625,20 @@ struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
 }
 EXPORT_SYMBOL(bio_clone_fast);
 
-/**
- *     bio_clone_bioset - clone a bio
- *     @bio_src: bio to clone
- *     @gfp_mask: allocation priority
- *     @bs: bio_set to allocate from
- *
- *     Clone bio. Caller will own the returned bio, but not the actual data it
- *     points to. Reference count of returned bio will be one.
- */
-struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
-                            struct bio_set *bs)
+static struct bio *__bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
+                                     struct bio_set *bs, int offset,
+                                     int size)
 {
        struct bvec_iter iter;
        struct bio_vec bv;
        struct bio *bio;
+       struct bvec_iter iter_src = bio_src->bi_iter;
+
+       /* for supporting partial clone */
+       if (offset || size != bio_src->bi_iter.bi_size) {
+               bio_advance_iter(bio_src, &iter_src, offset);
+               iter_src.bi_size = size;
+       }
 
        /*
         * Pre immutable biovecs, __bio_clone() used to just do a memcpy from
@@ -663,7 +662,8 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
         *    __bio_clone_fast() anyways.
         */
 
-       bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
+       bio = bio_alloc_bioset(gfp_mask, __bio_segments(bio_src,
+                              &iter_src), bs);
        if (!bio)
                return NULL;
        bio->bi_bdev            = bio_src->bi_bdev;
@@ -680,7 +680,7 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
                bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
                break;
        default:
-               bio_for_each_segment(bv, bio_src, iter)
+               __bio_for_each_segment(bv, bio_src, iter, iter_src)
                        bio->bi_io_vec[bio->bi_vcnt++] = bv;
                break;
        }
@@ -699,8 +699,43 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
 
        return bio;
 }
+
+/**
+ *     bio_clone_bioset - clone a bio
+ *     @bio_src: bio to clone
+ *     @gfp_mask: allocation priority
+ *     @bs: bio_set to allocate from
+ *
+ *     Clone bio. Caller will own the returned bio, but not the actual data it
+ *     points to. Reference count of returned bio will be one.
+ */
+struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
+                            struct bio_set *bs)
+{
+       return __bio_clone_bioset(bio_src, gfp_mask, bs, 0,
+                                 bio_src->bi_iter.bi_size);
+}
 EXPORT_SYMBOL(bio_clone_bioset);
 
+/**
+ *     bio_clone_bioset_partial - clone a partial bio
+ *     @bio_src: bio to clone
+ *     @gfp_mask: allocation priority
+ *     @bs: bio_set to allocate from
+ *     @offset: cloned starting from the offset
+ *     @size: size for the cloned bio
+ *
+ *     Clone bio. Caller will own the returned bio, but not the actual data it
+ *     points to. Reference count of returned bio will be one.
+ */
+struct bio *bio_clone_bioset_partial(struct bio *bio_src, gfp_t gfp_mask,
+                                    struct bio_set *bs, int offset,
+                                    int size)
+{
+       return __bio_clone_bioset(bio_src, gfp_mask, bs, offset, size);
+}
+EXPORT_SYMBOL(bio_clone_bioset_partial);
+
 /**
  *     bio_add_pc_page -       attempt to add page to bio
  *     @q: the target queue
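
bio_clone_bioset_partial() clones only a byte sub-range of the source bio by advancing a private copy of the iterator by "offset" and trimming it to "size". A hypothetical caller, kernel context rather than a stand-alone program; the bio_set argument and the "second half" policy are invented for illustration:

static struct bio *clone_second_half(struct bio *src, struct bio_set *bs)
{
	int half = src->bi_iter.bi_size / 2;

	/* Clone bytes [half, bi_size) of src's payload. */
	return bio_clone_bioset_partial(src, GFP_NOIO, bs, half,
					src->bi_iter.bi_size - half);
}
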
index 9e8d6795a8c1be7eee1c727bb376f209da432ecf..98c7b061781e55f0176bfc24c6345c0c0611f1f3 100644 (file)
@@ -205,7 +205,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
         * needing a restart in that case.
         */
        if (!list_empty(&rq_list)) {
-               blk_mq_sched_mark_restart(hctx);
+               blk_mq_sched_mark_restart_hctx(hctx);
                did_work = blk_mq_dispatch_rq_list(hctx, &rq_list);
        } else if (!has_sched_dispatch) {
                blk_mq_flush_busy_ctxs(hctx, &rq_list);
@@ -331,20 +331,16 @@ static void blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
 
 void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx)
 {
+       struct request_queue *q = hctx->queue;
        unsigned int i;
 
-       if (!(hctx->flags & BLK_MQ_F_TAG_SHARED))
+       if (test_bit(QUEUE_FLAG_RESTART, &q->queue_flags)) {
+               if (test_and_clear_bit(QUEUE_FLAG_RESTART, &q->queue_flags)) {
+                       queue_for_each_hw_ctx(q, hctx, i)
+                               blk_mq_sched_restart_hctx(hctx);
+               }
+       } else {
                blk_mq_sched_restart_hctx(hctx);
-       else {
-               struct request_queue *q = hctx->queue;
-
-               if (!test_bit(QUEUE_FLAG_RESTART, &q->queue_flags))
-                       return;
-
-               clear_bit(QUEUE_FLAG_RESTART, &q->queue_flags);
-
-               queue_for_each_hw_ctx(q, hctx, i)
-                       blk_mq_sched_restart_hctx(hctx);
        }
 }
 
@@ -498,15 +494,6 @@ int blk_mq_sched_init(struct request_queue *q)
 {
        int ret;
 
-#if defined(CONFIG_DEFAULT_SQ_NONE)
-       if (q->nr_hw_queues == 1)
-               return 0;
-#endif
-#if defined(CONFIG_DEFAULT_MQ_NONE)
-       if (q->nr_hw_queues > 1)
-               return 0;
-#endif
-
        mutex_lock(&q->sysfs_lock);
        ret = elevator_init(q, NULL);
        mutex_unlock(&q->sysfs_lock);
index 7b5f3b95c78e93c41d7e680090fa67b638ea5be2..a75b16b123f7aadac672651a7eef5c79f5553e16 100644 (file)
@@ -122,17 +122,27 @@ static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx)
        return false;
 }
 
-static inline void blk_mq_sched_mark_restart(struct blk_mq_hw_ctx *hctx)
+/*
+ * Mark a hardware queue as needing a restart.
+ */
+static inline void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
 {
-       if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) {
+       if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
                set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
-               if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
-                       struct request_queue *q = hctx->queue;
+}
+
+/*
+ * Mark a hardware queue and the request queue it belongs to as needing a
+ * restart.
+ */
+static inline void blk_mq_sched_mark_restart_queue(struct blk_mq_hw_ctx *hctx)
+{
+       struct request_queue *q = hctx->queue;
 
-                       if (!test_bit(QUEUE_FLAG_RESTART, &q->queue_flags))
-                               set_bit(QUEUE_FLAG_RESTART, &q->queue_flags);
-               }
-       }
+       if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
+               set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+       if (!test_bit(QUEUE_FLAG_RESTART, &q->queue_flags))
+               set_bit(QUEUE_FLAG_RESTART, &q->queue_flags);
 }
 
 static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx)
index b29e7dc7b309e4cf939cd2c011b26b38dfd4df73..9e6b064e533979446a936c45c18f500c6f87725b 100644 (file)
@@ -904,6 +904,44 @@ static bool reorder_tags_to_front(struct list_head *list)
        return first != NULL;
 }
 
+static int blk_mq_dispatch_wake(wait_queue_t *wait, unsigned mode, int flags,
+                               void *key)
+{
+       struct blk_mq_hw_ctx *hctx;
+
+       hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
+
+       list_del(&wait->task_list);
+       clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state);
+       blk_mq_run_hw_queue(hctx, true);
+       return 1;
+}
+
+static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx)
+{
+       struct sbq_wait_state *ws;
+
+       /*
+        * The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait.
+        * The thread which wins the race to grab this bit adds the hardware
+        * queue to the wait queue.
+        */
+       if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) ||
+           test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state))
+               return false;
+
+       init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
+       ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx);
+
+       /*
+        * As soon as this returns, it's no longer safe to fiddle with
+        * hctx->dispatch_wait, since a completion can wake up the wait queue
+        * and unlock the bit.
+        */
+       add_wait_queue(&ws->wait, &hctx->dispatch_wait);
+       return true;
+}
+
 bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 {
        struct request_queue *q = hctx->queue;
@@ -931,15 +969,22 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
                                continue;
 
                        /*
-                        * We failed getting a driver tag. Mark the queue(s)
-                        * as needing a restart. Retry getting a tag again,
-                        * in case the needed IO completed right before we
-                        * marked the queue as needing a restart.
+                        * The initial allocation attempt failed, so we need to
+                        * rerun the hardware queue when a tag is freed.
                         */
-                       blk_mq_sched_mark_restart(hctx);
-                       if (!blk_mq_get_driver_tag(rq, &hctx, false))
+                       if (blk_mq_dispatch_wait_add(hctx)) {
+                               /*
+                                * It's possible that a tag was freed in the
+                                * window between the allocation failure and
+                                * adding the hardware queue to the wait queue.
+                                */
+                               if (!blk_mq_get_driver_tag(rq, &hctx, false))
+                                       break;
+                       } else {
                                break;
+                       }
                }
+
                list_del_init(&rq->queuelist);
 
                bd.rq = rq;
@@ -995,10 +1040,11 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
                 *
                 * blk_mq_run_hw_queue() already checks the STOPPED bit
                 *
-                * If RESTART is set, then let completion restart the queue
-                * instead of potentially looping here.
+                * If RESTART or TAG_WAITING is set, then let completion restart
+                * the queue instead of potentially looping here.
                 */
-               if (!blk_mq_sched_needs_restart(hctx))
+               if (!blk_mq_sched_needs_restart(hctx) &&
+                   !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state))
                        blk_mq_run_hw_queue(hctx, true);
        }
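
The dispatch path now waits for a tag explicitly instead of relying on the restart marking: on a failed allocation the hardware queue takes TAG_WAITING as a lock, hooks itself onto the tag wait queue, and retries once because a completion may have freed a tag in the window. The toy below models only that ordering (single-threaded, with the wait queue reduced to a flag); the names indicate which kernel calls they stand in for:

#include <stdbool.h>
#include <stdio.h>

static int free_tags;
static bool tag_waiting;
static bool registered;

static bool get_tag(void)
{
	if (free_tags == 0)
		return false;
	free_tags--;
	return true;
}

static bool dispatch_wait_add(void)
{
	if (tag_waiting)
		return false;	/* someone else already registered */
	tag_waiting = true;	/* test_and_set_bit_lock(TAG_WAITING) */
	registered = true;	/* add_wait_queue() */
	return true;
}

static void dispatch(void)
{
	if (get_tag()) {
		printf("dispatched immediately\n");
		return;
	}
	if (dispatch_wait_add() && get_tag()) {
		/* a tag was freed between the failure and the registration */
		printf("dispatched after re-check\n");
		return;
	}
	printf("parked until a completion wakes us\n");
}

static void complete_one(void)
{
	free_tags++;
	if (registered) {
		registered = false;
		tag_waiting = false;	/* clear_bit_unlock(TAG_WAITING) */
		dispatch();		/* blk_mq_run_hw_queue() */
	}
}

int main(void)
{
	dispatch();		/* no tags yet, so the queue parks */
	complete_one();		/* a completion frees a tag and re-runs dispatch */
	return 0;
}
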
 
index 699d10f71a2cac3f871bb6879f68f1fc5fafe714..ac1c9f481a9895525b98601cf96837fd5b4015b6 100644 (file)
@@ -220,17 +220,24 @@ int elevator_init(struct request_queue *q, char *name)
        }
 
        if (!e) {
-               if (q->mq_ops && q->nr_hw_queues == 1)
-                       e = elevator_get(CONFIG_DEFAULT_SQ_IOSCHED, false);
-               else if (q->mq_ops)
-                       e = elevator_get(CONFIG_DEFAULT_MQ_IOSCHED, false);
-               else
+               /*
+                * For blk-mq devices, we default to using mq-deadline,
+                * if available, for single queue devices. If deadline
+                * isn't available OR we have multiple queues, default
+                * to "none".
+                */
+               if (q->mq_ops) {
+                       if (q->nr_hw_queues == 1)
+                               e = elevator_get("mq-deadline", false);
+                       if (!e)
+                               return 0;
+               } else
                        e = elevator_get(CONFIG_DEFAULT_IOSCHED, false);
 
                if (!e) {
                        printk(KERN_ERR
                                "Default I/O scheduler not found. " \
-                               "Using noop/none.\n");
+                               "Using noop.\n");
                        e = elevator_get("noop", false);
                }
        }
index 3631cd4802955247d27316c10b172744700e6340..2f444b87a5f244db1bd65fea29eea1daae265c1a 100644 (file)
@@ -669,14 +669,14 @@ void del_gendisk(struct gendisk *disk)
        disk_part_iter_init(&piter, disk,
                             DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
        while ((part = disk_part_iter_next(&piter))) {
-               bdev_unhash_inode(MKDEV(disk->major,
-                                       disk->first_minor + part->partno));
                invalidate_partition(disk, part->partno);
+               bdev_unhash_inode(part_devt(part));
                delete_partition(disk, part->partno);
        }
        disk_part_iter_exit(&piter);
 
        invalidate_partition(disk, 0);
+       bdev_unhash_inode(disk_devt(disk));
        set_capacity(disk, 0);
        disk->flags &= ~GENHD_FL_UP;
 
index d1c52ba4d62dcf7b7a34870ec6b3095eed8b5c15..1e18dca360fc501033762d4c505c2e32c4674ee6 100644 (file)
 #define IO_BUFFER_LENGTH 2048
 #define MAX_TOKS 64
 
-typedef int (*opal_step)(struct opal_dev *dev);
+struct opal_step {
+       int (*fn)(struct opal_dev *dev, void *data);
+       void *data;
+};
+typedef int (cont_fn)(struct opal_dev *dev);
 
 enum opal_atom_width {
        OPAL_WIDTH_TINY,
@@ -80,9 +84,7 @@ struct opal_dev {
        void *data;
        sec_send_recv *send_recv;
 
-       const opal_step *funcs;
-       void **func_data;
-       int state;
+       const struct opal_step *steps;
        struct mutex dev_lock;
        u16 comid;
        u32 hsn;
@@ -213,8 +215,6 @@ static const u8 opalmethod[][OPAL_UID_LENGTH] = {
                { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x08, 0x03 },
 };
 
-typedef int (cont_fn)(struct opal_dev *dev);
-
 static int end_opal_session_error(struct opal_dev *dev);
 
 struct opal_suspend_data {
@@ -375,18 +375,18 @@ static void check_geometry(struct opal_dev *dev, const void *data)
 
 static int next(struct opal_dev *dev)
 {
-       opal_step func;
-       int error = 0;
+       const struct opal_step *step;
+       int state = 0, error = 0;
 
        do {
-               func = dev->funcs[dev->state];
-               if (!func)
+               step = &dev->steps[state];
+               if (!step->fn)
                        break;
 
-               error = func(dev);
+               error = step->fn(dev, step->data);
                if (error) {
                        pr_err("Error on step function: %d with error %d: %s\n",
-                              dev->state, error,
+                              state, error,
                               opal_error_to_human(error));
 
                        /* For each OPAL command we do a discovery0 then we
@@ -396,10 +396,13 @@ static int next(struct opal_dev *dev)
                         * session. Therefore we shouldn't attempt to terminate
                         * a session, as one has not yet been created.
                         */
-                       if (dev->state > 1)
-                               return end_opal_session_error(dev);
+                       if (state > 1) {
+                               end_opal_session_error(dev);
+                               return error;
+                       }
+
                }
-               dev->state++;
+               state++;
        } while (!error);
 
        return error;
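
With the state-indexed func_data array gone, every step carries its own argument, and next() simply walks dev->steps until it hits a NULL function. A sketch of how one command sequence might be expressed under this scheme; the exact table contents are an assumption, not copied from the patch:

static int example_revert(struct opal_dev *dev, struct opal_key *key)
{
	const struct opal_step revert_steps[] = {
		{ opal_discovery0, },
		{ start_SIDASP_opal_session, key },
		{ revert_tper, },	/* needs no per-step data */
		{ NULL, }		/* terminates the walk in next() */
	};

	dev->steps = revert_steps;
	return next(dev);
}

Each step receives its argument through step->data instead of indexing dev->func_data with a shared state counter.
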
@@ -411,10 +414,17 @@ static int opal_discovery0_end(struct opal_dev *dev)
        const struct d0_header *hdr = (struct d0_header *)dev->resp;
        const u8 *epos = dev->resp, *cpos = dev->resp;
        u16 comid = 0;
+       u32 hlen = be32_to_cpu(hdr->length);
+
+       print_buffer(dev->resp, hlen);
 
-       print_buffer(dev->resp, be32_to_cpu(hdr->length));
+       if (hlen > IO_BUFFER_LENGTH - sizeof(*hdr)) {
+               pr_warn("Discovery length overflows buffer (%zu+%u)/%u\n",
+                       sizeof(*hdr), hlen, IO_BUFFER_LENGTH);
+               return -EFAULT;
+       }
 
-       epos += be32_to_cpu(hdr->length); /* end of buffer */
+       epos += hlen; /* end of buffer */
        cpos += sizeof(*hdr); /* current position on buffer */
 
        while (cpos < epos && supported) {
@@ -476,7 +486,7 @@ static int opal_discovery0_end(struct opal_dev *dev)
        return 0;
 }
 
-static int opal_discovery0(struct opal_dev *dev)
+static int opal_discovery0(struct opal_dev *dev, void *data)
 {
        int ret;
 
@@ -662,52 +672,29 @@ static int cmd_finalize(struct opal_dev *cmd, u32 hsn, u32 tsn)
        return 0;
 }
 
-static enum opal_response_token token_type(const struct parsed_resp *resp,
-                                          int n)
-{
-       const struct opal_resp_tok *tok;
-
-       if (n >= resp->num) {
-               pr_err("Token number doesn't exist: %d, resp: %d\n",
-                      n, resp->num);
-               return OPAL_DTA_TOKENID_INVALID;
-       }
-
-       tok = &resp->toks[n];
-       if (tok->len == 0) {
-               pr_err("Token length must be non-zero\n");
-               return OPAL_DTA_TOKENID_INVALID;
-       }
-
-       return tok->type;
-}
-
-/*
- * This function returns 0 in case of invalid token. One should call
- * token_type() first to find out if the token is valid or not.
- */
-static enum opal_token response_get_token(const struct parsed_resp *resp,
-                                         int n)
+static const struct opal_resp_tok *response_get_token(
+                               const struct parsed_resp *resp,
+                               int n)
 {
        const struct opal_resp_tok *tok;
 
        if (n >= resp->num) {
                pr_err("Token number doesn't exist: %d, resp: %d\n",
                       n, resp->num);
-               return 0;
+               return ERR_PTR(-EINVAL);
        }
 
        tok = &resp->toks[n];
        if (tok->len == 0) {
                pr_err("Token length must be non-zero\n");
-               return 0;
+               return ERR_PTR(-EINVAL);
        }
 
-       return tok->pos[0];
+       return tok;
 }
 
-static size_t response_parse_tiny(struct opal_resp_tok *tok,
-                                 const u8 *pos)
+static ssize_t response_parse_tiny(struct opal_resp_tok *tok,
+                                  const u8 *pos)
 {
        tok->pos = pos;
        tok->len = 1;
@@ -723,8 +710,8 @@ static size_t response_parse_tiny(struct opal_resp_tok *tok,
        return tok->len;
 }
 
-static size_t response_parse_short(struct opal_resp_tok *tok,
-                                  const u8 *pos)
+static ssize_t response_parse_short(struct opal_resp_tok *tok,
+                                   const u8 *pos)
 {
        tok->pos = pos;
        tok->len = (pos[0] & SHORT_ATOM_LEN_MASK) + 1;
@@ -736,7 +723,7 @@ static size_t response_parse_short(struct opal_resp_tok *tok,
                tok->type = OPAL_DTA_TOKENID_SINT;
        } else {
                u64 u_integer = 0;
-               int i, b = 0;
+               ssize_t i, b = 0;
 
                tok->type = OPAL_DTA_TOKENID_UINT;
                if (tok->len > 9) {
@@ -753,8 +740,8 @@ static size_t response_parse_short(struct opal_resp_tok *tok,
        return tok->len;
 }
 
-static size_t response_parse_medium(struct opal_resp_tok *tok,
-                                   const u8 *pos)
+static ssize_t response_parse_medium(struct opal_resp_tok *tok,
+                                    const u8 *pos)
 {
        tok->pos = pos;
        tok->len = (((pos[0] & MEDIUM_ATOM_LEN_MASK) << 8) | pos[1]) + 2;
@@ -770,8 +757,8 @@ static size_t response_parse_medium(struct opal_resp_tok *tok,
        return tok->len;
 }
 
-static size_t response_parse_long(struct opal_resp_tok *tok,
-                                 const u8 *pos)
+static ssize_t response_parse_long(struct opal_resp_tok *tok,
+                                  const u8 *pos)
 {
        tok->pos = pos;
        tok->len = ((pos[1] << 16) | (pos[2] << 8) | pos[3]) + 4;
@@ -787,8 +774,8 @@ static size_t response_parse_long(struct opal_resp_tok *tok,
        return tok->len;
 }
 
-static size_t response_parse_token(struct opal_resp_tok *tok,
-                                  const u8 *pos)
+static ssize_t response_parse_token(struct opal_resp_tok *tok,
+                                   const u8 *pos)
 {
        tok->pos = pos;
        tok->len = 1;
@@ -805,8 +792,9 @@ static int response_parse(const u8 *buf, size_t length,
        struct opal_resp_tok *iter;
        int num_entries = 0;
        int total;
-       size_t token_length;
+       ssize_t token_length;
        const u8 *pos;
+       u32 clen, plen, slen;
 
        if (!buf)
                return -EFAULT;
@@ -818,17 +806,16 @@ static int response_parse(const u8 *buf, size_t length,
        pos = buf;
        pos += sizeof(*hdr);
 
-       pr_debug("Response size: cp: %d, pkt: %d, subpkt: %d\n",
-                be32_to_cpu(hdr->cp.length),
-                be32_to_cpu(hdr->pkt.length),
-                be32_to_cpu(hdr->subpkt.length));
-
-       if (hdr->cp.length == 0 || hdr->pkt.length == 0 ||
-           hdr->subpkt.length == 0) {
-               pr_err("Bad header length. cp: %d, pkt: %d, subpkt: %d\n",
-                      be32_to_cpu(hdr->cp.length),
-                      be32_to_cpu(hdr->pkt.length),
-                      be32_to_cpu(hdr->subpkt.length));
+       clen = be32_to_cpu(hdr->cp.length);
+       plen = be32_to_cpu(hdr->pkt.length);
+       slen = be32_to_cpu(hdr->subpkt.length);
+       pr_debug("Response size: cp: %u, pkt: %u, subpkt: %u\n",
+                clen, plen, slen);
+
+       if (clen == 0 || plen == 0 || slen == 0 ||
+           slen > IO_BUFFER_LENGTH - sizeof(*hdr)) {
+               pr_err("Bad header length. cp: %u, pkt: %u, subpkt: %u\n",
+                      clen, plen, slen);
                print_buffer(pos, sizeof(*hdr));
                return -EINVAL;
        }
@@ -837,7 +824,7 @@ static int response_parse(const u8 *buf, size_t length,
                return -EFAULT;
 
        iter = resp->toks;
-       total = be32_to_cpu(hdr->subpkt.length);
+       total = slen;
        print_buffer(pos, total);
        while (total > 0) {
                if (pos[0] <= TINY_ATOM_BYTE) /* tiny atom */
@@ -851,8 +838,8 @@ static int response_parse(const u8 *buf, size_t length,
                else /* TOKEN */
                        token_length = response_parse_token(iter, pos);
 
-               if (token_length == -EINVAL)
-                       return -EINVAL;
+               if (token_length < 0)
+                       return token_length;
 
                pos += token_length;
                total -= token_length;
@@ -922,20 +909,32 @@ static u64 response_get_u64(const struct parsed_resp *resp, int n)
        return resp->toks[n].stored.u;
 }
 
+static bool response_token_matches(const struct opal_resp_tok *token, u8 match)
+{
+       if (IS_ERR(token) ||
+           token->type != OPAL_DTA_TOKENID_TOKEN ||
+           token->pos[0] != match)
+               return false;
+       return true;
+}
+
 static u8 response_status(const struct parsed_resp *resp)
 {
-       if (token_type(resp, 0) == OPAL_DTA_TOKENID_TOKEN &&
-           response_get_token(resp, 0) == OPAL_ENDOFSESSION) {
+       const struct opal_resp_tok *tok;
+
+       tok = response_get_token(resp, 0);
+       if (response_token_matches(tok, OPAL_ENDOFSESSION))
                return 0;
-       }
 
        if (resp->num < 5)
                return DTAERROR_NO_METHOD_STATUS;
 
-       if (token_type(resp, resp->num - 1) != OPAL_DTA_TOKENID_TOKEN ||
-           token_type(resp, resp->num - 5) != OPAL_DTA_TOKENID_TOKEN ||
-           response_get_token(resp, resp->num - 1) != OPAL_ENDLIST ||
-           response_get_token(resp, resp->num - 5) != OPAL_STARTLIST)
+       tok = response_get_token(resp, resp->num - 5);
+       if (!response_token_matches(tok, OPAL_STARTLIST))
+               return DTAERROR_NO_METHOD_STATUS;
+
+       tok = response_get_token(resp, resp->num - 1);
+       if (!response_token_matches(tok, OPAL_ENDLIST))
                return DTAERROR_NO_METHOD_STATUS;
 
        return response_get_u64(resp, resp->num - 4);
@@ -1022,7 +1021,7 @@ static int finalize_and_send(struct opal_dev *dev, cont_fn cont)
        return opal_send_recv(dev, cont);
 }
 
-static int gen_key(struct opal_dev *dev)
+static int gen_key(struct opal_dev *dev, void *data)
 {
        const u8 *method;
        u8 uid[OPAL_UID_LENGTH];
@@ -1076,15 +1075,14 @@ static int get_active_key_cont(struct opal_dev *dev)
        return 0;
 }
 
-static int get_active_key(struct opal_dev *dev)
+static int get_active_key(struct opal_dev *dev, void *data)
 {
        u8 uid[OPAL_UID_LENGTH];
        int err = 0;
-       u8 *lr;
+       u8 *lr = data;
 
        clear_opal_cmd(dev);
        set_comid(dev, dev->comid);
-       lr = dev->func_data[dev->state];
 
        err = build_locking_range(uid, sizeof(uid), *lr);
        if (err)
@@ -1167,17 +1165,16 @@ static inline int enable_global_lr(struct opal_dev *dev, u8 *uid,
        return err;
 }
 
-static int setup_locking_range(struct opal_dev *dev)
+static int setup_locking_range(struct opal_dev *dev, void *data)
 {
        u8 uid[OPAL_UID_LENGTH];
-       struct opal_user_lr_setup *setup;
+       struct opal_user_lr_setup *setup = data;
        u8 lr;
        int err = 0;
 
        clear_opal_cmd(dev);
        set_comid(dev, dev->comid);
 
-       setup = dev->func_data[dev->state];
        lr = setup->session.opal_key.lr;
        err = build_locking_range(uid, sizeof(uid), lr);
        if (err)
@@ -1290,20 +1287,19 @@ static int start_generic_opal_session(struct opal_dev *dev,
        return finalize_and_send(dev, start_opal_session_cont);
 }
 
-static int start_anybodyASP_opal_session(struct opal_dev *dev)
+static int start_anybodyASP_opal_session(struct opal_dev *dev, void *data)
 {
        return start_generic_opal_session(dev, OPAL_ANYBODY_UID,
                                          OPAL_ADMINSP_UID, NULL, 0);
 }
 
-static int start_SIDASP_opal_session(struct opal_dev *dev)
+static int start_SIDASP_opal_session(struct opal_dev *dev, void *data)
 {
        int ret;
        const u8 *key = dev->prev_data;
-       struct opal_key *okey;
 
        if (!key) {
-               okey = dev->func_data[dev->state];
+               const struct opal_key *okey = data;
                ret = start_generic_opal_session(dev, OPAL_SID_UID,
                                                 OPAL_ADMINSP_UID,
                                                 okey->key,
@@ -1318,22 +1314,21 @@ static int start_SIDASP_opal_session(struct opal_dev *dev)
        return ret;
 }
 
-static inline int start_admin1LSP_opal_session(struct opal_dev *dev)
+static int start_admin1LSP_opal_session(struct opal_dev *dev, void *data)
 {
-       struct opal_key *key = dev->func_data[dev->state];
-
+       struct opal_key *key = data;
        return start_generic_opal_session(dev, OPAL_ADMIN1_UID,
                                          OPAL_LOCKINGSP_UID,
                                          key->key, key->key_len);
 }
 
-static int start_auth_opal_session(struct opal_dev *dev)
+static int start_auth_opal_session(struct opal_dev *dev, void *data)
 {
+       struct opal_session_info *session = data;
        u8 lk_ul_user[OPAL_UID_LENGTH];
+       size_t keylen = session->opal_key.key_len;
        int err = 0;
 
-       struct opal_session_info *session = dev->func_data[dev->state];
-       size_t keylen = session->opal_key.key_len;
        u8 *key = session->opal_key.key;
        u32 hsn = GENERIC_HOST_SESSION_NUM;
 
@@ -1383,7 +1378,7 @@ static int start_auth_opal_session(struct opal_dev *dev)
        return finalize_and_send(dev, start_opal_session_cont);
 }
 
-static int revert_tper(struct opal_dev *dev)
+static int revert_tper(struct opal_dev *dev, void *data)
 {
        int err = 0;
 
@@ -1405,9 +1400,9 @@ static int revert_tper(struct opal_dev *dev)
        return finalize_and_send(dev, parse_and_check_status);
 }
 
-static int internal_activate_user(struct opal_dev *dev)
+static int internal_activate_user(struct opal_dev *dev, void *data)
 {
-       struct opal_session_info *session = dev->func_data[dev->state];
+       struct opal_session_info *session = data;
        u8 uid[OPAL_UID_LENGTH];
        int err = 0;
 
@@ -1440,15 +1435,14 @@ static int internal_activate_user(struct opal_dev *dev)
        return finalize_and_send(dev, parse_and_check_status);
 }
 
-static int erase_locking_range(struct opal_dev *dev)
+static int erase_locking_range(struct opal_dev *dev, void *data)
 {
-       struct opal_session_info *session;
+       struct opal_session_info *session = data;
        u8 uid[OPAL_UID_LENGTH];
        int err = 0;
 
        clear_opal_cmd(dev);
        set_comid(dev, dev->comid);
-       session = dev->func_data[dev->state];
 
        if (build_locking_range(uid, sizeof(uid), session->opal_key.lr) < 0)
                return -ERANGE;
@@ -1467,9 +1461,9 @@ static int erase_locking_range(struct opal_dev *dev)
        return finalize_and_send(dev, parse_and_check_status);
 }
 
-static int set_mbr_done(struct opal_dev *dev)
+static int set_mbr_done(struct opal_dev *dev, void *data)
 {
-       u8 mbr_done_tf = *(u8 *)dev->func_data[dev->state];
+       u8 *mbr_done_tf = data;
        int err = 0;
 
        clear_opal_cmd(dev);
@@ -1485,7 +1479,7 @@ static int set_mbr_done(struct opal_dev *dev)
        add_token_u8(&err, dev, OPAL_STARTLIST);
        add_token_u8(&err, dev, OPAL_STARTNAME);
        add_token_u8(&err, dev, 2); /* Done */
-       add_token_u8(&err, dev, mbr_done_tf); /* Done T or F */
+       add_token_u8(&err, dev, *mbr_done_tf); /* Done T or F */
        add_token_u8(&err, dev, OPAL_ENDNAME);
        add_token_u8(&err, dev, OPAL_ENDLIST);
        add_token_u8(&err, dev, OPAL_ENDNAME);
@@ -1499,9 +1493,9 @@ static int set_mbr_done(struct opal_dev *dev)
        return finalize_and_send(dev, parse_and_check_status);
 }
 
-static int set_mbr_enable_disable(struct opal_dev *dev)
+static int set_mbr_enable_disable(struct opal_dev *dev, void *data)
 {
-       u8 mbr_en_dis = *(u8 *)dev->func_data[dev->state];
+       u8 *mbr_en_dis = data;
        int err = 0;
 
        clear_opal_cmd(dev);
@@ -1517,7 +1511,7 @@ static int set_mbr_enable_disable(struct opal_dev *dev)
        add_token_u8(&err, dev, OPAL_STARTLIST);
        add_token_u8(&err, dev, OPAL_STARTNAME);
        add_token_u8(&err, dev, 1);
-       add_token_u8(&err, dev, mbr_en_dis);
+       add_token_u8(&err, dev, *mbr_en_dis);
        add_token_u8(&err, dev, OPAL_ENDNAME);
        add_token_u8(&err, dev, OPAL_ENDLIST);
        add_token_u8(&err, dev, OPAL_ENDNAME);
@@ -1558,11 +1552,10 @@ static int generic_pw_cmd(u8 *key, size_t key_len, u8 *cpin_uid,
        return err;
 }
 
-static int set_new_pw(struct opal_dev *dev)
+static int set_new_pw(struct opal_dev *dev, void *data)
 {
        u8 cpin_uid[OPAL_UID_LENGTH];
-       struct opal_session_info *usr = dev->func_data[dev->state];
-
+       struct opal_session_info *usr = data;
 
        memcpy(cpin_uid, opaluid[OPAL_C_PIN_ADMIN1], OPAL_UID_LENGTH);
 
@@ -1583,10 +1576,10 @@ static int set_new_pw(struct opal_dev *dev)
        return finalize_and_send(dev, parse_and_check_status);
 }
 
-static int set_sid_cpin_pin(struct opal_dev *dev)
+static int set_sid_cpin_pin(struct opal_dev *dev, void *data)
 {
        u8 cpin_uid[OPAL_UID_LENGTH];
-       struct opal_key *key = dev->func_data[dev->state];
+       struct opal_key *key = data;
 
        memcpy(cpin_uid, opaluid[OPAL_C_PIN_SID], OPAL_UID_LENGTH);
 
@@ -1597,18 +1590,16 @@ static int set_sid_cpin_pin(struct opal_dev *dev)
        return finalize_and_send(dev, parse_and_check_status);
 }
 
-static int add_user_to_lr(struct opal_dev *dev)
+static int add_user_to_lr(struct opal_dev *dev, void *data)
 {
        u8 lr_buffer[OPAL_UID_LENGTH];
        u8 user_uid[OPAL_UID_LENGTH];
-       struct opal_lock_unlock *lkul;
+       struct opal_lock_unlock *lkul = data;
        int err = 0;
 
        clear_opal_cmd(dev);
        set_comid(dev, dev->comid);
 
-       lkul = dev->func_data[dev->state];
-
        memcpy(lr_buffer, opaluid[OPAL_LOCKINGRANGE_ACE_RDLOCKED],
               OPAL_UID_LENGTH);
 
@@ -1675,11 +1666,11 @@ static int add_user_to_lr(struct opal_dev *dev)
        return finalize_and_send(dev, parse_and_check_status);
 }
 
-static int lock_unlock_locking_range(struct opal_dev *dev)
+static int lock_unlock_locking_range(struct opal_dev *dev, void *data)
 {
        u8 lr_buffer[OPAL_UID_LENGTH];
        const u8 *method;
-       struct opal_lock_unlock *lkul;
+       struct opal_lock_unlock *lkul = data;
        u8 read_locked = 1, write_locked = 1;
        int err = 0;
 
@@ -1687,7 +1678,6 @@ static int lock_unlock_locking_range(struct opal_dev *dev)
        set_comid(dev, dev->comid);
 
        method = opalmethod[OPAL_SET];
-       lkul = dev->func_data[dev->state];
        if (build_locking_range(lr_buffer, sizeof(lr_buffer),
                                lkul->session.opal_key.lr) < 0)
                return -ERANGE;
@@ -1739,19 +1729,18 @@ static int lock_unlock_locking_range(struct opal_dev *dev)
 }
 
 
-static int lock_unlock_locking_range_sum(struct opal_dev *dev)
+static int lock_unlock_locking_range_sum(struct opal_dev *dev, void *data)
 {
        u8 lr_buffer[OPAL_UID_LENGTH];
        u8 read_locked = 1, write_locked = 1;
        const u8 *method;
-       struct opal_lock_unlock *lkul;
+       struct opal_lock_unlock *lkul = data;
        int ret;
 
        clear_opal_cmd(dev);
        set_comid(dev, dev->comid);
 
        method = opalmethod[OPAL_SET];
-       lkul = dev->func_data[dev->state];
        if (build_locking_range(lr_buffer, sizeof(lr_buffer),
                                lkul->session.opal_key.lr) < 0)
                return -ERANGE;
@@ -1782,9 +1771,9 @@ static int lock_unlock_locking_range_sum(struct opal_dev *dev)
        return finalize_and_send(dev, parse_and_check_status);
 }
 
-static int activate_lsp(struct opal_dev *dev)
+static int activate_lsp(struct opal_dev *dev, void *data)
 {
-       struct opal_lr_act *opal_act;
+       struct opal_lr_act *opal_act = data;
        u8 user_lr[OPAL_UID_LENGTH];
        u8 uint_3 = 0x83;
        int err = 0, i;
@@ -1792,8 +1781,6 @@ static int activate_lsp(struct opal_dev *dev)
        clear_opal_cmd(dev);
        set_comid(dev, dev->comid);
 
-       opal_act = dev->func_data[dev->state];
-
        add_token_u8(&err, dev, OPAL_CALL);
        add_token_bytestring(&err, dev, opaluid[OPAL_LOCKINGSP_UID],
                             OPAL_UID_LENGTH);
@@ -1858,7 +1845,7 @@ static int get_lsp_lifecycle_cont(struct opal_dev *dev)
 }
 
 /* Determine if we're in the Manufactured Inactive or Active state */
-static int get_lsp_lifecycle(struct opal_dev *dev)
+static int get_lsp_lifecycle(struct opal_dev *dev, void *data)
 {
        int err = 0;
 
@@ -1919,14 +1906,13 @@ static int get_msid_cpin_pin_cont(struct opal_dev *dev)
        return 0;
 }
 
-static int get_msid_cpin_pin(struct opal_dev *dev)
+static int get_msid_cpin_pin(struct opal_dev *dev, void *data)
 {
        int err = 0;
 
        clear_opal_cmd(dev);
        set_comid(dev, dev->comid);
 
-
        add_token_u8(&err, dev, OPAL_CALL);
        add_token_bytestring(&err, dev, opaluid[OPAL_C_PIN_MSID],
                             OPAL_UID_LENGTH);
@@ -1956,64 +1942,76 @@ static int get_msid_cpin_pin(struct opal_dev *dev)
        return finalize_and_send(dev, get_msid_cpin_pin_cont);
 }
 
-static int build_end_opal_session(struct opal_dev *dev)
+static int end_opal_session(struct opal_dev *dev, void *data)
 {
        int err = 0;
 
        clear_opal_cmd(dev);
-
        set_comid(dev, dev->comid);
        add_token_u8(&err, dev, OPAL_ENDOFSESSION);
-       return err;
-}
 
-static int end_opal_session(struct opal_dev *dev)
-{
-       int ret = build_end_opal_session(dev);
-
-       if (ret < 0)
-               return ret;
+       if (err < 0)
+               return err;
        return finalize_and_send(dev, end_session_cont);
 }
 
 static int end_opal_session_error(struct opal_dev *dev)
 {
-       const opal_step error_end_session[] = {
-               end_opal_session,
-               NULL,
+       const struct opal_step error_end_session[] = {
+               { end_opal_session, },
+               { NULL, }
        };
-       dev->funcs = error_end_session;
-       dev->state = 0;
+       dev->steps = error_end_session;
        return next(dev);
 }
 
 static inline void setup_opal_dev(struct opal_dev *dev,
-                                 const opal_step *funcs)
+                                 const struct opal_step *steps)
 {
-       dev->state = 0;
-       dev->funcs = funcs;
+       dev->steps = steps;
        dev->tsn = 0;
        dev->hsn = 0;
-       dev->func_data = NULL;
        dev->prev_data = NULL;
 }
 
 static int check_opal_support(struct opal_dev *dev)
 {
-       static const opal_step funcs[] = {
-               opal_discovery0,
-               NULL
+       const struct opal_step steps[] = {
+               { opal_discovery0, },
+               { NULL, }
        };
        int ret;
 
        mutex_lock(&dev->dev_lock);
-       setup_opal_dev(dev, funcs);
+       setup_opal_dev(dev, steps);
        ret = next(dev);
        dev->supported = !ret;
        mutex_unlock(&dev->dev_lock);
        return ret;
 }
 
+static void clean_opal_dev(struct opal_dev *dev)
+{
+
+       struct opal_suspend_data *suspend, *next;
+
+       mutex_lock(&dev->dev_lock);
+       list_for_each_entry_safe(suspend, next, &dev->unlk_lst, node) {
+               list_del(&suspend->node);
+               kfree(suspend);
+       }
+       mutex_unlock(&dev->dev_lock);
+}
+
+void free_opal_dev(struct opal_dev *dev)
+{
+       if (!dev)
+               return;
+       clean_opal_dev(dev);
+       kfree(dev);
+}
+EXPORT_SYMBOL(free_opal_dev);
+
 struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv)
 {
        struct opal_dev *dev;
@@ -2038,24 +2036,18 @@ EXPORT_SYMBOL(init_opal_dev);
 static int opal_secure_erase_locking_range(struct opal_dev *dev,
                                           struct opal_session_info *opal_session)
 {
-       void *data[3] = { NULL };
-       static const opal_step erase_funcs[] = {
-               opal_discovery0,
-               start_auth_opal_session,
-               get_active_key,
-               gen_key,
-               end_opal_session,
-               NULL,
+       const struct opal_step erase_steps[] = {
+               { opal_discovery0, },
+               { start_auth_opal_session, opal_session },
+               { get_active_key, &opal_session->opal_key.lr },
+               { gen_key, },
+               { end_opal_session, },
+               { NULL, }
        };
        int ret;
 
        mutex_lock(&dev->dev_lock);
-       setup_opal_dev(dev, erase_funcs);
-
-       dev->func_data = data;
-       dev->func_data[1] = opal_session;
-       dev->func_data[2] = &opal_session->opal_key.lr;
-
+       setup_opal_dev(dev, erase_steps);
        ret = next(dev);
        mutex_unlock(&dev->dev_lock);
        return ret;
@@ -2064,23 +2056,17 @@ static int opal_secure_erase_locking_range(struct opal_dev *dev,
 static int opal_erase_locking_range(struct opal_dev *dev,
                                    struct opal_session_info *opal_session)
 {
-       void *data[3] = { NULL };
-       static const opal_step erase_funcs[] = {
-               opal_discovery0,
-               start_auth_opal_session,
-               erase_locking_range,
-               end_opal_session,
-               NULL,
+       const struct opal_step erase_steps[] = {
+               { opal_discovery0, },
+               { start_auth_opal_session, opal_session },
+               { erase_locking_range, opal_session },
+               { end_opal_session, },
+               { NULL, }
        };
        int ret;
 
        mutex_lock(&dev->dev_lock);
-       setup_opal_dev(dev, erase_funcs);
-
-       dev->func_data = data;
-       dev->func_data[1] = opal_session;
-       dev->func_data[2] = opal_session;
-
+       setup_opal_dev(dev, erase_steps);
        ret = next(dev);
        mutex_unlock(&dev->dev_lock);
        return ret;
@@ -2089,16 +2075,15 @@ static int opal_erase_locking_range(struct opal_dev *dev,
 static int opal_enable_disable_shadow_mbr(struct opal_dev *dev,
                                          struct opal_mbr_data *opal_mbr)
 {
-       void *func_data[6] = { NULL };
-       static const opal_step mbr_funcs[] = {
-               opal_discovery0,
-               start_admin1LSP_opal_session,
-               set_mbr_done,
-               end_opal_session,
-               start_admin1LSP_opal_session,
-               set_mbr_enable_disable,
-               end_opal_session,
-               NULL,
+       const struct opal_step mbr_steps[] = {
+               { opal_discovery0, },
+               { start_admin1LSP_opal_session, &opal_mbr->key },
+               { set_mbr_done, &opal_mbr->enable_disable },
+               { end_opal_session, },
+               { start_admin1LSP_opal_session, &opal_mbr->key },
+               { set_mbr_enable_disable, &opal_mbr->enable_disable },
+               { end_opal_session, },
+               { NULL, }
        };
        int ret;
 
@@ -2107,12 +2092,7 @@ static int opal_enable_disable_shadow_mbr(struct opal_dev *dev,
                return -EINVAL;
 
        mutex_lock(&dev->dev_lock);
-       setup_opal_dev(dev, mbr_funcs);
-       dev->func_data = func_data;
-       dev->func_data[1] = &opal_mbr->key;
-       dev->func_data[2] = &opal_mbr->enable_disable;
-       dev->func_data[4] = &opal_mbr->key;
-       dev->func_data[5] = &opal_mbr->enable_disable;
+       setup_opal_dev(dev, mbr_steps);
        ret = next(dev);
        mutex_unlock(&dev->dev_lock);
        return ret;
@@ -2139,13 +2119,12 @@ static int opal_save(struct opal_dev *dev, struct opal_lock_unlock *lk_unlk)
 static int opal_add_user_to_lr(struct opal_dev *dev,
                               struct opal_lock_unlock *lk_unlk)
 {
-       void *func_data[3] = { NULL };
-       static const opal_step funcs[] = {
-               opal_discovery0,
-               start_admin1LSP_opal_session,
-               add_user_to_lr,
-               end_opal_session,
-               NULL
+       const struct opal_step steps[] = {
+               { opal_discovery0, },
+               { start_admin1LSP_opal_session, &lk_unlk->session.opal_key },
+               { add_user_to_lr, lk_unlk },
+               { end_opal_session, },
+               { NULL, }
        };
        int ret;
 
@@ -2167,10 +2146,7 @@ static int opal_add_user_to_lr(struct opal_dev *dev,
        }
 
        mutex_lock(&dev->dev_lock);
-       setup_opal_dev(dev, funcs);
-       dev->func_data = func_data;
-       dev->func_data[1] = &lk_unlk->session.opal_key;
-       dev->func_data[2] = lk_unlk;
+       setup_opal_dev(dev, steps);
        ret = next(dev);
        mutex_unlock(&dev->dev_lock);
        return ret;
@@ -2178,55 +2154,54 @@ static int opal_add_user_to_lr(struct opal_dev *dev,
 
 static int opal_reverttper(struct opal_dev *dev, struct opal_key *opal)
 {
-       void *data[2] = { NULL };
-       static const opal_step revert_funcs[] = {
-               opal_discovery0,
-               start_SIDASP_opal_session,
-               revert_tper, /* controller will terminate session */
-               NULL,
+       const struct opal_step revert_steps[] = {
+               { opal_discovery0, },
+               { start_SIDASP_opal_session, opal },
+               { revert_tper, }, /* controller will terminate session */
+               { NULL, }
        };
        int ret;
 
        mutex_lock(&dev->dev_lock);
-       setup_opal_dev(dev, revert_funcs);
-       dev->func_data = data;
-       dev->func_data[1] = opal;
+       setup_opal_dev(dev, revert_steps);
        ret = next(dev);
        mutex_unlock(&dev->dev_lock);
-       return ret;
-}
 
-static int __opal_lock_unlock_sum(struct opal_dev *dev)
-{
-       static const opal_step ulk_funcs_sum[] = {
-               opal_discovery0,
-               start_auth_opal_session,
-               lock_unlock_locking_range_sum,
-               end_opal_session,
-               NULL
-       };
+       /*
+        * If we successfully reverted lets clean
+        * any saved locking ranges.
+        */
+       if (!ret)
+               clean_opal_dev(dev);
 
-       dev->funcs = ulk_funcs_sum;
-       return next(dev);
+       return ret;
 }
 
-static int __opal_lock_unlock(struct opal_dev *dev)
+static int __opal_lock_unlock(struct opal_dev *dev,
+                             struct opal_lock_unlock *lk_unlk)
 {
-       static const opal_step _unlock_funcs[] = {
-               opal_discovery0,
-               start_auth_opal_session,
-               lock_unlock_locking_range,
-               end_opal_session,
-               NULL
+       const struct opal_step unlock_steps[] = {
+               { opal_discovery0, },
+               { start_auth_opal_session, &lk_unlk->session },
+               { lock_unlock_locking_range, lk_unlk },
+               { end_opal_session, },
+               { NULL, }
+       };
+       const struct opal_step unlock_sum_steps[] = {
+               { opal_discovery0, },
+               { start_auth_opal_session, &lk_unlk->session },
+               { lock_unlock_locking_range_sum, lk_unlk },
+               { end_opal_session, },
+               { NULL, }
        };
 
-       dev->funcs = _unlock_funcs;
+       dev->steps = lk_unlk->session.sum ? unlock_sum_steps : unlock_steps;
        return next(dev);
 }
 
-static int opal_lock_unlock(struct opal_dev *dev, struct opal_lock_unlock *lk_unlk)
+static int opal_lock_unlock(struct opal_dev *dev,
+                           struct opal_lock_unlock *lk_unlk)
 {
-       void *func_data[3] = { NULL };
        int ret;
 
        if (lk_unlk->session.who < OPAL_ADMIN1 ||
@@ -2234,43 +2209,30 @@ static int opal_lock_unlock(struct opal_dev *dev, struct opal_lock_unlock *lk_un
                return -EINVAL;
 
        mutex_lock(&dev->dev_lock);
-       setup_opal_dev(dev, NULL);
-       dev->func_data = func_data;
-       dev->func_data[1] = &lk_unlk->session;
-       dev->func_data[2] = lk_unlk;
-
-       if (lk_unlk->session.sum)
-               ret = __opal_lock_unlock_sum(dev);
-       else
-               ret = __opal_lock_unlock(dev);
-
+       ret = __opal_lock_unlock(dev, lk_unlk);
        mutex_unlock(&dev->dev_lock);
        return ret;
 }
 
 static int opal_take_ownership(struct opal_dev *dev, struct opal_key *opal)
 {
-       static const opal_step owner_funcs[] = {
-               opal_discovery0,
-               start_anybodyASP_opal_session,
-               get_msid_cpin_pin,
-               end_opal_session,
-               start_SIDASP_opal_session,
-               set_sid_cpin_pin,
-               end_opal_session,
-               NULL
+       const struct opal_step owner_steps[] = {
+               { opal_discovery0, },
+               { start_anybodyASP_opal_session, },
+               { get_msid_cpin_pin, },
+               { end_opal_session, },
+               { start_SIDASP_opal_session, opal },
+               { set_sid_cpin_pin, opal },
+               { end_opal_session, },
+               { NULL, }
        };
-       void *data[6] = { NULL };
        int ret;
 
        if (!dev)
                return -ENODEV;
 
        mutex_lock(&dev->dev_lock);
-       setup_opal_dev(dev, owner_funcs);
-       dev->func_data = data;
-       dev->func_data[4] = opal;
-       dev->func_data[5] = opal;
+       setup_opal_dev(dev, owner_steps);
        ret = next(dev);
        mutex_unlock(&dev->dev_lock);
        return ret;
@@ -2278,14 +2240,13 @@ static int opal_take_ownership(struct opal_dev *dev, struct opal_key *opal)
 
 static int opal_activate_lsp(struct opal_dev *dev, struct opal_lr_act *opal_lr_act)
 {
-       void *data[4] = { NULL };
-       static const opal_step active_funcs[] = {
-               opal_discovery0,
-               start_SIDASP_opal_session, /* Open session as SID auth */
-               get_lsp_lifecycle,
-               activate_lsp,
-               end_opal_session,
-               NULL
+       const struct opal_step active_steps[] = {
+               { opal_discovery0, },
+               { start_SIDASP_opal_session, &opal_lr_act->key },
+               { get_lsp_lifecycle, },
+               { activate_lsp, opal_lr_act },
+               { end_opal_session, },
+               { NULL, }
        };
        int ret;
 
@@ -2293,10 +2254,7 @@ static int opal_activate_lsp(struct opal_dev *dev, struct opal_lr_act *opal_lr_a
                return -EINVAL;
 
        mutex_lock(&dev->dev_lock);
-       setup_opal_dev(dev, active_funcs);
-       dev->func_data = data;
-       dev->func_data[1] = &opal_lr_act->key;
-       dev->func_data[3] = opal_lr_act;
+       setup_opal_dev(dev, active_steps);
        ret = next(dev);
        mutex_unlock(&dev->dev_lock);
        return ret;
@@ -2305,21 +2263,17 @@ static int opal_activate_lsp(struct opal_dev *dev, struct opal_lr_act *opal_lr_a
 static int opal_setup_locking_range(struct opal_dev *dev,
                                    struct opal_user_lr_setup *opal_lrs)
 {
-       void *data[3] = { NULL };
-       static const opal_step lr_funcs[] = {
-               opal_discovery0,
-               start_auth_opal_session,
-               setup_locking_range,
-               end_opal_session,
-               NULL,
+       const struct opal_step lr_steps[] = {
+               { opal_discovery0, },
+               { start_auth_opal_session, &opal_lrs->session },
+               { setup_locking_range, opal_lrs },
+               { end_opal_session, },
+               { NULL, }
        };
        int ret;
 
        mutex_lock(&dev->dev_lock);
-       setup_opal_dev(dev, lr_funcs);
-       dev->func_data = data;
-       dev->func_data[1] = &opal_lrs->session;
-       dev->func_data[2] = opal_lrs;
+       setup_opal_dev(dev, lr_steps);
        ret = next(dev);
        mutex_unlock(&dev->dev_lock);
        return ret;
@@ -2327,14 +2281,13 @@ static int opal_setup_locking_range(struct opal_dev *dev,
 
 static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw)
 {
-       static const opal_step pw_funcs[] = {
-               opal_discovery0,
-               start_auth_opal_session,
-               set_new_pw,
-               end_opal_session,
-               NULL
+       const struct opal_step pw_steps[] = {
+               { opal_discovery0, },
+               { start_auth_opal_session, &opal_pw->session },
+               { set_new_pw, &opal_pw->new_user_pw },
+               { end_opal_session, },
+               { NULL }
        };
-       void *data[3] = { NULL };
        int ret;
 
        if (opal_pw->session.who < OPAL_ADMIN1 ||
@@ -2344,11 +2297,7 @@ static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw)
                return -EINVAL;
 
        mutex_lock(&dev->dev_lock);
-       setup_opal_dev(dev, pw_funcs);
-       dev->func_data = data;
-       dev->func_data[1] = (void *) &opal_pw->session;
-       dev->func_data[2] = (void *) &opal_pw->new_user_pw;
-
+       setup_opal_dev(dev, pw_steps);
        ret = next(dev);
        mutex_unlock(&dev->dev_lock);
        return ret;
@@ -2357,14 +2306,13 @@ static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw)
 static int opal_activate_user(struct opal_dev *dev,
                              struct opal_session_info *opal_session)
 {
-       static const opal_step act_funcs[] = {
-               opal_discovery0,
-               start_admin1LSP_opal_session,
-               internal_activate_user,
-               end_opal_session,
-               NULL
+       const struct opal_step act_steps[] = {
+               { opal_discovery0, },
+               { start_admin1LSP_opal_session, &opal_session->opal_key },
+               { internal_activate_user, opal_session },
+               { end_opal_session, },
+               { NULL, }
        };
-       void *data[3] = { NULL };
        int ret;
 
        /* We can't activate Admin1 it's active as manufactured */
@@ -2375,10 +2323,7 @@ static int opal_activate_user(struct opal_dev *dev,
        }
 
        mutex_lock(&dev->dev_lock);
-       setup_opal_dev(dev, act_funcs);
-       dev->func_data = data;
-       dev->func_data[1] = &opal_session->opal_key;
-       dev->func_data[2] = opal_session;
+       setup_opal_dev(dev, act_steps);
        ret = next(dev);
        mutex_unlock(&dev->dev_lock);
        return ret;
@@ -2387,7 +2332,6 @@ static int opal_activate_user(struct opal_dev *dev,
 bool opal_unlock_from_suspend(struct opal_dev *dev)
 {
        struct opal_suspend_data *suspend;
-       void *func_data[3] = { NULL };
        bool was_failure = false;
        int ret = 0;
 
@@ -2398,19 +2342,12 @@ bool opal_unlock_from_suspend(struct opal_dev *dev)
 
        mutex_lock(&dev->dev_lock);
        setup_opal_dev(dev, NULL);
-       dev->func_data = func_data;
 
        list_for_each_entry(suspend, &dev->unlk_lst, node) {
-               dev->state = 0;
-               dev->func_data[1] = &suspend->unlk.session;
-               dev->func_data[2] = &suspend->unlk;
                dev->tsn = 0;
                dev->hsn = 0;
 
-               if (suspend->unlk.session.sum)
-                       ret = __opal_lock_unlock_sum(dev);
-               else
-                       ret = __opal_lock_unlock(dev);
+               ret = __opal_lock_unlock(dev, &suspend->unlk);
                if (ret) {
                        pr_warn("Failed to unlock LR %hhu with sum %d\n",
                                suspend->unlk.session.opal_key.lr,
@@ -2437,7 +2374,7 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
                return -ENOTSUPP;
        }
 
-       p = memdup_user(arg,  _IOC_SIZE(cmd));
+       p = memdup_user(arg, _IOC_SIZE(cmd));
        if (IS_ERR(p))
                return PTR_ERR(p);
 
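(Editor's note: the sed-opal.c hunks above all implement one refactor: the parallel dev->funcs[] / dev->func_data[] / dev->state trio is replaced by a single table of struct opal_step entries, each pairing a step function with the data it needs, and every step function now receives that data as a second argument. Below is a minimal, self-contained userspace sketch of the same table-driven pattern; the names here, such as run_steps() and set_locking_range(), are illustrative only and are not the kernel API.)

    #include <stdio.h>

    struct dev;                                 /* stand-in for struct opal_dev */
    typedef int (*step_fn)(struct dev *dev, void *data);

    struct step {
            step_fn fn;     /* NULL fn terminates the table */
            void *data;     /* per-step argument, may be NULL */
    };

    struct dev {
            const struct step *steps;   /* replaces funcs[] + func_data[] + state */
            int lr;                     /* example of device state */
    };

    static int discovery(struct dev *dev, void *data)
    {
            (void)dev;
            (void)data;                 /* this step needs no argument */
            printf("discovery\n");
            return 0;
    }

    static int set_locking_range(struct dev *dev, void *data)
    {
            dev->lr = *(int *)data;     /* argument comes from the step table */
            printf("locking range = %d\n", dev->lr);
            return 0;
    }

    /* Walk the table until the NULL terminator or the first error. */
    static int run_steps(struct dev *dev)
    {
            const struct step *s;
            int err = 0;

            for (s = dev->steps; s->fn && !err; s++)
                    err = s->fn(dev, s->data);
            return err;
    }

    int main(void)
    {
            int lr = 2;
            const struct step steps[] = {
                    { discovery, NULL },
                    { set_locking_range, &lr },
                    { NULL, NULL }
            };
            struct dev dev = { steps, 0 };

            return run_steps(&dev);
    }

(Keeping the data pointer in the step table, as the patch does, removes the need to index func_data[] by a separate state counter, so a step can no longer read another step's argument by mistake.)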
diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c
index a18de9d727b096cac76b14c64cb6991608d02c30..01a1f7e249782499e66712ccf3abd854344bc5dc 100644
  *    02111-1307, USA.
  *
  *    Questions/Comments/Bugfixes to iss_storagedev@hp.com
- *    
+ *
  *    Author: Stephen M. Cameron
  */
 #ifdef CONFIG_CISS_SCSI_TAPE
 
-/* Here we have code to present the driver as a scsi driver 
-   as it is simultaneously presented as a block driver.  The 
+/* Here we have code to present the driver as a scsi driver
+   as it is simultaneously presented as a block driver.  The
    reason for doing this is to allow access to SCSI tape drives
-   through the array controller.  Note in particular, neither 
+   through the array controller.  Note in particular, neither
    physical nor logical disks are presented through the scsi layer. */
 
 #include <linux/timer.h>
@@ -37,7 +37,7 @@
 
 #include <scsi/scsi_cmnd.h>
 #include <scsi/scsi_device.h>
-#include <scsi/scsi_host.h> 
+#include <scsi/scsi_host.h>
 
 #include "cciss_scsi.h"
 
@@ -120,7 +120,7 @@ struct cciss_scsi_adapter_data_t {
        struct cciss_scsi_cmd_stack_t cmd_stack;
        SGDescriptor_struct **cmd_sg_list;
        int registered;
-       spinlock_t lock; // to protect ccissscsi[ctlr]; 
+       spinlock_t lock; // to protect ccissscsi[ctlr];
 };
 
 #define CPQ_TAPE_LOCK(h, flags) spin_lock_irqsave( \
@@ -143,36 +143,36 @@ scsi_cmd_alloc(ctlr_info_t *h)
        u64bit temp64;
 
        sa = h->scsi_ctlr;
-       stk = &sa->cmd_stack; 
+       stk = &sa->cmd_stack;
 
-       if (stk->top < 0) 
+       if (stk->top < 0)
                return NULL;
-       c = stk->elem[stk->top];        
+       c = stk->elem[stk->top];
        /* memset(c, 0, sizeof(*c)); */
        memset(&c->cmd, 0, sizeof(c->cmd));
        memset(&c->Err, 0, sizeof(c->Err));
        /* set physical addr of cmd and addr of scsi parameters */
-       c->cmd.busaddr = c->busaddr; 
+       c->cmd.busaddr = c->busaddr;
        c->cmd.cmdindex = c->cmdindex;
-       /* (__u32) (stk->cmd_pool_handle + 
+       /* (__u32) (stk->cmd_pool_handle +
                (sizeof(struct cciss_scsi_cmd_stack_elem_t)*stk->top)); */
 
        temp64.val = (__u64) (c->busaddr + sizeof(CommandList_struct));
-       /* (__u64) (stk->cmd_pool_handle + 
+       /* (__u64) (stk->cmd_pool_handle +
                (sizeof(struct cciss_scsi_cmd_stack_elem_t)*stk->top) +
                 sizeof(CommandList_struct)); */
        stk->top--;
        c->cmd.ErrDesc.Addr.lower = temp64.val32.lower;
        c->cmd.ErrDesc.Addr.upper = temp64.val32.upper;
        c->cmd.ErrDesc.Len = sizeof(ErrorInfo_struct);
-       
+
        c->cmd.ctlr = h->ctlr;
        c->cmd.err_info = &c->Err;
 
        return (CommandList_struct *) c;
 }
 
-static void 
+static void
 scsi_cmd_free(ctlr_info_t *h, CommandList_struct *c)
 {
        /* assume only one process in here at a time, locking done by caller. */
@@ -183,7 +183,7 @@ scsi_cmd_free(ctlr_info_t *h, CommandList_struct *c)
        struct cciss_scsi_cmd_stack_t *stk;
 
        sa = h->scsi_ctlr;
-       stk = &sa->cmd_stack; 
+       stk = &sa->cmd_stack;
        stk->top++;
        if (stk->top >= stk->nelems) {
                dev_err(&h->pdev->dev,
@@ -228,7 +228,7 @@ scsi_cmd_stack_setup(ctlr_info_t *h, struct cciss_scsi_adapter_data_t *sa)
        }
        for (i = 0; i < stk->nelems; i++) {
                stk->elem[i] = &stk->pool[i];
-               stk->elem[i]->busaddr = (__u32) (stk->cmd_pool_handle + 
+               stk->elem[i]->busaddr = (__u32) (stk->cmd_pool_handle +
                        (sizeof(struct cciss_scsi_cmd_stack_elem_t) * i));
                stk->elem[i]->cmdindex = i;
        }
@@ -244,7 +244,7 @@ scsi_cmd_stack_free(ctlr_info_t *h)
        size_t size;
 
        sa = h->scsi_ctlr;
-       stk = &sa->cmd_stack; 
+       stk = &sa->cmd_stack;
        if (stk->top != stk->nelems-1) {
                dev_warn(&h->pdev->dev,
                        "bug: %d scsi commands are still outstanding.\n",
@@ -266,7 +266,7 @@ print_cmd(CommandList_struct *cp)
        printk("queue:%d\n", cp->Header.ReplyQueue);
        printk("sglist:%d\n", cp->Header.SGList);
        printk("sgtot:%d\n", cp->Header.SGTotal);
-       printk("Tag:0x%08x/0x%08x\n", cp->Header.Tag.upper, 
+       printk("Tag:0x%08x/0x%08x\n", cp->Header.Tag.upper,
                        cp->Header.Tag.lower);
        printk("LUN:0x%8phN\n", cp->Header.LUN.LunAddrBytes);
        printk("CDBLen:%d\n", cp->Request.CDBLen);
@@ -275,8 +275,8 @@ print_cmd(CommandList_struct *cp)
        printk(" Dir:%d\n",cp->Request.Type.Direction);
        printk("Timeout:%d\n",cp->Request.Timeout);
        printk("CDB: %16ph\n", cp->Request.CDB);
-       printk("edesc.Addr: 0x%08x/0%08x, Len  = %d\n", 
-               cp->ErrDesc.Addr.upper, cp->ErrDesc.Addr.lower, 
+       printk("edesc.Addr: 0x%08x/0%08x, Len  = %d\n",
+               cp->ErrDesc.Addr.upper, cp->ErrDesc.Addr.lower,
                        cp->ErrDesc.Len);
        printk("sgs..........Errorinfo:\n");
        printk("scsistatus:%d\n", cp->err_info->ScsiStatus);
@@ -289,7 +289,7 @@ print_cmd(CommandList_struct *cp)
 }
 #endif
 
-static int 
+static int
 find_bus_target_lun(ctlr_info_t *h, int *bus, int *target, int *lun)
 {
        /* finds an unused bus, target, lun for a new device */
@@ -299,24 +299,24 @@ find_bus_target_lun(ctlr_info_t *h, int *bus, int *target, int *lun)
 
        memset(&target_taken[0], 0, CCISS_MAX_SCSI_DEVS_PER_HBA);
 
-       target_taken[SELF_SCSI_ID] = 1; 
+       target_taken[SELF_SCSI_ID] = 1;
        for (i = 0; i < ccissscsi[h->ctlr].ndevices; i++)
                target_taken[ccissscsi[h->ctlr].dev[i].target] = 1;
-       
+
        for (i = 0; i < CCISS_MAX_SCSI_DEVS_PER_HBA; i++) {
                if (!target_taken[i]) {
                        *bus = 0; *target=i; *lun = 0; found=1;
                        break;
                }
        }
-       return (!found);        
+       return (!found);
 }
 struct scsi2map {
        char scsi3addr[8];
        int bus, target, lun;
 };
 
-static int 
+static int
 cciss_scsi_add_entry(ctlr_info_t *h, int hostno,
                struct cciss_scsi_dev_t *device,
                struct scsi2map *added, int *nadded)
@@ -381,8 +381,8 @@ cciss_scsi_add_entry(ctlr_info_t *h, int hostno,
 
        ccissscsi[h->ctlr].ndevices++;
 
-       /* initially, (before registering with scsi layer) we don't 
-          know our hostno and we don't want to print anything first 
+       /* initially, (before registering with scsi layer) we don't
+          know our hostno and we don't want to print anything first
           time anyway (the scsi layer's inquiries will show that info) */
        if (hostno != -1)
                dev_info(&h->pdev->dev, "%s device c%db%dt%dl%d added.\n",
@@ -467,7 +467,7 @@ adjust_cciss_scsi_table(ctlr_info_t *h, int hostno,
        /* sd contains scsi3 addresses and devtypes, but
           bus target and lun are not filled in.  This funciton
           takes what's in sd to be the current and adjusts
-          ccissscsi[] to be in line with what's in sd. */ 
+          ccissscsi[] to be in line with what's in sd. */
 
        int i,j, found, changes=0;
        struct cciss_scsi_dev_t *csd;
@@ -492,7 +492,7 @@ adjust_cciss_scsi_table(ctlr_info_t *h, int hostno,
        if (hostno != -1)  /* if it's not the first time... */
                sh = h->scsi_ctlr->scsi_host;
 
-       /* find any devices in ccissscsi[] that are not in 
+       /* find any devices in ccissscsi[] that are not in
           sd[] and remove them from ccissscsi[] */
 
        i = 0;
@@ -512,7 +512,7 @@ adjust_cciss_scsi_table(ctlr_info_t *h, int hostno,
                        }
                }
 
-               if (found == 0) { /* device no longer present. */ 
+               if (found == 0) { /* device no longer present. */
                        changes++;
                        cciss_scsi_remove_entry(h, hostno, i,
                                removed, &nremoved);
@@ -641,14 +641,13 @@ lookup_scsi3addr(ctlr_info_t *h, int bus, int target, int lun, char *scsi3addr)
        return -1;
 }
 
-static void 
+static void
 cciss_scsi_setup(ctlr_info_t *h)
 {
        struct cciss_scsi_adapter_data_t * shba;
 
        ccissscsi[h->ctlr].ndevices = 0;
-       shba = (struct cciss_scsi_adapter_data_t *)
-               kmalloc(sizeof(*shba), GFP_KERNEL);     
+       shba = kmalloc(sizeof(*shba), GFP_KERNEL);
        if (shba == NULL)
                return;
        shba->scsi_host = NULL;
@@ -693,20 +692,18 @@ static void complete_scsi_command(CommandList_struct *c, int timeout,
 
        /* copy the sense data whether we need to or not. */
 
-       memcpy(cmd->sense_buffer, ei->SenseInfo, 
+       memcpy(cmd->sense_buffer, ei->SenseInfo,
                ei->SenseLen > SCSI_SENSE_BUFFERSIZE ?
-                       SCSI_SENSE_BUFFERSIZE : 
+                       SCSI_SENSE_BUFFERSIZE :
                        ei->SenseLen);
        scsi_set_resid(cmd, ei->ResidualCnt);
 
-       if(ei->CommandStatus != 0) 
-       { /* an error has occurred */ 
-               switch(ei->CommandStatus)
-               {
+       if (ei->CommandStatus != 0) { /* an error has occurred */
+               switch (ei->CommandStatus) {
                        case CMD_TARGET_STATUS:
                                /* Pass it up to the upper layers... */
                                if (!ei->ScsiStatus) {
-                                       
+
        /* Ordinarily, this case should never happen, but there is a bug
           in some released firmware revisions that allows it to happen
           if, for example, a 4100 backplane loses power and the tape
@@ -731,7 +728,7 @@ static void complete_scsi_command(CommandList_struct *c, int timeout,
                                print_cmd(c);
                                 */
      /* We get CMD_INVALID if you address a non-existent tape drive instead
-       of a selection timeout (no response).  You will see this if you yank 
+       of a selection timeout (no response).  You will see this if you yank
        out a tape drive, then try to access it. This is kind of a shame
        because it means that any other CMD_INVALID (e.g. driver bug) will
        get interpreted as a missing target. */
@@ -780,7 +777,7 @@ static void complete_scsi_command(CommandList_struct *c, int timeout,
                                cmd->result = DID_ERROR << 16;
                                dev_warn(&h->pdev->dev,
                                        "%p returned unknown status %x\n", c,
-                                               ei->CommandStatus); 
+                                               ei->CommandStatus);
                }
        }
        cmd->scsi_done(cmd);
@@ -796,15 +793,15 @@ cciss_scsi_detect(ctlr_info_t *h)
        sh = scsi_host_alloc(&cciss_driver_template, sizeof(struct ctlr_info *));
        if (sh == NULL)
                goto fail;
-       sh->io_port = 0;        // good enough?  FIXME, 
+       sh->io_port = 0;        // good enough?  FIXME,
        sh->n_io_port = 0;      // I don't think we use these two...
-       sh->this_id = SELF_SCSI_ID;  
+       sh->this_id = SELF_SCSI_ID;
        sh->can_queue = cciss_tape_cmds;
        sh->sg_tablesize = h->maxsgentries;
        sh->max_cmd_len = MAX_COMMAND_SIZE;
        sh->max_sectors = h->cciss_max_sectors;
 
-       ((struct cciss_scsi_adapter_data_t *) 
+       ((struct cciss_scsi_adapter_data_t *)
                h->scsi_ctlr)->scsi_host = sh;
        sh->hostdata[0] = (unsigned long) h;
        sh->irq = h->intr[SIMPLE_MODE_INT];
@@ -856,7 +853,7 @@ cciss_map_one(struct pci_dev *pdev,
 static int
 cciss_scsi_do_simple_cmd(ctlr_info_t *h,
                        CommandList_struct *c,
-                       unsigned char *scsi3addr, 
+                       unsigned char *scsi3addr,
                        unsigned char *cdb,
                        unsigned char cdblen,
                        unsigned char *buf, int bufsize,
@@ -871,7 +868,7 @@ cciss_scsi_do_simple_cmd(ctlr_info_t *h,
        c->Header.Tag.lower = c->busaddr;  /* Use k. address of cmd as tag */
        // Fill in the request block...
 
-       /* printk("Using scsi3addr 0x%02x%0x2%0x2%0x2%0x2%0x2%0x2%0x2\n", 
+       /* printk("Using scsi3addr 0x%02x%0x2%0x2%0x2%0x2%0x2%0x2%0x2\n",
                scsi3addr[0], scsi3addr[1], scsi3addr[2], scsi3addr[3],
                scsi3addr[4], scsi3addr[5], scsi3addr[6], scsi3addr[7]); */
 
@@ -885,7 +882,7 @@ cciss_scsi_do_simple_cmd(ctlr_info_t *h,
 
        /* Fill in the SG list and do dma mapping */
        cciss_map_one(h->pdev, c, (unsigned char *) buf,
-                       bufsize, DMA_FROM_DEVICE); 
+                       bufsize, DMA_FROM_DEVICE);
 
        c->waiting = &wait;
        enqueue_cmd_and_start_io(h, c);
@@ -896,14 +893,13 @@ cciss_scsi_do_simple_cmd(ctlr_info_t *h,
        return(0);
 }
 
-static void 
+static void
 cciss_scsi_interpret_error(ctlr_info_t *h, CommandList_struct *c)
 {
        ErrorInfo_struct *ei;
 
        ei = c->err_info;
-       switch(ei->CommandStatus)
-       {
+       switch (ei->CommandStatus) {
                case CMD_TARGET_STATUS:
                        dev_warn(&h->pdev->dev,
                                "cmd %p has completed with errors\n", c);
@@ -1005,7 +1001,7 @@ cciss_scsi_do_inquiry(ctlr_info_t *h, unsigned char *scsi3addr,
 
        if (rc != 0) return rc; /* something went wrong */
 
-       if (ei->CommandStatus != 0 && 
+       if (ei->CommandStatus != 0 &&
            ei->CommandStatus != CMD_DATA_UNDERRUN) {
                cciss_scsi_interpret_error(h, c);
                rc = -1;
@@ -1013,7 +1009,7 @@ cciss_scsi_do_inquiry(ctlr_info_t *h, unsigned char *scsi3addr,
        spin_lock_irqsave(&h->lock, flags);
        scsi_cmd_free(h, c);
        spin_unlock_irqrestore(&h->lock, flags);
-       return rc;      
+       return rc;
 }
 
 /* Get the device id from inquiry page 0x83 */
@@ -1042,7 +1038,7 @@ cciss_scsi_do_report_phys_luns(ctlr_info_t *h,
        int rc;
        CommandList_struct *c;
        unsigned char cdb[12];
-       unsigned char scsi3addr[8]; 
+       unsigned char scsi3addr[8];
        ErrorInfo_struct *ei;
        unsigned long flags;
 
@@ -1069,14 +1065,14 @@ cciss_scsi_do_report_phys_luns(ctlr_info_t *h,
        cdb[11] = 0;
 
        rc = cciss_scsi_do_simple_cmd(h, c, scsi3addr,
-                               cdb, 12, 
-                               (unsigned char *) buf, 
+                               cdb, 12,
+                               (unsigned char *) buf,
                                bufsize, XFER_READ);
 
        if (rc != 0) return rc; /* something went wrong */
 
        ei = c->err_info;
-       if (ei->CommandStatus != 0 && 
+       if (ei->CommandStatus != 0 &&
            ei->CommandStatus != CMD_DATA_UNDERRUN) {
                cciss_scsi_interpret_error(h, c);
                rc = -1;
@@ -1084,36 +1080,36 @@ cciss_scsi_do_report_phys_luns(ctlr_info_t *h,
        spin_lock_irqsave(&h->lock, flags);
        scsi_cmd_free(h, c);
        spin_unlock_irqrestore(&h->lock, flags);
-       return rc;      
+       return rc;
 }
 
 static void
 cciss_update_non_disk_devices(ctlr_info_t *h, int hostno)
 {
        /* the idea here is we could get notified from /proc
-          that some devices have changed, so we do a report 
-          physical luns cmd, and adjust our list of devices 
+          that some devices have changed, so we do a report
+          physical luns cmd, and adjust our list of devices
           accordingly.  (We can't rely on the scsi-mid layer just
-          doing inquiries, because the "busses" that the scsi 
+          doing inquiries, because the "busses" that the scsi
           mid-layer probes are totally fabricated by this driver,
           so new devices wouldn't show up.
 
-          the scsi3addr's of devices won't change so long as the 
-          adapter is not reset.  That means we can rescan and 
-          tell which devices we already know about, vs. new 
+          the scsi3addr's of devices won't change so long as the
+          adapter is not reset.  That means we can rescan and
+          tell which devices we already know about, vs. new
           devices, vs.  disappearing devices.
 
           Also, if you yank out a tape drive, then put in a disk
-          in it's place, (say, a configured volume from another 
-          array controller for instance)  _don't_ poke this driver 
-           (so it thinks it's still a tape, but _do_ poke the scsi 
-           mid layer, so it does an inquiry... the scsi mid layer 
+          in it's place, (say, a configured volume from another
+          array controller for instance)  _don't_ poke this driver
+           (so it thinks it's still a tape, but _do_ poke the scsi
+           mid layer, so it does an inquiry... the scsi mid layer
            will see the physical disk.  This would be bad.  Need to
-          think about how to prevent that.  One idea would be to 
+          think about how to prevent that.  One idea would be to
           snoop all scsi responses and if an inquiry repsonse comes
           back that reports a disk, chuck it an return selection
           timeout instead and adjust our table...  Not sure i like
-          that though.  
+          that though.
 
         */
 #define OBDR_TAPE_INQ_SIZE 49
@@ -1141,9 +1137,9 @@ cciss_update_non_disk_devices(ctlr_info_t *h, int hostno)
                ch = &ld_buff->LUNListLength[0];
                num_luns = ((ch[0]<<24) | (ch[1]<<16) | (ch[2]<<8) | ch[3]) / 8;
                if (num_luns > CISS_MAX_PHYS_LUN) {
-                       printk(KERN_WARNING 
+                       printk(KERN_WARNING
                                "cciss: Maximum physical LUNs (%d) exceeded.  "
-                               "%d LUNs ignored.\n", CISS_MAX_PHYS_LUN, 
+                               "%d LUNs ignored.\n", CISS_MAX_PHYS_LUN,
                                num_luns - CISS_MAX_PHYS_LUN);
                        num_luns = CISS_MAX_PHYS_LUN;
                }
@@ -1154,7 +1150,7 @@ cciss_update_non_disk_devices(ctlr_info_t *h, int hostno)
        }
 
 
-       /* adjust our table of devices */       
+       /* adjust our table of devices */
        for (i = 0; i < num_luns; i++) {
                /* for each physical lun, do an inquiry */
                if (ld_buff->LUN[i][3] & 0xC0) continue;
@@ -1182,8 +1178,7 @@ cciss_update_non_disk_devices(ctlr_info_t *h, int hostno)
                cciss_scsi_get_device_id(h, scsi3addr,
                        this_device->device_id, sizeof(this_device->device_id));
 
-               switch (this_device->devtype)
-               {
+               switch (this_device->devtype) {
                  case 0x05: /* CD-ROM */ {
 
                        /* We don't *really* support actual CD-ROM devices,
@@ -1213,7 +1208,7 @@ cciss_update_non_disk_devices(ctlr_info_t *h, int hostno)
                        currentsd[ncurrent] = *this_device;
                        ncurrent++;
                        break;
-                 default: 
+                 default:
                        break;
                }
        }
@@ -1258,8 +1253,8 @@ cciss_scsi_write_info(struct Scsi_Host *sh,
                return -EINVAL;
 
        return cciss_scsi_user_command(h, sh->host_no,
-                       buffer, length);        
-} 
+                       buffer, length);
+}
 
 static int
 cciss_scsi_show_info(struct seq_file *m, struct Scsi_Host *sh)
@@ -1297,8 +1292,8 @@ cciss_scsi_show_info(struct seq_file *m, struct Scsi_Host *sh)
        return 0;
 }
 
-/* cciss_scatter_gather takes a struct scsi_cmnd, (cmd), and does the pci 
-   dma mapping  and fills in the scatter gather entries of the 
+/* cciss_scatter_gather takes a struct scsi_cmnd, (cmd), and does the pci
+   dma mapping  and fills in the scatter gather entries of the
    cciss command, c. */
 
 static void cciss_scatter_gather(ctlr_info_t *h, CommandList_struct *c,
@@ -1394,7 +1389,7 @@ cciss_scsi_queue_command_lck(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmn
 
        // Fill in the command list header
 
-       cmd->scsi_done = done;    // save this for use by completion code 
+       cmd->scsi_done = done;    // save this for use by completion code
 
        /* save c in case we have to abort it */
        cmd->host_scribble = (unsigned char *) c;
@@ -1404,7 +1399,7 @@ cciss_scsi_queue_command_lck(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmn
        c->Header.ReplyQueue = 0;  /* unused in simple mode */
        memcpy(&c->Header.LUN.LunAddrBytes[0], &scsi3addr[0], 8);
        c->Header.Tag.lower = c->busaddr;  /* Use k. address of cmd as tag */
-       
+
        // Fill in the request block...
 
        c->Request.Timeout = 0;
@@ -1414,8 +1409,7 @@ cciss_scsi_queue_command_lck(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmn
        memcpy(c->Request.CDB, cmd->cmnd, cmd->cmd_len);
        c->Request.Type.Type = TYPE_CMD;
        c->Request.Type.Attribute = ATTR_SIMPLE;
-       switch(cmd->sc_data_direction)
-       {
+       switch (cmd->sc_data_direction) {
          case DMA_TO_DEVICE:
                c->Request.Type.Direction = XFER_WRITE;
                break;
@@ -1432,15 +1426,15 @@ cciss_scsi_queue_command_lck(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmn
 
                c->Request.Type.Direction = XFER_RSVD;
                // This is technically wrong, and cciss controllers should
-               // reject it with CMD_INVALID, which is the most correct 
-               // response, but non-fibre backends appear to let it 
+               // reject it with CMD_INVALID, which is the most correct
+               // response, but non-fibre backends appear to let it
                // slide by, and give the same results as if this field
                // were set correctly.  Either way is acceptable for
                // our purposes here.
 
                break;
 
-         default: 
+         default:
                dev_warn(&h->pdev->dev, "unknown data direction: %d\n",
                        cmd->sc_data_direction);
                BUG();
@@ -1464,9 +1458,9 @@ static void cciss_unregister_scsi(ctlr_info_t *h)
 
        spin_lock_irqsave(&h->lock, flags);
        sa = h->scsi_ctlr;
-       stk = &sa->cmd_stack; 
+       stk = &sa->cmd_stack;
 
-       /* if we weren't ever actually registered, don't unregister */ 
+       /* if we weren't ever actually registered, don't unregister */
        if (sa->registered) {
                spin_unlock_irqrestore(&h->lock, flags);
                scsi_remove_host(sa->scsi_host);
@@ -1474,7 +1468,7 @@ static void cciss_unregister_scsi(ctlr_info_t *h)
                spin_lock_irqsave(&h->lock, flags);
        }
 
-       /* set scsi_host to NULL so our detect routine will 
+       /* set scsi_host to NULL so our detect routine will
           find us on register */
        sa->scsi_host = NULL;
        spin_unlock_irqrestore(&h->lock, flags);
@@ -1490,7 +1484,7 @@ static int cciss_engage_scsi(ctlr_info_t *h)
 
        spin_lock_irqsave(&h->lock, flags);
        sa = h->scsi_ctlr;
-       stk = &sa->cmd_stack; 
+       stk = &sa->cmd_stack;
 
        if (sa->registered) {
                dev_info(&h->pdev->dev, "SCSI subsystem already engaged.\n");
@@ -1586,13 +1580,13 @@ retry_tur:
        return rc;
 }
 
-/* Need at least one of these error handlers to keep ../scsi/hosts.c from 
- * complaining.  Doing a host- or bus-reset can't do anything good here. 
+/* Need at least one of these error handlers to keep ../scsi/hosts.c from
+ * complaining.  Doing a host- or bus-reset can't do anything good here.
  * Despite what it might say in scsi_error.c, there may well be commands
  * on the controller, as the cciss driver registers twice, once as a block
  * device for the logical drives, and once as a scsi device, for any tape
  * drives.  So we know there are no commands out on the tape drives, but we
- * don't know there are no commands on the controller, and it is likely 
+ * don't know there are no commands on the controller, and it is likely
  * that there probably are, as the cciss block device is most commonly used
  * as a boot device (embedded controller on HP/Compaq systems.)
 */
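(Editor's note: the cciss_scsi.c hunks above are almost entirely whitespace and brace-style cleanups; the one substantive change is dropping the redundant cast on the kmalloc() return value in cciss_scsi_setup(). Since void * converts implicitly to any object pointer in C, the cast adds nothing and can hide a missing declaration. A tiny illustration of the idiom follows, using userspace malloc() purely as a stand-in for kmalloc(); the struct name is invented for the example.)

    #include <stdlib.h>

    struct adapter_data {
            void *scsi_host;
            int registered;
    };

    int main(void)
    {
            /* Preferred: no cast, malloc()'s void * converts implicitly.   */
            /* Discouraged: (struct adapter_data *)malloc(sizeof(*shba)),   */
            /* which is redundant and can mask a missing prototype.         */
            struct adapter_data *shba = malloc(sizeof(*shba));

            if (shba == NULL)
                    return 1;
            shba->scsi_host = NULL;
            shba->registered = 0;
            free(shba);
            return 0;
    }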
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 0be84a3cb6d7bbb605a252ff5666c1642a1c201a..0bf2b21a62cb770a3129889b59a0b5f735eddc52 100644
@@ -96,6 +96,10 @@ static int max_part;
 static struct workqueue_struct *recv_workqueue;
 static int part_shift;
 
+static int nbd_dev_dbg_init(struct nbd_device *nbd);
+static void nbd_dev_dbg_close(struct nbd_device *nbd);
+
+
 static inline struct device *nbd_to_dev(struct nbd_device *nbd)
 {
        return disk_to_dev(nbd->disk);
@@ -120,7 +124,7 @@ static const char *nbdcmd_to_ascii(int cmd)
 
 static int nbd_size_clear(struct nbd_device *nbd, struct block_device *bdev)
 {
-       bdev->bd_inode->i_size = 0;
+       bd_set_size(bdev, 0);
        set_capacity(nbd->disk, 0);
        kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
 
@@ -129,29 +133,20 @@ static int nbd_size_clear(struct nbd_device *nbd, struct block_device *bdev)
 
 static void nbd_size_update(struct nbd_device *nbd, struct block_device *bdev)
 {
-       if (!nbd_is_connected(nbd))
-               return;
-
-       bdev->bd_inode->i_size = nbd->bytesize;
+       blk_queue_logical_block_size(nbd->disk->queue, nbd->blksize);
+       blk_queue_physical_block_size(nbd->disk->queue, nbd->blksize);
+       bd_set_size(bdev, nbd->bytesize);
        set_capacity(nbd->disk, nbd->bytesize >> 9);
        kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
 }
 
-static int nbd_size_set(struct nbd_device *nbd, struct block_device *bdev,
+static void nbd_size_set(struct nbd_device *nbd, struct block_device *bdev,
                        loff_t blocksize, loff_t nr_blocks)
 {
-       int ret;
-
-       ret = set_blocksize(bdev, blocksize);
-       if (ret)
-               return ret;
-
        nbd->blksize = blocksize;
        nbd->bytesize = blocksize * nr_blocks;
-
-       nbd_size_update(nbd, bdev);
-
-       return 0;
+       if (nbd_is_connected(nbd))
+               nbd_size_update(nbd, bdev);
 }
 
 static void nbd_end_request(struct nbd_cmd *cmd)
@@ -571,10 +566,17 @@ static int nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
        return BLK_MQ_RQ_QUEUE_OK;
 }
 
-static int nbd_add_socket(struct nbd_device *nbd, struct socket *sock)
+static int nbd_add_socket(struct nbd_device *nbd, struct block_device *bdev,
+                         unsigned long arg)
 {
+       struct socket *sock;
        struct nbd_sock **socks;
        struct nbd_sock *nsock;
+       int err;
+
+       sock = sockfd_lookup(arg, &err);
+       if (!sock)
+               return err;
 
        if (!nbd->task_setup)
                nbd->task_setup = current;
@@ -598,26 +600,20 @@ static int nbd_add_socket(struct nbd_device *nbd, struct socket *sock)
        nsock->sock = sock;
        socks[nbd->num_connections++] = nsock;
 
+       if (max_part)
+               bdev->bd_invalidated = 1;
        return 0;
 }
 
 /* Reset all properties of an NBD device */
 static void nbd_reset(struct nbd_device *nbd)
 {
-       int i;
-
-       for (i = 0; i < nbd->num_connections; i++)
-               kfree(nbd->socks[i]);
-       kfree(nbd->socks);
-       nbd->socks = NULL;
        nbd->runtime_flags = 0;
        nbd->blksize = 1024;
        nbd->bytesize = 0;
        set_capacity(nbd->disk, 0);
        nbd->flags = 0;
        nbd->tag_set.timeout = 0;
-       nbd->num_connections = 0;
-       nbd->task_setup = NULL;
        queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
 }
 
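(Editor's note: the nbd.c hunk that follows splits the body of each __nbd_ioctl case out into a named helper -- nbd_disconnect(), nbd_clear_sock(), nbd_start_device(), plus a bdev-aware nbd_add_socket() -- leaving the ioctl switch as a thin dispatcher. Below is a rough, self-contained sketch of that dispatch shape; the command names and helpers are purely illustrative and are not the real NBD ioctl interface.)

    #include <stdio.h>

    struct nbd;                                  /* opaque device stand-in */

    enum nbd_cmd { CMD_DISCONNECT, CMD_CLEAR_SOCK, CMD_SET_SOCK };

    static int do_disconnect(struct nbd *nbd)
    {
            (void)nbd;
            puts("disconnect");
            return 0;
    }

    static int do_clear_sock(struct nbd *nbd)
    {
            (void)nbd;
            puts("clear socket");
            return 0;
    }

    static int do_set_sock(struct nbd *nbd, long fd)
    {
            (void)nbd;
            printf("add socket from fd %ld\n", fd);
            return 0;
    }

    /* The ioctl switch shrinks to one call per command. */
    static int dispatch(struct nbd *nbd, enum nbd_cmd cmd, long arg)
    {
            switch (cmd) {
            case CMD_DISCONNECT:
                    return do_disconnect(nbd);
            case CMD_CLEAR_SOCK:
                    return do_clear_sock(nbd);
            case CMD_SET_SOCK:
                    return do_set_sock(nbd, arg);
            }
            return -1;
    }

    int main(void)
    {
            return dispatch(NULL, CMD_SET_SOCK, 3);
    }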
@@ -659,81 +655,143 @@ static void send_disconnects(struct nbd_device *nbd)
        }
 }
 
-static int nbd_dev_dbg_init(struct nbd_device *nbd);
-static void nbd_dev_dbg_close(struct nbd_device *nbd);
+static int nbd_disconnect(struct nbd_device *nbd, struct block_device *bdev)
+{
+       dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
+       if (!nbd->socks)
+               return -EINVAL;
 
-/* Must be called with config_lock held */
-static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
-                      unsigned int cmd, unsigned long arg)
+       mutex_unlock(&nbd->config_lock);
+       fsync_bdev(bdev);
+       mutex_lock(&nbd->config_lock);
+
+       /* Check again after getting mutex back.  */
+       if (!nbd->socks)
+               return -EINVAL;
+
+       if (!test_and_set_bit(NBD_DISCONNECT_REQUESTED,
+                             &nbd->runtime_flags))
+               send_disconnects(nbd);
+       return 0;
+}
+
+static int nbd_clear_sock(struct nbd_device *nbd, struct block_device *bdev)
 {
-       switch (cmd) {
-       case NBD_DISCONNECT: {
-               dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
-               if (!nbd->socks)
-                       return -EINVAL;
-
-               mutex_unlock(&nbd->config_lock);
-               fsync_bdev(bdev);
-               mutex_lock(&nbd->config_lock);
-
-               /* Check again after getting mutex back.  */
-               if (!nbd->socks)
-                       return -EINVAL;
-
-               if (!test_and_set_bit(NBD_DISCONNECT_REQUESTED,
-                                     &nbd->runtime_flags))
-                       send_disconnects(nbd);
-               return 0;
+       sock_shutdown(nbd);
+       nbd_clear_que(nbd);
+       kill_bdev(bdev);
+       nbd_bdev_reset(bdev);
+       /*
+        * We want to give the run thread a chance to wait for everybody
+        * to clean up and then do it's own cleanup.
+        */
+       if (!test_bit(NBD_RUNNING, &nbd->runtime_flags) &&
+           nbd->num_connections) {
+               int i;
+
+               for (i = 0; i < nbd->num_connections; i++)
+                       kfree(nbd->socks[i]);
+               kfree(nbd->socks);
+               nbd->socks = NULL;
+               nbd->num_connections = 0;
        }
+       nbd->task_setup = NULL;
 
-       case NBD_CLEAR_SOCK:
-               sock_shutdown(nbd);
-               nbd_clear_que(nbd);
-               kill_bdev(bdev);
-               nbd_bdev_reset(bdev);
-               /*
-                * We want to give the run thread a chance to wait for everybody
-                * to clean up and then do it's own cleanup.
-                */
-               if (!test_bit(NBD_RUNNING, &nbd->runtime_flags)) {
-                       int i;
-
-                       for (i = 0; i < nbd->num_connections; i++)
-                               kfree(nbd->socks[i]);
-                       kfree(nbd->socks);
-                       nbd->socks = NULL;
-                       nbd->num_connections = 0;
-                       nbd->task_setup = NULL;
-               }
-               return 0;
+       return 0;
+}
+
+static int nbd_start_device(struct nbd_device *nbd, struct block_device *bdev)
+{
+       struct recv_thread_args *args;
+       int num_connections = nbd->num_connections;
+       int error = 0, i;
 
-       case NBD_SET_SOCK: {
-               int err;
-               struct socket *sock = sockfd_lookup(arg, &err);
+       if (nbd->task_recv)
+               return -EBUSY;
+       if (!nbd->socks)
+               return -EINVAL;
+       if (num_connections > 1 &&
+           !(nbd->flags & NBD_FLAG_CAN_MULTI_CONN)) {
+               dev_err(disk_to_dev(nbd->disk), "server does not support multiple connections per device.\n");
+               error = -EINVAL;
+               goto out_err;
+       }
 
-               if (!sock)
-                       return err;
+       set_bit(NBD_RUNNING, &nbd->runtime_flags);
+       blk_mq_update_nr_hw_queues(&nbd->tag_set, nbd->num_connections);
+       args = kcalloc(num_connections, sizeof(*args), GFP_KERNEL);
+       if (!args) {
+               error = -ENOMEM;
+               goto out_err;
+       }
+       nbd->task_recv = current;
+       mutex_unlock(&nbd->config_lock);
 
-               err = nbd_add_socket(nbd, sock);
-               if (!err && max_part)
-                       bdev->bd_invalidated = 1;
+       nbd_parse_flags(nbd, bdev);
 
-               return err;
+       error = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
+       if (error) {
+               dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
+               goto out_recv;
        }
 
-       case NBD_SET_BLKSIZE: {
-               loff_t bsize = div_s64(nbd->bytesize, arg);
+       nbd_size_update(nbd, bdev);
 
-               return nbd_size_set(nbd, bdev, arg, bsize);
+       nbd_dev_dbg_init(nbd);
+       for (i = 0; i < num_connections; i++) {
+               sk_set_memalloc(nbd->socks[i]->sock->sk);
+               atomic_inc(&nbd->recv_threads);
+               INIT_WORK(&args[i].work, recv_work);
+               args[i].nbd = nbd;
+               args[i].index = i;
+               queue_work(recv_workqueue, &args[i].work);
        }
+       wait_event_interruptible(nbd->recv_wq,
+                                atomic_read(&nbd->recv_threads) == 0);
+       for (i = 0; i < num_connections; i++)
+               flush_work(&args[i].work);
+       nbd_dev_dbg_close(nbd);
+       nbd_size_clear(nbd, bdev);
+       device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
+out_recv:
+       mutex_lock(&nbd->config_lock);
+       nbd->task_recv = NULL;
+out_err:
+       clear_bit(NBD_RUNNING, &nbd->runtime_flags);
+       nbd_clear_sock(nbd, bdev);
 
-       case NBD_SET_SIZE:
-               return nbd_size_set(nbd, bdev, nbd->blksize,
-                                       div_s64(arg, nbd->blksize));
+       /* user requested, ignore socket errors */
+       if (test_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags))
+               error = 0;
+       if (test_bit(NBD_TIMEDOUT, &nbd->runtime_flags))
+               error = -ETIMEDOUT;
 
-       case NBD_SET_SIZE_BLOCKS:
-               return nbd_size_set(nbd, bdev, nbd->blksize, arg);
+       nbd_reset(nbd);
+       return error;
+}
 
+/* Must be called with config_lock held */
+static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
+                      unsigned int cmd, unsigned long arg)
+{
+       switch (cmd) {
+       case NBD_DISCONNECT:
+               return nbd_disconnect(nbd, bdev);
+       case NBD_CLEAR_SOCK:
+               return nbd_clear_sock(nbd, bdev);
+       case NBD_SET_SOCK:
+               return nbd_add_socket(nbd, bdev, arg);
+       case NBD_SET_BLKSIZE:
+               nbd_size_set(nbd, bdev, arg,
+                            div_s64(nbd->bytesize, arg));
+               return 0;
+       case NBD_SET_SIZE:
+               nbd_size_set(nbd, bdev, nbd->blksize,
+                            div_s64(arg, nbd->blksize));
+               return 0;
+       case NBD_SET_SIZE_BLOCKS:
+               nbd_size_set(nbd, bdev, nbd->blksize, arg);
+               return 0;
        case NBD_SET_TIMEOUT:
                nbd->tag_set.timeout = arg * HZ;
                return 0;
@@ -741,85 +799,14 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
        case NBD_SET_FLAGS:
                nbd->flags = arg;
                return 0;
-
-       case NBD_DO_IT: {
-               struct recv_thread_args *args;
-               int num_connections = nbd->num_connections;
-               int error = 0, i;
-
-               if (nbd->task_recv)
-                       return -EBUSY;
-               if (!nbd->socks)
-                       return -EINVAL;
-               if (num_connections > 1 &&
-                   !(nbd->flags & NBD_FLAG_CAN_MULTI_CONN)) {
-                       dev_err(disk_to_dev(nbd->disk), "server does not support multiple connections per device.\n");
-                       error = -EINVAL;
-                       goto out_err;
-               }
-
-               set_bit(NBD_RUNNING, &nbd->runtime_flags);
-               blk_mq_update_nr_hw_queues(&nbd->tag_set, nbd->num_connections);
-               args = kcalloc(num_connections, sizeof(*args), GFP_KERNEL);
-               if (!args) {
-                       error = -ENOMEM;
-                       goto out_err;
-               }
-               nbd->task_recv = current;
-               mutex_unlock(&nbd->config_lock);
-
-               nbd_parse_flags(nbd, bdev);
-
-               error = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
-               if (error) {
-                       dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
-                       goto out_recv;
-               }
-
-               nbd_size_update(nbd, bdev);
-
-               nbd_dev_dbg_init(nbd);
-               for (i = 0; i < num_connections; i++) {
-                       sk_set_memalloc(nbd->socks[i]->sock->sk);
-                       atomic_inc(&nbd->recv_threads);
-                       INIT_WORK(&args[i].work, recv_work);
-                       args[i].nbd = nbd;
-                       args[i].index = i;
-                       queue_work(recv_workqueue, &args[i].work);
-               }
-               wait_event_interruptible(nbd->recv_wq,
-                                        atomic_read(&nbd->recv_threads) == 0);
-               for (i = 0; i < num_connections; i++)
-                       flush_work(&args[i].work);
-               nbd_dev_dbg_close(nbd);
-               nbd_size_clear(nbd, bdev);
-               device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
-out_recv:
-               mutex_lock(&nbd->config_lock);
-               nbd->task_recv = NULL;
-out_err:
-               sock_shutdown(nbd);
-               nbd_clear_que(nbd);
-               kill_bdev(bdev);
-               nbd_bdev_reset(bdev);
-
-               /* user requested, ignore socket errors */
-               if (test_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags))
-                       error = 0;
-               if (test_bit(NBD_TIMEDOUT, &nbd->runtime_flags))
-                       error = -ETIMEDOUT;
-
-               nbd_reset(nbd);
-               return error;
-       }
-
+       case NBD_DO_IT:
+               return nbd_start_device(nbd, bdev);
        case NBD_CLEAR_QUE:
                /*
                 * This is for compatibility only.  The queue is always cleared
                 * by NBD_DO_IT or NBD_CLEAR_SOCK.
                 */
                return 0;
-
        case NBD_PRINT_DEBUG:
                /*
                 * For compatibility only, we no longer keep a list of
@@ -1134,8 +1121,10 @@ static int __init nbd_init(void)
        if (!recv_workqueue)
                return -ENOMEM;
 
-       if (register_blkdev(NBD_MAJOR, "nbd"))
+       if (register_blkdev(NBD_MAJOR, "nbd")) {
+               destroy_workqueue(recv_workqueue);
                return -EIO;
+       }
 
        nbd_dbg_init();
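
For context, a minimal user-space sketch (not part of this diff) of the ioctl
sequence that the refactored handlers above service; the device path, the
sizes and the already-connected socket fd are illustrative assumptions, and
error checking is trimmed down.

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/nbd.h>

static int nbd_attach(int sock_fd)		/* connected socket to an NBD server */
{
	int nbd = open("/dev/nbd0", O_RDWR);

	if (nbd < 0)
		return -1;
	ioctl(nbd, NBD_SET_BLKSIZE, 4096UL);	/* sizing ioctls in the switch above */
	ioctl(nbd, NBD_SET_SIZE_BLOCKS, 1UL << 20);
	ioctl(nbd, NBD_SET_SOCK, sock_fd);	/* -> nbd_add_socket()               */
	ioctl(nbd, NBD_DO_IT, 0UL);		/* -> nbd_start_device(); blocks     */
						/*    here until disconnect          */
	ioctl(nbd, NBD_CLEAR_SOCK, 0UL);	/* -> nbd_clear_sock()               */
	close(nbd);
	return 0;
}

NBD_DISCONNECT would be issued from a second process against the same device
node to make the blocked NBD_DO_IT return.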
 
index cab157331c4eae29a65d50df6204a5a46c2587a6..3f3a3ab3d50ae02b418c27dc4a34d9ef8a44e9c4 100644 (file)
@@ -34,6 +34,7 @@ MODULE_LICENSE("GPL");
 MODULE_VERSION(DRV_MODULE_VERSION);
 
 #define VDC_TX_RING_SIZE       512
+#define VDC_DEFAULT_BLK_SIZE   512
 
 #define WAITING_FOR_LINK_UP    0x01
 #define WAITING_FOR_TX_SPACE   0x02
@@ -73,6 +74,7 @@ struct vdc_port {
        u32                     vdisk_size;
        u8                      vdisk_type;
        u8                      vdisk_mtype;
+       u32                     vdisk_phys_blksz;
 
        char                    disk_name[32];
 };
@@ -88,6 +90,7 @@ static inline struct vdc_port *to_vdc_port(struct vio_driver_state *vio)
 
 /* Ordered from largest major to lowest */
 static struct vio_version vdc_versions[] = {
+       { .major = 1, .minor = 2 },
        { .major = 1, .minor = 1 },
        { .major = 1, .minor = 0 },
 };
@@ -271,6 +274,11 @@ static int vdc_handle_attr(struct vio_driver_state *vio, void *arg)
                if (pkt->max_xfer_size < port->max_xfer_size)
                        port->max_xfer_size = pkt->max_xfer_size;
                port->vdisk_block_size = pkt->vdisk_block_size;
+
+               port->vdisk_phys_blksz = VDC_DEFAULT_BLK_SIZE;
+               if (vdc_version_supported(port, 1, 2))
+                       port->vdisk_phys_blksz = pkt->phys_block_size;
+
                return 0;
        } else {
                printk(KERN_ERR PFX "%s: Attribute NACK\n", vio->name);
@@ -754,6 +762,12 @@ static int probe_disk(struct vdc_port *port)
        if (err)
                return err;
 
+       /* Using version 1.2 means vdisk_phys_blksz should be set unless the
+        * disk is reserved by another system.
+        */
+       if (vdc_version_supported(port, 1, 2) && !port->vdisk_phys_blksz)
+               return -ENODEV;
+
        if (vdc_version_supported(port, 1, 1)) {
                /* vdisk_size should be set during the handshake, if it wasn't
                 * then the underlying disk is reserved by another system
@@ -829,6 +843,8 @@ static int probe_disk(struct vdc_port *port)
                }
        }
 
+       blk_queue_physical_block_size(q, port->vdisk_phys_blksz);
+
        pr_info(PFX "%s: %u sectors (%u MB) protocol %d.%d\n",
               g->disk_name,
               port->vdisk_size, (port->vdisk_size >> (20 - 9)),
@@ -910,7 +926,7 @@ static int vdc_port_probe(struct vio_dev *vdev, const struct vio_device_id *id)
        if (err)
                goto err_out_free_port;
 
-       port->vdisk_block_size = 512;
+       port->vdisk_block_size = VDC_DEFAULT_BLK_SIZE;
        port->max_xfer_size = ((128 * 1024) / port->vdisk_block_size);
        port->ring_cookies = ((port->max_xfer_size *
                               port->vdisk_block_size) / PAGE_SIZE) + 2;
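
The hunks above gate the new physical block size on vdc_version_supported(port, 1, 2),
which is not shown in this excerpt; judging from the existing calls it presumably
compares the version negotiated from vdc_versions[] along these lines (the signature
and field names are assumptions):

static inline int vdc_version_supported(struct vdc_port *port,
					u16 major, u16 minor)
{
	return port->vio.ver.major == major && port->vio.ver.minor >= minor;
}

With { .major = 1, .minor = 2 } added to vdc_versions[], a server that also speaks
v1.2 makes this true and pkt->phys_block_size is honoured; older servers fall back
to VDC_DEFAULT_BLK_SIZE.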
index 67d76f21fecd9a6e48d73861a19ef8c41914e0dd..28955b94d2b26f47d7c54217d84c2a8a11af692a 100644 (file)
@@ -328,13 +328,15 @@ static void dm_softirq_done(struct request *rq)
        int rw;
 
        if (!clone) {
-               rq_end_stats(tio->md, rq);
+               struct mapped_device *md = tio->md;
+
+               rq_end_stats(md, rq);
                rw = rq_data_dir(rq);
                if (!rq->q->mq_ops)
                        blk_end_request_all(rq, tio->error);
                else
                        blk_mq_end_request(rq, tio->error);
-               rq_completed(tio->md, rw, false);
+               rq_completed(md, rw, false);
                return;
        }
 
index 685aa2d77e2526935f8f2f416ad8d8681b6c7b14..b0536cfd8e174b83a53d49391552b0c7ec64aef8 100644 (file)
@@ -214,7 +214,7 @@ static void faulty_make_request(struct mddev *mddev, struct bio *bio)
                }
        }
        if (failit) {
-               struct bio *b = bio_clone_mddev(bio, GFP_NOIO, mddev);
+               struct bio *b = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
 
                b->bi_bdev = conf->rdev->bdev;
                b->bi_private = bio;
index f1c7bbac31a580bb6f708b614696f17404badc0d..3e38e0207a3eb44339ad6431dc3557ae27d05612 100644 (file)
@@ -53,18 +53,26 @@ static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector)
        return conf->disks + lo;
 }
 
+/*
+ * In linear_congested() conf->raid_disks is used as a copy of
+ * mddev->raid_disks to iterate conf->disks[]. Because conf->raid_disks
+ * and conf->disks[] are both created in linear_conf(), they are always
+ * consistent with each other, while mddev->raid_disks may not be.
+ */
 static int linear_congested(struct mddev *mddev, int bits)
 {
        struct linear_conf *conf;
        int i, ret = 0;
 
-       conf = mddev->private;
+       rcu_read_lock();
+       conf = rcu_dereference(mddev->private);
 
-       for (i = 0; i < mddev->raid_disks && !ret ; i++) {
+       for (i = 0; i < conf->raid_disks && !ret ; i++) {
                struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev);
                ret |= bdi_congested(q->backing_dev_info, bits);
        }
 
+       rcu_read_unlock();
        return ret;
 }
 
@@ -144,6 +152,19 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
                        conf->disks[i-1].end_sector +
                        conf->disks[i].rdev->sectors;
 
+       /*
+        * conf->raid_disks is a copy of mddev->raid_disks. The reason to
+        * keep a copy of mddev->raid_disks in struct linear_conf is that
+        * mddev->raid_disks may not be consistent with the number of
+        * pointers in conf->disks[] when it is updated in linear_add()
+        * and used to iterate the old conf->disks[] array in
+        * linear_congested(). Here conf->raid_disks is always consistent
+        * with the number of pointers in conf->disks[], and mddev->private
+        * is updated with rcu_assign_pointer() in linear_add(), so such a
+        * race can be avoided.
+        */
+       conf->raid_disks = raid_disks;
+
        return conf;
 
 out:
@@ -196,15 +217,24 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
        if (!newconf)
                return -ENOMEM;
 
+       /* newconf->raid_disks already keeps a copy of the increased
+        * value of mddev->raid_disks, WARN_ONCE() is just used to make
+        * sure of this. It is possible that oldconf is still referenced
+        * in linear_congested(), therefore kfree_rcu() is used to free
+        * oldconf until no one uses it anymore.
+        */
        mddev_suspend(mddev);
-       oldconf = mddev->private;
+       oldconf = rcu_dereference_protected(mddev->private,
+                       lockdep_is_held(&mddev->reconfig_mutex));
        mddev->raid_disks++;
-       mddev->private = newconf;
+       WARN_ONCE(mddev->raid_disks != newconf->raid_disks,
+               "copied raid_disks doesn't match mddev->raid_disks");
+       rcu_assign_pointer(mddev->private, newconf);
        md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
        set_capacity(mddev->gendisk, mddev->array_sectors);
        mddev_resume(mddev);
        revalidate_disk(mddev->gendisk);
-       kfree(oldconf);
+       kfree_rcu(oldconf, rcu);
        return 0;
 }
 
@@ -262,6 +292,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
                                trace_block_bio_remap(bdev_get_queue(split->bi_bdev),
                                                      split, disk_devt(mddev->gendisk),
                                                      bio_sector);
+                       mddev_check_writesame(mddev, split);
                        generic_make_request(split);
                }
        } while (split != bio);
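
The hunks above are the standard RCU publish/read/reclaim pattern: linear_congested()
dereferences mddev->private under rcu_read_lock(), linear_add() publishes the new conf
with rcu_assign_pointer() and defers freeing the old one with kfree_rcu(), which is why
struct linear_conf carries a struct rcu_head (see the linear.h hunk below). A generic,
self-contained sketch of the same pattern, with illustrative names rather than md's:

#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct cfg {
	struct rcu_head rcu;		/* required for kfree_rcu() */
	int nr;
};

static struct cfg __rcu *active_cfg;
static DEFINE_MUTEX(cfg_lock);		/* serializes updaters */

static int reader_count(void)		/* plays the role of linear_congested() */
{
	struct cfg *c;
	int nr;

	rcu_read_lock();
	c = rcu_dereference(active_cfg);
	nr = c ? c->nr : 0;
	rcu_read_unlock();
	return nr;
}

static void updater_replace(struct cfg *newc)	/* plays the role of linear_add() */
{
	struct cfg *oldc;

	mutex_lock(&cfg_lock);
	oldc = rcu_dereference_protected(active_cfg,
					 lockdep_is_held(&cfg_lock));
	rcu_assign_pointer(active_cfg, newc);	/* publish the new config */
	mutex_unlock(&cfg_lock);
	if (oldc)
		kfree_rcu(oldc, rcu);		/* freed after all readers drop out */
}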
index b685ddd7d7f76c25553ce88abd049fd9ead545e6..8d392e6098b3295ddbebac59e418a27ae21712e5 100644 (file)
@@ -10,6 +10,7 @@ struct linear_conf
 {
        struct rcu_head         rcu;
        sector_t                array_sectors;
+       int                     raid_disks; /* a copy of mddev->raid_disks */
        struct dev_info         disks[0];
 };
 #endif
index ba485dcf1064dd463bdb93edd85157b247d0fcb5..985374f20e2e3f4d78ac1d7d77213b2e1ad7e80c 100644 (file)
@@ -190,16 +190,6 @@ struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
 }
 EXPORT_SYMBOL_GPL(bio_alloc_mddev);
 
-struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
-                           struct mddev *mddev)
-{
-       if (!mddev || !mddev->bio_set)
-               return bio_clone(bio, gfp_mask);
-
-       return bio_clone_bioset(bio, gfp_mask, mddev->bio_set);
-}
-EXPORT_SYMBOL_GPL(bio_clone_mddev);
-
 /*
  * We have a system wide 'event count' that is incremented
  * on any 'interesting' event, and readers of /proc/mdstat
@@ -5228,8 +5218,11 @@ int md_run(struct mddev *mddev)
                sysfs_notify_dirent_safe(rdev->sysfs_state);
        }
 
-       if (mddev->bio_set == NULL)
+       if (mddev->bio_set == NULL) {
                mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0);
+               if (!mddev->bio_set)
+                       return -ENOMEM;
+       }
 
        spin_lock(&pers_lock);
        pers = find_pers(mddev->level, mddev->clevel);
@@ -8980,7 +8973,14 @@ static __exit void md_exit(void)
 
        for_each_mddev(mddev, tmp) {
                export_array(mddev);
+               mddev->ctime = 0;
                mddev->hold_active = 0;
+               /*
+                * for_each_mddev() will call mddev_put() at the end of each
+                * iteration.  As the mddev is now fully clear, this will
+                * schedule the mddev for destruction by a workqueue, and the
+                * destroy_workqueue() below will wait for that to complete.
+                */
        }
        destroy_workqueue(md_misc_wq);
        destroy_workqueue(md_wq);
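
With bio_clone_mddev() removed, callers clone through mddev->bio_set directly (see the
faulty.c hunk above and the raid1.c hunks below), which is why md_run() must now fail
cleanly when the bioset cannot be created. A minimal sketch of the replacement pattern,
with illustrative names:

#include <linux/bio.h>

static struct bio_set *md_bs;

static int example_setup(void)
{
	md_bs = bioset_create(BIO_POOL_SIZE, 0);
	if (!md_bs)
		return -ENOMEM;		/* md_run() now bails out the same way */
	return 0;
}

static struct bio *example_clone(struct bio *bio)
{
	/* cheap clone sharing the original's bvecs, drawn from the bioset */
	return bio_clone_fast(bio, GFP_NOIO, md_bs);
}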
index 2a514036a83dc0da07c0966b7fe247c18356bbbf..b8859cbf84b618b39ed3d92a2887e8764c403919 100644 (file)
@@ -673,8 +673,6 @@ extern void md_rdev_clear(struct md_rdev *rdev);
 
 extern void mddev_suspend(struct mddev *mddev);
 extern void mddev_resume(struct mddev *mddev);
-extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
-                                  struct mddev *mddev);
 extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
                                   struct mddev *mddev);
 
@@ -710,4 +708,11 @@ static inline void mddev_clear_unsupported_flags(struct mddev *mddev,
 {
        mddev->flags &= ~unsupported_flags;
 }
+
+static inline void mddev_check_writesame(struct mddev *mddev, struct bio *bio)
+{
+       if (bio_op(bio) == REQ_OP_WRITE_SAME &&
+           !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)
+               mddev->queue->limits.max_write_same_sectors = 0;
+}
 #endif /* _MD_MD_H */
index d457afa672d57a172965aa000913fa6ff6625878..79a12b59250bbca870be857eb7cf350c0c9b53ad 100644 (file)
@@ -138,6 +138,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio)
        mp_bh->bio.bi_opf |= REQ_FAILFAST_TRANSPORT;
        mp_bh->bio.bi_end_io = multipath_end_request;
        mp_bh->bio.bi_private = mp_bh;
+       mddev_check_writesame(mddev, &mp_bh->bio);
        generic_make_request(&mp_bh->bio);
        return;
 }
index d6585239bff22809edbcaf3881dc2f2ae0a2f41e..93347ca7c7a617e097ccafcbedbecdfa396d4968 100644 (file)
@@ -503,6 +503,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
                                trace_block_bio_remap(bdev_get_queue(split->bi_bdev),
                                                      split, disk_devt(mddev->gendisk),
                                                      bio_sector);
+                       mddev_check_writesame(mddev, split);
                        generic_make_request(split);
                }
        } while (split != bio);
index 830ff2b203463ef075d53a6c7a2ae22e0ec2c7d9..7453d94eeed700c8ac30da1b8d7857b4788fdbd5 100644 (file)
@@ -71,9 +71,8 @@
  */
 static int max_queued_requests = 1024;
 
-static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
-                         sector_t bi_sector);
-static void lower_barrier(struct r1conf *conf);
+static void allow_barrier(struct r1conf *conf, sector_t sector_nr);
+static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
 
 #define raid1_log(md, fmt, args...)                            \
        do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0)
@@ -100,7 +99,6 @@ static void r1bio_pool_free(void *r1_bio, void *data)
 #define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
 #define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
 #define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
-#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS)
 
 static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 {
@@ -205,6 +203,7 @@ static void free_r1bio(struct r1bio *r1_bio)
 static void put_buf(struct r1bio *r1_bio)
 {
        struct r1conf *conf = r1_bio->mddev->private;
+       sector_t sect = r1_bio->sector;
        int i;
 
        for (i = 0; i < conf->raid_disks * 2; i++) {
@@ -215,7 +214,7 @@ static void put_buf(struct r1bio *r1_bio)
 
        mempool_free(r1_bio, conf->r1buf_pool);
 
-       lower_barrier(conf);
+       lower_barrier(conf, sect);
 }
 
 static void reschedule_retry(struct r1bio *r1_bio)
@@ -223,10 +222,12 @@ static void reschedule_retry(struct r1bio *r1_bio)
        unsigned long flags;
        struct mddev *mddev = r1_bio->mddev;
        struct r1conf *conf = mddev->private;
+       int idx;
 
+       idx = sector_to_idx(r1_bio->sector);
        spin_lock_irqsave(&conf->device_lock, flags);
        list_add(&r1_bio->retry_list, &conf->retry_list);
-       conf->nr_queued ++;
+       atomic_inc(&conf->nr_queued[idx]);
        spin_unlock_irqrestore(&conf->device_lock, flags);
 
        wake_up(&conf->wait_barrier);
@@ -243,7 +244,6 @@ static void call_bio_endio(struct r1bio *r1_bio)
        struct bio *bio = r1_bio->master_bio;
        int done;
        struct r1conf *conf = r1_bio->mddev->private;
-       sector_t start_next_window = r1_bio->start_next_window;
        sector_t bi_sector = bio->bi_iter.bi_sector;
 
        if (bio->bi_phys_segments) {
@@ -269,7 +269,7 @@ static void call_bio_endio(struct r1bio *r1_bio)
                 * Wake up any possible resync thread that waits for the device
                 * to go idle.
                 */
-               allow_barrier(conf, start_next_window, bi_sector);
+               allow_barrier(conf, bi_sector);
        }
 }
 
@@ -517,6 +517,25 @@ static void raid1_end_write_request(struct bio *bio)
                bio_put(to_put);
 }
 
+static sector_t align_to_barrier_unit_end(sector_t start_sector,
+                                         sector_t sectors)
+{
+       sector_t len;
+
+       WARN_ON(sectors == 0);
+       /*
+        * len is the number of sectors from start_sector to end of the
+        * barrier unit which start_sector belongs to.
+        */
+       len = round_up(start_sector + 1, BARRIER_UNIT_SECTOR_SIZE) -
+             start_sector;
+
+       if (len > sectors)
+               len = sectors;
+
+       return len;
+}
+
 /*
  * This routine returns the disk from which the requested read should
  * be done. There is a per-array 'next expected sequential IO' sector
@@ -813,168 +832,228 @@ static void flush_pending_writes(struct r1conf *conf)
  */
 static void raise_barrier(struct r1conf *conf, sector_t sector_nr)
 {
+       int idx = sector_to_idx(sector_nr);
+
        spin_lock_irq(&conf->resync_lock);
 
        /* Wait until no block IO is waiting */
-       wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
+       wait_event_lock_irq(conf->wait_barrier,
+                           !atomic_read(&conf->nr_waiting[idx]),
                            conf->resync_lock);
 
        /* block any new IO from starting */
-       conf->barrier++;
-       conf->next_resync = sector_nr;
+       atomic_inc(&conf->barrier[idx]);
+       /*
+        * In raise_barrier() we firstly increase conf->barrier[idx] then
+        * check conf->nr_pending[idx]. In _wait_barrier() we firstly
+        * increase conf->nr_pending[idx] then check conf->barrier[idx].
+        * A memory barrier is needed here to make sure conf->nr_pending[idx]
+        * won't be fetched before conf->barrier[idx] is increased. Otherwise
+        * there will be a race between raise_barrier() and _wait_barrier().
+        */
+       smp_mb__after_atomic();
 
        /* For these conditions we must wait:
         * A: while the array is in frozen state
-        * B: while barrier >= RESYNC_DEPTH, meaning resync reach
-        *    the max count which allowed.
-        * C: next_resync + RESYNC_SECTORS > start_next_window, meaning
-        *    next resync will reach to the window which normal bios are
-        *    handling.
-        * D: while there are any active requests in the current window.
+        * B: while conf->nr_pending[idx] is not 0, meaning regular I/O
+        *    exists in the corresponding I/O barrier bucket.
+        * C: while conf->barrier[idx] >= RESYNC_DEPTH, meaning the resync
+        *    count reaches the maximum allowed on the current barrier bucket.
         */
        wait_event_lock_irq(conf->wait_barrier,
                            !conf->array_frozen &&
-                           conf->barrier < RESYNC_DEPTH &&
-                           conf->current_window_requests == 0 &&
-                           (conf->start_next_window >=
-                            conf->next_resync + RESYNC_SECTORS),
+                            !atomic_read(&conf->nr_pending[idx]) &&
+                            atomic_read(&conf->barrier[idx]) < RESYNC_DEPTH,
                            conf->resync_lock);
 
-       conf->nr_pending++;
+       atomic_inc(&conf->nr_pending[idx]);
        spin_unlock_irq(&conf->resync_lock);
 }
 
-static void lower_barrier(struct r1conf *conf)
+static void lower_barrier(struct r1conf *conf, sector_t sector_nr)
 {
-       unsigned long flags;
-       BUG_ON(conf->barrier <= 0);
-       spin_lock_irqsave(&conf->resync_lock, flags);
-       conf->barrier--;
-       conf->nr_pending--;
-       spin_unlock_irqrestore(&conf->resync_lock, flags);
+       int idx = sector_to_idx(sector_nr);
+
+       BUG_ON(atomic_read(&conf->barrier[idx]) <= 0);
+
+       atomic_dec(&conf->barrier[idx]);
+       atomic_dec(&conf->nr_pending[idx]);
        wake_up(&conf->wait_barrier);
 }
 
-static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio)
+static void _wait_barrier(struct r1conf *conf, int idx)
 {
-       bool wait = false;
+       /*
+        * We need to increase conf->nr_pending[idx] very early here,
+        * so that raise_barrier() can be blocked when it waits for
+        * conf->nr_pending[idx] to be 0. Then we can avoid holding
+        * conf->resync_lock when there is no barrier raised in the same
+        * barrier unit bucket. Also if the array is frozen, I/O
+        * should be blocked until the array is unfrozen.
+        */
+       atomic_inc(&conf->nr_pending[idx]);
+       /*
+        * In _wait_barrier() we firstly increase conf->nr_pending[idx], then
+        * check conf->barrier[idx]. In raise_barrier() we firstly increase
+        * conf->barrier[idx], then check conf->nr_pending[idx]. A memory
+        * barrier is necessary here to make sure conf->barrier[idx] won't be
+        * fetched before conf->nr_pending[idx] is increased. Otherwise there
+        * will be a race between _wait_barrier() and raise_barrier().
+        */
+       smp_mb__after_atomic();
 
-       if (conf->array_frozen || !bio)
-               wait = true;
-       else if (conf->barrier && bio_data_dir(bio) == WRITE) {
-               if ((conf->mddev->curr_resync_completed
-                    >= bio_end_sector(bio)) ||
-                   (conf->start_next_window + NEXT_NORMALIO_DISTANCE
-                    <= bio->bi_iter.bi_sector))
-                       wait = false;
-               else
-                       wait = true;
-       }
+       /*
+        * Don't worry about checking two atomic_t variables at the same
+        * time here. If, while we check conf->barrier[idx], the array is
+        * frozen (conf->array_frozen is 1) and conf->barrier[idx] is
+        * 0, it is safe to return and let the I/O continue. Because the
+        * array is frozen, all I/O returned here will eventually complete
+        * or be queued, and no race will happen. See the code comment in
+        * freeze_array().
+        */
+       if (!READ_ONCE(conf->array_frozen) &&
+           !atomic_read(&conf->barrier[idx]))
+               return;
 
-       return wait;
+       /*
+        * After holding conf->resync_lock, conf->nr_pending[idx]
+        * should be decreased before waiting for the barrier to drop.
+        * Otherwise, we may encounter a race condition because
+        * raise_barrier() might be waiting for conf->nr_pending[idx]
+        * to be 0 at the same time.
+        */
+       spin_lock_irq(&conf->resync_lock);
+       atomic_inc(&conf->nr_waiting[idx]);
+       atomic_dec(&conf->nr_pending[idx]);
+       /*
+        * In case freeze_array() is waiting for
+        * get_unqueued_pending() == extra
+        */
+       wake_up(&conf->wait_barrier);
+       /* Wait for the barrier in the same barrier unit bucket to drop. */
+       wait_event_lock_irq(conf->wait_barrier,
+                           !conf->array_frozen &&
+                            !atomic_read(&conf->barrier[idx]),
+                           conf->resync_lock);
+       atomic_inc(&conf->nr_pending[idx]);
+       atomic_dec(&conf->nr_waiting[idx]);
+       spin_unlock_irq(&conf->resync_lock);
 }
 
-static sector_t wait_barrier(struct r1conf *conf, struct bio *bio)
+static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
 {
-       sector_t sector = 0;
+       int idx = sector_to_idx(sector_nr);
 
-       spin_lock_irq(&conf->resync_lock);
-       if (need_to_wait_for_sync(conf, bio)) {
-               conf->nr_waiting++;
-               /* Wait for the barrier to drop.
-                * However if there are already pending
-                * requests (preventing the barrier from
-                * rising completely), and the
-                * per-process bio queue isn't empty,
-                * then don't wait, as we need to empty
-                * that queue to allow conf->start_next_window
-                * to increase.
-                */
-               raid1_log(conf->mddev, "wait barrier");
-               wait_event_lock_irq(conf->wait_barrier,
-                                   !conf->array_frozen &&
-                                   (!conf->barrier ||
-                                    ((conf->start_next_window <
-                                      conf->next_resync + RESYNC_SECTORS) &&
-                                     current->bio_list &&
-                                     !bio_list_empty(current->bio_list))),
-                                   conf->resync_lock);
-               conf->nr_waiting--;
-       }
-
-       if (bio && bio_data_dir(bio) == WRITE) {
-               if (bio->bi_iter.bi_sector >= conf->next_resync) {
-                       if (conf->start_next_window == MaxSector)
-                               conf->start_next_window =
-                                       conf->next_resync +
-                                       NEXT_NORMALIO_DISTANCE;
-
-                       if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE)
-                           <= bio->bi_iter.bi_sector)
-                               conf->next_window_requests++;
-                       else
-                               conf->current_window_requests++;
-                       sector = conf->start_next_window;
-               }
-       }
+       /*
+        * Very similar to _wait_barrier(). The difference is, for read
+        * I/O we don't need to wait for sync I/O, but if the whole array
+        * is frozen, the read I/O still has to wait until the array is
+        * unfrozen. Since there is no ordering requirement with
+        * conf->barrier[idx] here, a memory barrier is unnecessary as well.
+        */
+       atomic_inc(&conf->nr_pending[idx]);
 
-       conf->nr_pending++;
+       if (!READ_ONCE(conf->array_frozen))
+               return;
+
+       spin_lock_irq(&conf->resync_lock);
+       atomic_inc(&conf->nr_waiting[idx]);
+       atomic_dec(&conf->nr_pending[idx]);
+       /*
+        * In case freeze_array() is waiting for
+        * get_unqueued_pending() == extra
+        */
+       wake_up(&conf->wait_barrier);
+       /* Wait for array to be unfrozen */
+       wait_event_lock_irq(conf->wait_barrier,
+                           !conf->array_frozen,
+                           conf->resync_lock);
+       atomic_inc(&conf->nr_pending[idx]);
+       atomic_dec(&conf->nr_waiting[idx]);
        spin_unlock_irq(&conf->resync_lock);
-       return sector;
 }
 
-static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
-                         sector_t bi_sector)
+static void wait_barrier(struct r1conf *conf, sector_t sector_nr)
 {
-       unsigned long flags;
+       int idx = sector_to_idx(sector_nr);
 
-       spin_lock_irqsave(&conf->resync_lock, flags);
-       conf->nr_pending--;
-       if (start_next_window) {
-               if (start_next_window == conf->start_next_window) {
-                       if (conf->start_next_window + NEXT_NORMALIO_DISTANCE
-                           <= bi_sector)
-                               conf->next_window_requests--;
-                       else
-                               conf->current_window_requests--;
-               } else
-                       conf->current_window_requests--;
-
-               if (!conf->current_window_requests) {
-                       if (conf->next_window_requests) {
-                               conf->current_window_requests =
-                                       conf->next_window_requests;
-                               conf->next_window_requests = 0;
-                               conf->start_next_window +=
-                                       NEXT_NORMALIO_DISTANCE;
-                       } else
-                               conf->start_next_window = MaxSector;
-               }
-       }
-       spin_unlock_irqrestore(&conf->resync_lock, flags);
+       _wait_barrier(conf, idx);
+}
+
+static void wait_all_barriers(struct r1conf *conf)
+{
+       int idx;
+
+       for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
+               _wait_barrier(conf, idx);
+}
+
+static void _allow_barrier(struct r1conf *conf, int idx)
+{
+       atomic_dec(&conf->nr_pending[idx]);
        wake_up(&conf->wait_barrier);
 }
 
+static void allow_barrier(struct r1conf *conf, sector_t sector_nr)
+{
+       int idx = sector_to_idx(sector_nr);
+
+       _allow_barrier(conf, idx);
+}
+
+static void allow_all_barriers(struct r1conf *conf)
+{
+       int idx;
+
+       for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
+               _allow_barrier(conf, idx);
+}
+
+/* conf->resync_lock should be held */
+static int get_unqueued_pending(struct r1conf *conf)
+{
+       int idx, ret;
+
+       for (ret = 0, idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
+               ret += atomic_read(&conf->nr_pending[idx]) -
+                       atomic_read(&conf->nr_queued[idx]);
+
+       return ret;
+}
+
 static void freeze_array(struct r1conf *conf, int extra)
 {
-       /* stop syncio and normal IO and wait for everything to
+       /* Stop sync I/O and normal I/O and wait for everything to
         * go quiet.
-        * We wait until nr_pending match nr_queued+extra
-        * This is called in the context of one normal IO request
-        * that has failed. Thus any sync request that might be pending
-        * will be blocked by nr_pending, and we need to wait for
-        * pending IO requests to complete or be queued for re-try.
-        * Thus the number queued (nr_queued) plus this request (extra)
-        * must match the number of pending IOs (nr_pending) before
-        * we continue.
+        * This is called in two situations:
+        * 1) management command handlers (reshape, remove disk, quiesce).
+        * 2) one normal I/O request failed.
+        *
+        * After array_frozen is set to 1, new sync I/O will be blocked at
+        * raise_barrier(), and new normal I/O will be blocked at
+        * _wait_barrier() or wait_read_barrier(). The flying I/Os will either
+        * complete or be queued. When everything goes quiet, there are only
+        * queued I/Os left.
+        *
+        * Every flying I/O contributes to a conf->nr_pending[idx], where idx
+        * is the barrier bucket index which this I/O request hits. When all
+        * sync and normal I/O are queued, the sum of all conf->nr_pending[]
+        * will match the sum of all conf->nr_queued[]. But normal I/O failure
+        * is an exception: in handle_read_error(), we may call freeze_array()
+        * before trying to fix the read error. In this case, the failed read
+        * I/O is not queued, so get_unqueued_pending() == 1.
+        *
+        * Therefore before this function returns, we need to wait until
+        * get_unqueued_pending(conf) equals extra. For the normal I/O
+        * context, extra is 1; in other situations, extra is 0.
         */
        spin_lock_irq(&conf->resync_lock);
        conf->array_frozen = 1;
        raid1_log(conf->mddev, "wait freeze");
-       wait_event_lock_irq_cmd(conf->wait_barrier,
-                               conf->nr_pending == conf->nr_queued+extra,
-                               conf->resync_lock,
-                               flush_pending_writes(conf));
+       wait_event_lock_irq_cmd(
+               conf->wait_barrier,
+               get_unqueued_pending(conf) == extra,
+               conf->resync_lock,
+               flush_pending_writes(conf));
        spin_unlock_irq(&conf->resync_lock);
 }
 static void unfreeze_array(struct r1conf *conf)
@@ -982,8 +1061,8 @@ static void unfreeze_array(struct r1conf *conf)
        /* reverse the effect of the freeze */
        spin_lock_irq(&conf->resync_lock);
        conf->array_frozen = 0;
-       wake_up(&conf->wait_barrier);
        spin_unlock_irq(&conf->resync_lock);
+       wake_up(&conf->wait_barrier);
 }
 
 /* duplicate the data pages for behind I/O
@@ -1070,11 +1149,28 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
        kfree(plug);
 }
 
-static void raid1_read_request(struct mddev *mddev, struct bio *bio,
-                                struct r1bio *r1_bio)
+static inline struct r1bio *
+alloc_r1bio(struct mddev *mddev, struct bio *bio, sector_t sectors_handled)
+{
+       struct r1conf *conf = mddev->private;
+       struct r1bio *r1_bio;
+
+       r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
+
+       r1_bio->master_bio = bio;
+       r1_bio->sectors = bio_sectors(bio) - sectors_handled;
+       r1_bio->state = 0;
+       r1_bio->mddev = mddev;
+       r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
+
+       return r1_bio;
+}
+
+static void raid1_read_request(struct mddev *mddev, struct bio *bio)
 {
        struct r1conf *conf = mddev->private;
        struct raid1_info *mirror;
+       struct r1bio *r1_bio;
        struct bio *read_bio;
        struct bitmap *bitmap = mddev->bitmap;
        const int op = bio_op(bio);
@@ -1083,8 +1179,29 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
        int max_sectors;
        int rdisk;
 
-       wait_barrier(conf, bio);
+       /*
+        * We still need a barrier for READ in case the whole
+        * array is frozen.
+        */
+       wait_read_barrier(conf, bio->bi_iter.bi_sector);
+
+       r1_bio = alloc_r1bio(mddev, bio, 0);
 
+       /*
+        * We might need to issue multiple reads to different
+        * devices if there are bad blocks around, so we keep
+        * track of the number of reads in bio->bi_phys_segments.
+        * If this is 0, there is only one r1_bio and no locking
+        * will be needed when requests complete.  If it is
+        * non-zero, then it is the number of not-completed requests.
+        */
+       bio->bi_phys_segments = 0;
+       bio_clear_flag(bio, BIO_SEG_VALID);
+
+       /*
+        * make_request() can abort the operation when read-ahead is being
+        * used and no empty request is available.
+        */
 read_again:
        rdisk = read_balance(conf, r1_bio, &max_sectors);
 
@@ -1106,9 +1223,8 @@ read_again:
                           atomic_read(&bitmap->behind_writes) == 0);
        }
        r1_bio->read_disk = rdisk;
-       r1_bio->start_next_window = 0;
 
-       read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+       read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
        bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector,
                 max_sectors);
 
@@ -1151,22 +1267,16 @@ read_again:
                 */
                reschedule_retry(r1_bio);
 
-               r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
-
-               r1_bio->master_bio = bio;
-               r1_bio->sectors = bio_sectors(bio) - sectors_handled;
-               r1_bio->state = 0;
-               r1_bio->mddev = mddev;
-               r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
+               r1_bio = alloc_r1bio(mddev, bio, sectors_handled);
                goto read_again;
        } else
                generic_make_request(read_bio);
 }
 
-static void raid1_write_request(struct mddev *mddev, struct bio *bio,
-                               struct r1bio *r1_bio)
+static void raid1_write_request(struct mddev *mddev, struct bio *bio)
 {
        struct r1conf *conf = mddev->private;
+       struct r1bio *r1_bio;
        int i, disks;
        struct bitmap *bitmap = mddev->bitmap;
        unsigned long flags;
@@ -1176,7 +1286,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
        int first_clone;
        int sectors_handled;
        int max_sectors;
-       sector_t start_next_window;
 
        /*
         * Register the new request and wait if the reconstruction
@@ -1212,7 +1321,19 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
                }
                finish_wait(&conf->wait_barrier, &w);
        }
-       start_next_window = wait_barrier(conf, bio);
+       wait_barrier(conf, bio->bi_iter.bi_sector);
+
+       r1_bio = alloc_r1bio(mddev, bio, 0);
+
+       /* We might need to issue multiple writes to different
+        * devices if there are bad blocks around, so we keep
+        * track of the number of writes in bio->bi_phys_segments.
+        * If this is 0, there is only one r1_bio and no locking
+        * will be needed when requests complete.  If it is
+        * non-zero, then it is the number of not-completed requests.
+        */
+       bio->bi_phys_segments = 0;
+       bio_clear_flag(bio, BIO_SEG_VALID);
 
        if (conf->pending_count >= max_queued_requests) {
                md_wakeup_thread(mddev->thread);
@@ -1233,7 +1354,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 
        disks = conf->raid_disks * 2;
  retry_write:
-       r1_bio->start_next_window = start_next_window;
        blocked_rdev = NULL;
        rcu_read_lock();
        max_sectors = r1_bio->sectors;
@@ -1300,25 +1420,15 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
        if (unlikely(blocked_rdev)) {
                /* Wait for this device to become unblocked */
                int j;
-               sector_t old = start_next_window;
 
                for (j = 0; j < i; j++)
                        if (r1_bio->bios[j])
                                rdev_dec_pending(conf->mirrors[j].rdev, mddev);
                r1_bio->state = 0;
-               allow_barrier(conf, start_next_window, bio->bi_iter.bi_sector);
+               allow_barrier(conf, bio->bi_iter.bi_sector);
                raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
                md_wait_for_blocked_rdev(blocked_rdev, mddev);
-               start_next_window = wait_barrier(conf, bio);
-               /*
-                * We must make sure the multi r1bios of bio have
-                * the same value of bi_phys_segments
-                */
-               if (bio->bi_phys_segments && old &&
-                   old != start_next_window)
-                       /* Wait for the former r1bio(s) to complete */
-                       wait_event(conf->wait_barrier,
-                                  bio->bi_phys_segments == 1);
+               wait_barrier(conf, bio->bi_iter.bi_sector);
                goto retry_write;
        }
 
@@ -1341,13 +1451,12 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 
        first_clone = 1;
        for (i = 0; i < disks; i++) {
-               struct bio *mbio;
+               struct bio *mbio = NULL;
+               sector_t offset;
                if (!r1_bio->bios[i])
                        continue;
 
-               mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-               bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector,
-                        max_sectors);
+               offset = r1_bio->sector - bio->bi_iter.bi_sector;
 
                if (first_clone) {
                        /* do behind I/O ?
@@ -1357,8 +1466,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
                        if (bitmap &&
                            (atomic_read(&bitmap->behind_writes)
                             < mddev->bitmap_info.max_write_behind) &&
-                           !waitqueue_active(&bitmap->behind_wait))
+                           !waitqueue_active(&bitmap->behind_wait)) {
+                               mbio = bio_clone_bioset_partial(bio, GFP_NOIO,
+                                                               mddev->bio_set,
+                                                               offset << 9,
+                                                               max_sectors << 9);
                                alloc_behind_pages(mbio, r1_bio);
+                       }
 
                        bitmap_startwrite(bitmap, r1_bio->sector,
                                          r1_bio->sectors,
@@ -1366,6 +1480,19 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
                                                   &r1_bio->state));
                        first_clone = 0;
                }
+
+               if (!mbio) {
+                       if (r1_bio->behind_bvecs)
+                               mbio = bio_clone_bioset_partial(bio, GFP_NOIO,
+                                                               mddev->bio_set,
+                                                               offset << 9,
+                                                               max_sectors << 9);
+                       else {
+                               mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
+                               bio_trim(mbio, offset, max_sectors);
+                       }
+               }
+
                if (r1_bio->behind_bvecs) {
                        struct bio_vec *bvec;
                        int j;
@@ -1385,8 +1512,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
                                   conf->mirrors[i].rdev->data_offset);
                mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
                mbio->bi_end_io = raid1_end_write_request;
-               mbio->bi_opf = bio_op(bio) |
-                       (bio->bi_opf & (REQ_SYNC | REQ_PREFLUSH | REQ_FUA));
+               mbio->bi_opf = bio_op(bio) | (bio->bi_opf & (REQ_SYNC | REQ_FUA));
                if (test_bit(FailFast, &conf->mirrors[i].rdev->flags) &&
                    !test_bit(WriteMostly, &conf->mirrors[i].rdev->flags) &&
                    conf->raid_disks - mddev->degraded > 1)
@@ -1427,12 +1553,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
                /* We need another r1_bio.  It has already been counted
                 * in bio->bi_phys_segments
                 */
-               r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
-               r1_bio->master_bio = bio;
-               r1_bio->sectors = bio_sectors(bio) - sectors_handled;
-               r1_bio->state = 0;
-               r1_bio->mddev = mddev;
-               r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
+               r1_bio = alloc_r1bio(mddev, bio, sectors_handled);
                goto retry_write;
        }
 
@@ -1444,36 +1565,30 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 
 static void raid1_make_request(struct mddev *mddev, struct bio *bio)
 {
-       struct r1conf *conf = mddev->private;
-       struct r1bio *r1_bio;
+       struct bio *split;
+       sector_t sectors;
 
-       /*
-        * make_request() can abort the operation when read-ahead is being
-        * used and no empty request is available.
-        *
-        */
-       r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
-
-       r1_bio->master_bio = bio;
-       r1_bio->sectors = bio_sectors(bio);
-       r1_bio->state = 0;
-       r1_bio->mddev = mddev;
-       r1_bio->sector = bio->bi_iter.bi_sector;
+       if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
+               md_flush_request(mddev, bio);
+               return;
+       }
 
-       /*
-        * We might need to issue multiple reads to different devices if there
-        * are bad blocks around, so we keep track of the number of reads in
-        * bio->bi_phys_segments.  If this is 0, there is only one r1_bio and
-        * no locking will be needed when requests complete.  If it is
-        * non-zero, then it is the number of not-completed requests.
-        */
-       bio->bi_phys_segments = 0;
-       bio_clear_flag(bio, BIO_SEG_VALID);
+       /* if bio exceeds barrier unit boundary, split it */
+       do {
+               sectors = align_to_barrier_unit_end(
+                               bio->bi_iter.bi_sector, bio_sectors(bio));
+               if (sectors < bio_sectors(bio)) {
+                       split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set);
+                       bio_chain(split, bio);
+               } else {
+                       split = bio;
+               }
 
-       if (bio_data_dir(bio) == READ)
-               raid1_read_request(mddev, bio, r1_bio);
-       else
-               raid1_write_request(mddev, bio, r1_bio);
+               if (bio_data_dir(split) == READ)
+                       raid1_read_request(mddev, split);
+               else
+                       raid1_write_request(mddev, split);
+       } while (split != bio);
 }
 
 static void raid1_status(struct seq_file *seq, struct mddev *mddev)
@@ -1564,19 +1679,11 @@ static void print_conf(struct r1conf *conf)
 
 static void close_sync(struct r1conf *conf)
 {
-       wait_barrier(conf, NULL);
-       allow_barrier(conf, 0, 0);
+       wait_all_barriers(conf);
+       allow_all_barriers(conf);
 
        mempool_destroy(conf->r1buf_pool);
        conf->r1buf_pool = NULL;
-
-       spin_lock_irq(&conf->resync_lock);
-       conf->next_resync = MaxSector - 2 * NEXT_NORMALIO_DISTANCE;
-       conf->start_next_window = MaxSector;
-       conf->current_window_requests +=
-               conf->next_window_requests;
-       conf->next_window_requests = 0;
-       spin_unlock_irq(&conf->resync_lock);
 }
 
 static int raid1_spare_active(struct mddev *mddev)
@@ -2273,7 +2380,8 @@ static int narrow_write_error(struct r1bio *r1_bio, int i)
 
                        wbio->bi_vcnt = vcnt;
                } else {
-                       wbio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
+                       wbio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO,
+                                             mddev->bio_set);
                }
 
                bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
@@ -2323,8 +2431,9 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
 
 static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
 {
-       int m;
+       int m, idx;
        bool fail = false;
+
        for (m = 0; m < conf->raid_disks * 2 ; m++)
                if (r1_bio->bios[m] == IO_MADE_GOOD) {
                        struct md_rdev *rdev = conf->mirrors[m].rdev;
@@ -2350,8 +2459,14 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
        if (fail) {
                spin_lock_irq(&conf->device_lock);
                list_add(&r1_bio->retry_list, &conf->bio_end_io_list);
-               conf->nr_queued++;
+               idx = sector_to_idx(r1_bio->sector);
+               atomic_inc(&conf->nr_queued[idx]);
                spin_unlock_irq(&conf->device_lock);
+               /*
+                * In case freeze_array() is waiting for condition
+                * get_unqueued_pending() == extra to be true.
+                */
+               wake_up(&conf->wait_barrier);
                md_wakeup_thread(conf->mddev->thread);
        } else {
                if (test_bit(R1BIO_WriteError, &r1_bio->state))
@@ -2411,7 +2526,8 @@ read_more:
                const unsigned long do_sync
                        = r1_bio->master_bio->bi_opf & REQ_SYNC;
                r1_bio->read_disk = disk;
-               bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
+               bio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO,
+                                    mddev->bio_set);
                bio_trim(bio, r1_bio->sector - bio->bi_iter.bi_sector,
                         max_sectors);
                r1_bio->bios[r1_bio->read_disk] = bio;
@@ -2445,15 +2561,8 @@ read_more:
                        generic_make_request(bio);
                        bio = NULL;
 
-                       r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
-
-                       r1_bio->master_bio = mbio;
-                       r1_bio->sectors = bio_sectors(mbio) - sectors_handled;
-                       r1_bio->state = 0;
+                       r1_bio = alloc_r1bio(mddev, mbio, sectors_handled);
                        set_bit(R1BIO_ReadError, &r1_bio->state);
-                       r1_bio->mddev = mddev;
-                       r1_bio->sector = mbio->bi_iter.bi_sector +
-                               sectors_handled;
 
                        goto read_more;
                } else {
@@ -2472,6 +2581,7 @@ static void raid1d(struct md_thread *thread)
        struct r1conf *conf = mddev->private;
        struct list_head *head = &conf->retry_list;
        struct blk_plug plug;
+       int idx;
 
        md_check_recovery(mddev);
 
@@ -2479,17 +2589,15 @@ static void raid1d(struct md_thread *thread)
            !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
                LIST_HEAD(tmp);
                spin_lock_irqsave(&conf->device_lock, flags);
-               if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
-                       while (!list_empty(&conf->bio_end_io_list)) {
-                               list_move(conf->bio_end_io_list.prev, &tmp);
-                               conf->nr_queued--;
-                       }
-               }
+               if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
+                       list_splice_init(&conf->bio_end_io_list, &tmp);
                spin_unlock_irqrestore(&conf->device_lock, flags);
                while (!list_empty(&tmp)) {
                        r1_bio = list_first_entry(&tmp, struct r1bio,
                                                  retry_list);
                        list_del(&r1_bio->retry_list);
+                       idx = sector_to_idx(r1_bio->sector);
+                       atomic_dec(&conf->nr_queued[idx]);
                        if (mddev->degraded)
                                set_bit(R1BIO_Degraded, &r1_bio->state);
                        if (test_bit(R1BIO_WriteError, &r1_bio->state))
@@ -2510,7 +2618,8 @@ static void raid1d(struct md_thread *thread)
                }
                r1_bio = list_entry(head->prev, struct r1bio, retry_list);
                list_del(head->prev);
-               conf->nr_queued--;
+               idx = sector_to_idx(r1_bio->sector);
+               atomic_dec(&conf->nr_queued[idx]);
                spin_unlock_irqrestore(&conf->device_lock, flags);
 
                mddev = r1_bio->mddev;
@@ -2549,7 +2658,6 @@ static int init_resync(struct r1conf *conf)
                                          conf->poolinfo);
        if (!conf->r1buf_pool)
                return -ENOMEM;
-       conf->next_resync = 0;
        return 0;
 }
 
@@ -2578,6 +2686,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
        int still_degraded = 0;
        int good_sectors = RESYNC_SECTORS;
        int min_bad = 0; /* number of sectors that are bad in all devices */
+       int idx = sector_to_idx(sector_nr);
 
        if (!conf->r1buf_pool)
                if (init_resync(conf))
@@ -2627,7 +2736,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
         * If there is non-resync activity waiting for a turn, then let it
         * through before starting on this new sync request.
         */
-       if (conf->nr_waiting)
+       if (atomic_read(&conf->nr_waiting[idx]))
                schedule_timeout_uninterruptible(1);
 
        /* we are incrementing sector_nr below. To be safe, we check against
@@ -2654,6 +2763,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
        r1_bio->sector = sector_nr;
        r1_bio->state = 0;
        set_bit(R1BIO_IsSync, &r1_bio->state);
+       /* make sure good_sectors won't go across barrier unit boundary */
+       good_sectors = align_to_barrier_unit_end(sector_nr, good_sectors);
 
        for (i = 0; i < conf->raid_disks * 2; i++) {
                struct md_rdev *rdev;
@@ -2884,6 +2995,26 @@ static struct r1conf *setup_conf(struct mddev *mddev)
        if (!conf)
                goto abort;
 
+       conf->nr_pending = kcalloc(BARRIER_BUCKETS_NR,
+                                  sizeof(atomic_t), GFP_KERNEL);
+       if (!conf->nr_pending)
+               goto abort;
+
+       conf->nr_waiting = kcalloc(BARRIER_BUCKETS_NR,
+                                  sizeof(atomic_t), GFP_KERNEL);
+       if (!conf->nr_waiting)
+               goto abort;
+
+       conf->nr_queued = kcalloc(BARRIER_BUCKETS_NR,
+                                 sizeof(atomic_t), GFP_KERNEL);
+       if (!conf->nr_queued)
+               goto abort;
+
+       conf->barrier = kcalloc(BARRIER_BUCKETS_NR,
+                               sizeof(atomic_t), GFP_KERNEL);
+       if (!conf->barrier)
+               goto abort;
+
        conf->mirrors = kzalloc(sizeof(struct raid1_info)
                                * mddev->raid_disks * 2,
                                 GFP_KERNEL);
@@ -2939,9 +3070,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
        conf->pending_count = 0;
        conf->recovery_disabled = mddev->recovery_disabled - 1;
 
-       conf->start_next_window = MaxSector;
-       conf->current_window_requests = conf->next_window_requests = 0;
-
        err = -EIO;
        for (i = 0; i < conf->raid_disks * 2; i++) {
 
@@ -2984,6 +3112,10 @@ static struct r1conf *setup_conf(struct mddev *mddev)
                kfree(conf->mirrors);
                safe_put_page(conf->tmppage);
                kfree(conf->poolinfo);
+               kfree(conf->nr_pending);
+               kfree(conf->nr_waiting);
+               kfree(conf->nr_queued);
+               kfree(conf->barrier);
                kfree(conf);
        }
        return ERR_PTR(err);
@@ -3085,6 +3217,10 @@ static void raid1_free(struct mddev *mddev, void *priv)
        kfree(conf->mirrors);
        safe_put_page(conf->tmppage);
        kfree(conf->poolinfo);
+       kfree(conf->nr_pending);
+       kfree(conf->nr_waiting);
+       kfree(conf->nr_queued);
+       kfree(conf->barrier);
        kfree(conf);
 }
 
index c52ef424a24b2313949971143a162c959e8f068d..dd22a37d0d8332e12785b9c270445aba09cce576 100644 (file)
@@ -1,6 +1,30 @@
 #ifndef _RAID1_H
 #define _RAID1_H
 
+/*
+ * each barrier unit size is 64MB for now
+ * note: it must be larger than RESYNC_DEPTH
+ */
+#define BARRIER_UNIT_SECTOR_BITS       17
+#define BARRIER_UNIT_SECTOR_SIZE       (1<<17)
+/*
+ * In struct r1conf, the following members are related to I/O barrier
+ * buckets,
+ *     atomic_t        *nr_pending;
+ *     atomic_t        *nr_waiting;
+ *     atomic_t        *nr_queued;
+ *     atomic_t        *barrier;
+ * Each of them points to an array of atomic_t variables; each array has
+ * BARRIER_BUCKETS_NR elements and occupies a single memory page. The data
+ * width of atomic_t is 4 bytes, i.e. 1<<(ilog2(sizeof(atomic_t))), so
+ * BARRIER_BUCKETS_NR_BITS is defined as (PAGE_SHIFT - ilog2(sizeof(atomic_t)))
+ * to make sure an array of atomic_t variables with BARRIER_BUCKETS_NR
+ * elements exactly occupies a single memory page.
+ */
+#define BARRIER_BUCKETS_NR_BITS                (PAGE_SHIFT - ilog2(sizeof(atomic_t)))
+#define BARRIER_BUCKETS_NR             (1<<BARRIER_BUCKETS_NR_BITS)
+
 struct raid1_info {
        struct md_rdev  *rdev;
        sector_t        head_position;
@@ -35,25 +59,6 @@ struct r1conf {
                                                 */
        int                     raid_disks;
 
-       /* During resync, read_balancing is only allowed on the part
-        * of the array that has been resynced.  'next_resync' tells us
-        * where that is.
-        */
-       sector_t                next_resync;
-
-       /* When raid1 starts resync, we divide array into four partitions
-        * |---------|--------------|---------------------|-------------|
-        *        next_resync   start_next_window       end_window
-        * start_next_window = next_resync + NEXT_NORMALIO_DISTANCE
-        * end_window = start_next_window + NEXT_NORMALIO_DISTANCE
-        * current_window_requests means the count of normalIO between
-        *   start_next_window and end_window.
-        * next_window_requests means the count of normalIO after end_window.
-        * */
-       sector_t                start_next_window;
-       int                     current_window_requests;
-       int                     next_window_requests;
-
        spinlock_t              device_lock;
 
        /* list of 'struct r1bio' that need to be processed by raid1d,
@@ -79,10 +84,10 @@ struct r1conf {
         */
        wait_queue_head_t       wait_barrier;
        spinlock_t              resync_lock;
-       int                     nr_pending;
-       int                     nr_waiting;
-       int                     nr_queued;
-       int                     barrier;
+       atomic_t                *nr_pending;
+       atomic_t                *nr_waiting;
+       atomic_t                *nr_queued;
+       atomic_t                *barrier;
        int                     array_frozen;
 
        /* Set to 1 if a full sync is needed, (fresh device added).
@@ -135,7 +140,6 @@ struct r1bio {
                                                 * in this BehindIO request
                                                 */
        sector_t                sector;
-       sector_t                start_next_window;
        int                     sectors;
        unsigned long           state;
        struct mddev            *mddev;
@@ -185,4 +189,10 @@ enum r1bio_state {
        R1BIO_WriteError,
        R1BIO_FailFast,
 };
+
+static inline int sector_to_idx(sector_t sector)
+{
+       return hash_long(sector >> BARRIER_UNIT_SECTOR_BITS,
+                        BARRIER_BUCKETS_NR_BITS);
+}
 #endif
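
For illustration, a minimal user-space sketch of this bucket mapping (not kernel code): hash_long() is approximated here with the kernel's 64-bit golden-ratio multiplicative constant, and PAGE_SHIFT is assumed to be 12, which gives 1024 buckets over 64MB barrier units.

#include <stdio.h>
#include <stdint.h>

#define BARRIER_UNIT_SECTOR_BITS 17              /* 64MB in 512-byte sectors */
#define BARRIER_BUCKETS_NR_BITS  (12 - 2)        /* PAGE_SHIFT - ilog2(sizeof(atomic_t)) */

static unsigned int sector_to_idx(uint64_t sector)
{
        uint64_t unit = sector >> BARRIER_UNIT_SECTOR_BITS;

        /* same constant as the kernel's GOLDEN_RATIO_64 used by hash_long() */
        return (unsigned int)((unit * 0x61C8864680B583EBULL) >>
                              (64 - BARRIER_BUCKETS_NR_BITS));
}

int main(void)
{
        uint64_t sectors[] = { 0, 1 << 17, 1 << 20, 123456789 };
        int i;

        for (i = 0; i < 4; i++)
                printf("sector %llu -> bucket %u\n",
                       (unsigned long long)sectors[i], sector_to_idx(sectors[i]));
        return 0;
}

Sectors inside the same 64MB barrier unit always hash to the same bucket, so raising or waiting on a barrier only contends with I/O that maps to that bucket.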
index 6bc5c2a85160e2654050716ef9270c1de3e903a3..063c43d83b72c2f0f753edb7b08f8dd608fa15ad 100644 (file)
@@ -1132,7 +1132,7 @@ read_again:
        }
        slot = r10_bio->read_slot;
 
-       read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+       read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
        bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector,
                 max_sectors);
 
@@ -1406,7 +1406,7 @@ retry_write:
                int d = r10_bio->devs[i].devnum;
                if (r10_bio->devs[i].bio) {
                        struct md_rdev *rdev = conf->mirrors[d].rdev;
-                       mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+                       mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
                        bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
                                 max_sectors);
                        r10_bio->devs[i].bio = mbio;
@@ -1457,7 +1457,7 @@ retry_write:
                                smp_mb();
                                rdev = conf->mirrors[d].rdev;
                        }
-                       mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+                       mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
                        bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
                                 max_sectors);
                        r10_bio->devs[i].repl_bio = mbio;
@@ -2565,7 +2565,7 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
                if (sectors > sect_to_write)
                        sectors = sect_to_write;
                /* Write at 'sector' for 'sectors' */
-               wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+               wbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
                bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors);
                wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector);
                wbio->bi_iter.bi_sector = wsector +
@@ -2641,8 +2641,7 @@ read_more:
                           mdname(mddev),
                           bdevname(rdev->bdev, b),
                           (unsigned long long)r10_bio->sector);
-       bio = bio_clone_mddev(r10_bio->master_bio,
-                             GFP_NOIO, mddev);
+       bio = bio_clone_fast(r10_bio->master_bio, GFP_NOIO, mddev->bio_set);
        bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors);
        r10_bio->devs[slot].bio = bio;
        r10_bio->devs[slot].rdev = rdev;
index 302dea3296ba5ccd07740365314f45d74df49ec2..3f307be01b10cc70eb7b08bc31b9a2a3717372b8 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/crc32c.h>
 #include <linux/random.h>
 #include <linux/kthread.h>
+#include <linux/types.h>
 #include "md.h"
 #include "raid5.h"
 #include "bitmap.h"
@@ -164,8 +165,59 @@ struct r5l_log {
        struct work_struct deferred_io_work;
        /* to disable write back while in degraded mode */
        struct work_struct disable_writeback_work;
+
+       /* for chunk_aligned_read in writeback mode, details below */
+       spinlock_t tree_lock;
+       struct radix_tree_root big_stripe_tree;
 };
 
+/*
+ * Enable chunk_aligned_read() with write back cache.
+ *
+ * Each chunk may contain more than one stripe (for example, a 256kB
+ * chunk contains 64 4kB pages, so this chunk contains 64 stripes). For
+ * chunk_aligned_read, these stripes are grouped into one "big_stripe".
+ * For each big_stripe, we count how many stripes of this big_stripe
+ * are in the write back cache. These counts are tracked in a radix tree
+ * (big_stripe_tree). We use the radix_tree item pointer as the counter.
+ * r5c_tree_index() is used to calculate keys for the radix tree.
+ *
+ * chunk_aligned_read() calls r5c_big_stripe_cached() to look up the
+ * big_stripe of each chunk in the tree. If this big_stripe is in the
+ * tree, chunk_aligned_read() aborts. This lookup is protected by
+ * rcu_read_lock().
+ *
+ * It is necessary to remember whether a stripe is counted in
+ * big_stripe_tree. Instead of adding a new flag, we reuse existing flags:
+ * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE. If either of these
+ * two flags is set, the stripe is counted in big_stripe_tree. This
+ * requires moving set_bit(STRIPE_R5C_PARTIAL_STRIPE) into
+ * r5c_try_caching_write(), and moving the clear_bit of
+ * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE into
+ * r5c_finish_stripe_write_out().
+ */
+
+/*
+ * The radix tree requires the lowest 2 bits of the data pointer to be 2b'00,
+ * so it is necessary to left shift the counter by 2 bits before using it
+ * as the data pointer of the tree.
+ */
+#define R5C_RADIX_COUNT_SHIFT 2
+
+/*
+ * calculate key for big_stripe_tree
+ *
+ * sect: align_bi->bi_iter.bi_sector or sh->sector
+ */
+static inline sector_t r5c_tree_index(struct r5conf *conf,
+                                     sector_t sect)
+{
+       sector_t offset;
+
+       offset = sector_div(sect, conf->chunk_sectors);
+       return sect;
+}
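
A toy user-space sketch of the counter packing described above (the kernel manipulates the slot under tree_lock with radix_tree_lookup_slot()/radix_tree_replace_slot(), as in r5c_try_caching_write() further down; this only shows the shift arithmetic):

#include <stdio.h>
#include <stdint.h>

#define R5C_RADIX_COUNT_SHIFT 2

static void *count_to_item(uintptr_t count)
{
        /* low 2 bits stay zero, so the value is a legal radix-tree item */
        return (void *)(count << R5C_RADIX_COUNT_SHIFT);
}

static uintptr_t item_to_count(void *item)
{
        return (uintptr_t)item >> R5C_RADIX_COUNT_SHIFT;
}

int main(void)
{
        void *item;

        item = count_to_item(1);                        /* first cached stripe */
        item = count_to_item(item_to_count(item) + 1);  /* second cached stripe */

        printf("stripes cached in this big_stripe: %lu\n",
               (unsigned long)item_to_count(item));     /* prints 2 */
        return 0;
}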
+
 /*
  * an IO range starts from a meta data block and end at the next meta data
  * block. The io unit's the meta data block tracks data/parity followed it. io
@@ -337,17 +389,30 @@ void r5c_check_cached_full_stripe(struct r5conf *conf)
 /*
  * Total log space (in sectors) needed to flush all data in cache
  *
- * Currently, writing-out phase automatically includes all pending writes
- * to the same sector. So the reclaim of each stripe takes up to
- * (conf->raid_disks + 1) pages of log space.
+ * To avoid deadlock due to log space, it is necessary to reserve log
+ * space to flush critical stripes (stripes occupying log space near
+ * last_checkpoint). This function helps check how much log space is
+ * required to flush all cached stripes.
  *
- * To totally avoid deadlock due to log space, the code reserves
- * (conf->raid_disks + 1) pages for each stripe in cache, which is not
- * necessary in most cases.
+ * To reduce log space requirements, two mechanisms are used to give cache
+ * flush higher priorities:
+ *    1. In handle_stripe_dirtying() and schedule_reconstruction(),
+ *       stripes ALREADY in journal can be flushed w/o pending writes;
+ *    2. In r5l_write_stripe() and r5c_cache_data(), stripes NOT in journal
+ *       can be delayed (r5l_add_no_space_stripe).
  *
- * To improve this, we will need writing-out phase to be able to NOT include
- * pending writes, which will reduce the requirement to
- * (conf->max_degraded + 1) pages per stripe in cache.
+ * In cache flush, the stripe goes through 1 and then 2. For a stripe that
+ * already passed 1, flushing it requires at most (conf->max_degraded + 1)
+ * pages of journal space. For a stripe that has not passed 1, flushing it
+ * requires (conf->raid_disks + 1) pages of journal space. There are at
+ * most (conf->group_cnt + 1) stripes that have passed 1. So the total
+ * journal space required to flush all cached stripes (in pages) is:
+ *
+ *     (stripe_in_journal_count - group_cnt - 1) * (max_degraded + 1) +
+ *     (group_cnt + 1) * (raid_disks + 1)
+ * or
+ *     (stripe_in_journal_count) * (max_degraded + 1) +
+ *     (group_cnt + 1) * (raid_disks - max_degraded)
  */
 static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
 {
@@ -356,8 +421,9 @@ static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
        if (!r5c_is_writeback(log))
                return 0;
 
-       return BLOCK_SECTORS * (conf->raid_disks + 1) *
-               atomic_read(&log->stripe_in_journal_count);
+       return BLOCK_SECTORS *
+               ((conf->max_degraded + 1) * atomic_read(&log->stripe_in_journal_count) +
+                (conf->raid_disks - conf->max_degraded) * (conf->group_cnt + 1));
 }
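
As a worked example with made-up numbers (8 disks, RAID-6 so max_degraded = 2, a single worker group, 100 stripes in the journal, 4kB pages so BLOCK_SECTORS = 8), the new bound is considerably smaller than the old (raid_disks + 1) * stripe_in_journal_count reserve:

#include <stdio.h>

int main(void)
{
        int raid_disks = 8, max_degraded = 2, group_cnt = 0;
        int in_journal = 100;
        int block_sectors = 8;          /* 4kB page = 8 x 512-byte sectors */

        long old_bound = (long)block_sectors * (raid_disks + 1) * in_journal;
        long new_bound = (long)block_sectors *
                ((max_degraded + 1) * in_journal +
                 (raid_disks - max_degraded) * (group_cnt + 1));

        /* prints: old reserve 7200 sectors, new reserve 2448 sectors */
        printf("old reserve %ld sectors, new reserve %ld sectors\n",
               old_bound, new_bound);
        return 0;
}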
 
 /*
@@ -412,16 +478,6 @@ void r5c_make_stripe_write_out(struct stripe_head *sh)
 
        if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
                atomic_inc(&conf->preread_active_stripes);
-
-       if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
-               BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
-               atomic_dec(&conf->r5c_cached_partial_stripes);
-       }
-
-       if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
-               BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
-               atomic_dec(&conf->r5c_cached_full_stripes);
-       }
 }
 
 static void r5c_handle_data_cached(struct stripe_head *sh)
@@ -1271,6 +1327,10 @@ static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
        atomic_inc(&conf->active_stripes);
        r5c_make_stripe_write_out(sh);
 
+       if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
+               atomic_inc(&conf->r5c_flushing_partial_stripes);
+       else
+               atomic_inc(&conf->r5c_flushing_full_stripes);
        raid5_release_stripe(sh);
 }
 
@@ -1313,12 +1373,16 @@ static void r5c_do_reclaim(struct r5conf *conf)
        unsigned long flags;
        int total_cached;
        int stripes_to_flush;
+       int flushing_partial, flushing_full;
 
        if (!r5c_is_writeback(log))
                return;
 
+       flushing_partial = atomic_read(&conf->r5c_flushing_partial_stripes);
+       flushing_full = atomic_read(&conf->r5c_flushing_full_stripes);
        total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
-               atomic_read(&conf->r5c_cached_full_stripes);
+               atomic_read(&conf->r5c_cached_full_stripes) -
+               flushing_full - flushing_partial;
 
        if (total_cached > conf->min_nr_stripes * 3 / 4 ||
            atomic_read(&conf->empty_inactive_list_nr) > 0)
@@ -1328,7 +1392,7 @@ static void r5c_do_reclaim(struct r5conf *conf)
                 */
                stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP;
        else if (total_cached > conf->min_nr_stripes * 1 / 2 ||
-                atomic_read(&conf->r5c_cached_full_stripes) >
+                atomic_read(&conf->r5c_cached_full_stripes) - flushing_full >
                 R5C_FULL_STRIPE_FLUSH_BATCH)
                /*
                 * if stripe cache pressure is moderate, or if there are many full
@@ -1362,9 +1426,9 @@ static void r5c_do_reclaim(struct r5conf *conf)
                            !test_bit(STRIPE_HANDLE, &sh->state) &&
                            atomic_read(&sh->count) == 0) {
                                r5c_flush_stripe(conf, sh);
+                               if (count++ >= R5C_RECLAIM_STRIPE_GROUP)
+                                       break;
                        }
-                       if (count++ >= R5C_RECLAIM_STRIPE_GROUP)
-                               break;
                }
                spin_unlock(&conf->device_lock);
                spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
@@ -2320,6 +2384,10 @@ int r5c_try_caching_write(struct r5conf *conf,
        int i;
        struct r5dev *dev;
        int to_cache = 0;
+       void **pslot;
+       sector_t tree_index;
+       int ret;
+       uintptr_t refcount;
 
        BUG_ON(!r5c_is_writeback(log));
 
@@ -2364,6 +2432,44 @@ int r5c_try_caching_write(struct r5conf *conf,
                }
        }
 
+       /* if the stripe is not counted in big_stripe_tree, add it now */
+       if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
+           !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
+               tree_index = r5c_tree_index(conf, sh->sector);
+               spin_lock(&log->tree_lock);
+               pslot = radix_tree_lookup_slot(&log->big_stripe_tree,
+                                              tree_index);
+               if (pslot) {
+                       refcount = (uintptr_t)radix_tree_deref_slot_protected(
+                               pslot, &log->tree_lock) >>
+                               R5C_RADIX_COUNT_SHIFT;
+                       radix_tree_replace_slot(
+                               &log->big_stripe_tree, pslot,
+                               (void *)((refcount + 1) << R5C_RADIX_COUNT_SHIFT));
+               } else {
+                       /*
+                        * this radix_tree_insert can fail safely, so no
+                        * need to call radix_tree_preload()
+                        */
+                       ret = radix_tree_insert(
+                               &log->big_stripe_tree, tree_index,
+                               (void *)(1 << R5C_RADIX_COUNT_SHIFT));
+                       if (ret) {
+                               spin_unlock(&log->tree_lock);
+                               r5c_make_stripe_write_out(sh);
+                               return -EAGAIN;
+                       }
+               }
+               spin_unlock(&log->tree_lock);
+
+               /*
+                * Set STRIPE_R5C_PARTIAL_STRIPE to show that the stripe is
+                * counted in the radix tree.
+                */
+               set_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state);
+               atomic_inc(&conf->r5c_cached_partial_stripes);
+       }
+
        for (i = disks; i--; ) {
                dev = &sh->dev[i];
                if (dev->towrite) {
@@ -2438,17 +2544,20 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
                                 struct stripe_head *sh,
                                 struct stripe_head_state *s)
 {
+       struct r5l_log *log = conf->log;
        int i;
        int do_wakeup = 0;
+       sector_t tree_index;
+       void **pslot;
+       uintptr_t refcount;
 
-       if (!conf->log ||
-           !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
+       if (!log || !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
                return;
 
        WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
        clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
 
-       if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
+       if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
                return;
 
        for (i = sh->disks; i--; ) {
@@ -2470,12 +2579,45 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
        if (do_wakeup)
                wake_up(&conf->wait_for_overlap);
 
-       spin_lock_irq(&conf->log->stripe_in_journal_lock);
+       spin_lock_irq(&log->stripe_in_journal_lock);
        list_del_init(&sh->r5c);
-       spin_unlock_irq(&conf->log->stripe_in_journal_lock);
+       spin_unlock_irq(&log->stripe_in_journal_lock);
        sh->log_start = MaxSector;
-       atomic_dec(&conf->log->stripe_in_journal_count);
-       r5c_update_log_state(conf->log);
+
+       atomic_dec(&log->stripe_in_journal_count);
+       r5c_update_log_state(log);
+
+       /* stop counting this stripe in big_stripe_tree */
+       if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) ||
+           test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
+               tree_index = r5c_tree_index(conf, sh->sector);
+               spin_lock(&log->tree_lock);
+               pslot = radix_tree_lookup_slot(&log->big_stripe_tree,
+                                              tree_index);
+               BUG_ON(pslot == NULL);
+               refcount = (uintptr_t)radix_tree_deref_slot_protected(
+                       pslot, &log->tree_lock) >>
+                       R5C_RADIX_COUNT_SHIFT;
+               if (refcount == 1)
+                       radix_tree_delete(&log->big_stripe_tree, tree_index);
+               else
+                       radix_tree_replace_slot(
+                               &log->big_stripe_tree, pslot,
+                               (void *)((refcount - 1) << R5C_RADIX_COUNT_SHIFT));
+               spin_unlock(&log->tree_lock);
+       }
+
+       if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
+               BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
+               atomic_dec(&conf->r5c_flushing_partial_stripes);
+               atomic_dec(&conf->r5c_cached_partial_stripes);
+       }
+
+       if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
+               BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
+               atomic_dec(&conf->r5c_flushing_full_stripes);
+               atomic_dec(&conf->r5c_cached_full_stripes);
+       }
 }
 
 int
@@ -2535,6 +2677,22 @@ r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
        return 0;
 }
 
+/* check whether this big stripe is in write back cache. */
+bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect)
+{
+       struct r5l_log *log = conf->log;
+       sector_t tree_index;
+       void *slot;
+
+       if (!log)
+               return false;
+
+       WARN_ON_ONCE(!rcu_read_lock_held());
+       tree_index = r5c_tree_index(conf, sect);
+       slot = radix_tree_lookup(&log->big_stripe_tree, tree_index);
+       return slot != NULL;
+}
+
 static int r5l_load_log(struct r5l_log *log)
 {
        struct md_rdev *rdev = log->rdev;
@@ -2681,6 +2839,9 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
        if (!log->meta_pool)
                goto out_mempool;
 
+       spin_lock_init(&log->tree_lock);
+       INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT | __GFP_NOWARN);
+
        log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
                                                 log->rdev->mddev, "reclaim");
        if (!log->reclaim_thread)
index 6214e699342c87d7cdcb83e385530dff808fa918..2ce23b01dbb21da6ae17664df085c37c0a63e157 100644 (file)
@@ -281,13 +281,13 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
                                                atomic_dec(&conf->r5c_cached_partial_stripes);
                                        list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
                                        r5c_check_cached_full_stripe(conf);
-                               } else {
-                                       /* partial stripe */
-                                       if (!test_and_set_bit(STRIPE_R5C_PARTIAL_STRIPE,
-                                                             &sh->state))
-                                               atomic_inc(&conf->r5c_cached_partial_stripes);
+                               } else
+                                       /*
+                                        * STRIPE_R5C_PARTIAL_STRIPE is set in
+                                        * r5c_try_caching_write(). No need to
+                                        * set it again.
+                                        */
                                        list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list);
-                               }
                        }
                }
        }
@@ -353,17 +353,15 @@ static void release_inactive_stripe_list(struct r5conf *conf,
 static int release_stripe_list(struct r5conf *conf,
                               struct list_head *temp_inactive_list)
 {
-       struct stripe_head *sh;
+       struct stripe_head *sh, *t;
        int count = 0;
        struct llist_node *head;
 
        head = llist_del_all(&conf->released_stripes);
        head = llist_reverse_order(head);
-       while (head) {
+       llist_for_each_entry_safe(sh, t, head, release_list) {
                int hash;
 
-               sh = llist_entry(head, struct stripe_head, release_list);
-               head = llist_next(head);
                /* sh could be re-added after STRIPE_ON_RELEASE_LIST is cleared */
                smp_mb();
                clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state);
@@ -863,6 +861,43 @@ static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
        return 1;
 }
 
+static void flush_deferred_bios(struct r5conf *conf)
+{
+       struct bio_list tmp;
+       struct bio *bio;
+
+       if (!conf->batch_bio_dispatch || !conf->group_cnt)
+               return;
+
+       bio_list_init(&tmp);
+       spin_lock(&conf->pending_bios_lock);
+       bio_list_merge(&tmp, &conf->pending_bios);
+       bio_list_init(&conf->pending_bios);
+       spin_unlock(&conf->pending_bios_lock);
+
+       while ((bio = bio_list_pop(&tmp)))
+               generic_make_request(bio);
+}
+
+static void defer_bio_issue(struct r5conf *conf, struct bio *bio)
+{
+       /*
+        * Changing group_cnt will drain all bios, so this is safe.
+        *
+        * A read generally means a read-modify-write, which usually means a
+        * random write, so we don't delay it.
+        */
+       if (!conf->batch_bio_dispatch || !conf->group_cnt ||
+           bio_op(bio) == REQ_OP_READ) {
+               generic_make_request(bio);
+               return;
+       }
+       spin_lock(&conf->pending_bios_lock);
+       bio_list_add(&conf->pending_bios, bio);
+       spin_unlock(&conf->pending_bios_lock);
+       md_wakeup_thread(conf->mddev->thread);
+}
+
 static void
 raid5_end_read_request(struct bio *bi);
 static void
@@ -1043,7 +1078,7 @@ again:
                                trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
                                                      bi, disk_devt(conf->mddev->gendisk),
                                                      sh->dev[i].sector);
-                       generic_make_request(bi);
+                       defer_bio_issue(conf, bi);
                }
                if (rrdev) {
                        if (s->syncing || s->expanding || s->expanded
@@ -1088,7 +1123,7 @@ again:
                                trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
                                                      rbi, disk_devt(conf->mddev->gendisk),
                                                      sh->dev[i].sector);
-                       generic_make_request(rbi);
+                       defer_bio_issue(conf, rbi);
                }
                if (!rdev && !rrdev) {
                        if (op_is_write(op))
@@ -2914,12 +2949,36 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
  *      like to flush data in journal to RAID disks first, so complex rmw
  *      is handled in the write path (handle_stripe_dirtying).
  *
+ *   2. when journal space is critical (R5C_LOG_CRITICAL=1)
+ *
+ *      It is important to be able to flush all stripes in raid5-cache.
+ *      Therefore, we need reserve some space on the journal device for
+ *      Therefore, we need to reserve some space on the journal device for
+ *      these flushes. If the flush operation includes pending writes to the
+ *      stripe, we need to reserve (conf->raid_disks + 1) pages per stripe
+ *      for the flush out. If we exclude these pending writes from the flush
+ *      operation, we only need (conf->max_degraded + 1) pages per stripe.
+ *      efficient use of the journal device.
+ *
+ *      Note: To make sure the stripe makes progress, we only delay
+ *      towrite for stripes with data already in journal (injournal > 0).
+ *      When LOG_CRITICAL, stripes with injournal == 0 will be sent to
+ *      no_space_stripes list.
+ *
  */
-static inline bool delay_towrite(struct r5dev *dev,
-                                  struct stripe_head_state *s)
+static inline bool delay_towrite(struct r5conf *conf,
+                                struct r5dev *dev,
+                                struct stripe_head_state *s)
 {
-       return !test_bit(R5_OVERWRITE, &dev->flags) &&
-               !test_bit(R5_Insync, &dev->flags) && s->injournal;
+       /* case 1 above */
+       if (!test_bit(R5_OVERWRITE, &dev->flags) &&
+           !test_bit(R5_Insync, &dev->flags) && s->injournal)
+               return true;
+       /* case 2 above */
+       if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
+           s->injournal > 0)
+               return true;
+       return false;
 }
 
 static void
@@ -2942,7 +3001,7 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
                for (i = disks; i--; ) {
                        struct r5dev *dev = &sh->dev[i];
 
-                       if (dev->towrite && !delay_towrite(dev, s)) {
+                       if (dev->towrite && !delay_towrite(conf, dev, s)) {
                                set_bit(R5_LOCKED, &dev->flags);
                                set_bit(R5_Wantdrain, &dev->flags);
                                if (!expand)
@@ -3694,7 +3753,7 @@ static int handle_stripe_dirtying(struct r5conf *conf,
        } else for (i = disks; i--; ) {
                /* would I have to read this buffer for read_modify_write */
                struct r5dev *dev = &sh->dev[i];
-               if (((dev->towrite && !delay_towrite(dev, s)) ||
+               if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
                     i == sh->pd_idx || i == sh->qd_idx ||
                     test_bit(R5_InJournal, &dev->flags)) &&
                    !test_bit(R5_LOCKED, &dev->flags) &&
@@ -3718,8 +3777,8 @@ static int handle_stripe_dirtying(struct r5conf *conf,
                }
        }
 
-       pr_debug("for sector %llu, rmw=%d rcw=%d\n",
-               (unsigned long long)sh->sector, rmw, rcw);
+       pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n",
+                (unsigned long long)sh->sector, sh->state, rmw, rcw);
        set_bit(STRIPE_HANDLE, &sh->state);
        if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) {
                /* prefer read-modify-write, but need to get some data */
@@ -3759,7 +3818,7 @@ static int handle_stripe_dirtying(struct r5conf *conf,
 
                for (i = disks; i--; ) {
                        struct r5dev *dev = &sh->dev[i];
-                       if (((dev->towrite && !delay_towrite(dev, s)) ||
+                       if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
                             i == sh->pd_idx || i == sh->qd_idx ||
                             test_bit(R5_InJournal, &dev->flags)) &&
                            !test_bit(R5_LOCKED, &dev->flags) &&
@@ -4995,9 +5054,9 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
                return 0;
        }
        /*
-        * use bio_clone_mddev to make a copy of the bio
+        * use bio_clone_fast to make a copy of the bio
         */
-       align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev);
+       align_bi = bio_clone_fast(raid_bio, GFP_NOIO, mddev->bio_set);
        if (!align_bi)
                return 0;
        /*
@@ -5025,6 +5084,13 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
                      rdev->recovery_offset >= end_sector)))
                        rdev = NULL;
        }
+
+       if (r5c_big_stripe_cached(conf, align_bi->bi_iter.bi_sector)) {
+               rcu_read_unlock();
+               bio_put(align_bi);
+               return 0;
+       }
+
        if (rdev) {
                sector_t first_bad;
                int bad_sectors;
@@ -5381,7 +5447,6 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
         * data on failed drives.
         */
        if (rw == READ && mddev->degraded == 0 &&
-           !r5c_is_writeback(conf->log) &&
            mddev->reshape_position == MaxSector) {
                bi = chunk_aligned_read(mddev, bi);
                if (!bi)
@@ -6126,6 +6191,8 @@ static void raid5d(struct md_thread *thread)
                mutex_unlock(&conf->cache_size_mutex);
        }
 
+       flush_deferred_bios(conf);
+
        r5l_flush_stripe_to_raid(conf->log);
 
        async_tx_issue_pending_all();
@@ -6711,6 +6778,18 @@ static struct r5conf *setup_conf(struct mddev *mddev)
        atomic_set(&conf->active_stripes, 0);
        atomic_set(&conf->preread_active_stripes, 0);
        atomic_set(&conf->active_aligned_reads, 0);
+       bio_list_init(&conf->pending_bios);
+       spin_lock_init(&conf->pending_bios_lock);
+       conf->batch_bio_dispatch = true;
+       rdev_for_each(rdev, mddev) {
+               if (test_bit(Journal, &rdev->flags))
+                       continue;
+               if (blk_queue_nonrot(bdev_get_queue(rdev->bdev))) {
+                       conf->batch_bio_dispatch = false;
+                       break;
+               }
+       }
+
        conf->bypass_threshold = BYPASS_THRESHOLD;
        conf->recovery_disabled = mddev->recovery_disabled - 1;
 
@@ -6757,6 +6836,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
        INIT_LIST_HEAD(&conf->r5c_full_stripe_list);
        atomic_set(&conf->r5c_cached_partial_stripes, 0);
        INIT_LIST_HEAD(&conf->r5c_partial_stripe_list);
+       atomic_set(&conf->r5c_flushing_full_stripes, 0);
+       atomic_set(&conf->r5c_flushing_partial_stripes, 0);
 
        conf->level = mddev->new_level;
        conf->chunk_sectors = mddev->new_chunk_sectors;
index 1440fa26e29629c4f9acc098f0fa9035f5ff1d1a..4bb27b97bf6bc48f6362461592edab19f1024140 100644 (file)
@@ -663,6 +663,8 @@ struct r5conf {
        struct list_head        r5c_full_stripe_list;
        atomic_t                r5c_cached_partial_stripes;
        struct list_head        r5c_partial_stripe_list;
+       atomic_t                r5c_flushing_full_stripes;
+       atomic_t                r5c_flushing_partial_stripes;
 
        atomic_t                empty_inactive_list_nr;
        struct llist_head       released_stripes;
@@ -684,6 +686,10 @@ struct r5conf {
        int                     group_cnt;
        int                     worker_cnt_per_group;
        struct r5l_log          *log;
+
+       struct bio_list         pending_bios;
+       spinlock_t              pending_bios_lock;
+       bool                    batch_bio_dispatch;
 };
 
 
@@ -788,4 +794,5 @@ extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
 extern void r5c_check_cached_full_stripe(struct r5conf *conf);
 extern struct md_sysfs_entry r5c_journal_mode;
 extern void r5c_update_on_rdev_error(struct mddev *mddev);
+extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);
 #endif
index 1620a5d2757d38ceea4662fe1c885fdccd201948..0889fc81ce9e47e4c63731588e5732123a8aabf6 100644 (file)
@@ -2671,7 +2671,7 @@ static int mac80211_hwsim_new_radio(struct genl_info *info,
 
        tasklet_hrtimer_init(&data->beacon_timer,
                             mac80211_hwsim_beacon,
-                            CLOCK_MONOTONIC_RAW, HRTIMER_MODE_ABS);
+                            CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
 
        spin_lock_bh(&hwsim_radio_lock);
        list_add_tail(&data->list, &hwsim_radios);
index 44a1a257e0b598738765ab7001e8ec862a00cdc0..25ec4e58522058f70a302ad02811abccf5cb4e1a 100644 (file)
@@ -26,6 +26,7 @@
 #include <linux/ptrace.h>
 #include <linux/nvme_ioctl.h>
 #include <linux/t10-pi.h>
+#include <linux/pm_qos.h>
 #include <scsi/sg.h>
 #include <asm/unaligned.h>
 
@@ -56,6 +57,11 @@ EXPORT_SYMBOL_GPL(nvme_max_retries);
 static int nvme_char_major;
 module_param(nvme_char_major, int, 0);
 
+static unsigned long default_ps_max_latency_us = 25000;
+module_param(default_ps_max_latency_us, ulong, 0644);
+MODULE_PARM_DESC(default_ps_max_latency_us,
+                "max power saving latency for new devices; use PM QOS to change per device");
+
 static LIST_HEAD(nvme_ctrl_list);
 static DEFINE_SPINLOCK(dev_list_lock);
 
@@ -560,7 +566,7 @@ int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
 
        /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
        c.identify.opcode = nvme_admin_identify;
-       c.identify.cns = cpu_to_le32(NVME_ID_CNS_CTRL);
+       c.identify.cns = NVME_ID_CNS_CTRL;
 
        *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
        if (!*id)
@@ -578,7 +584,7 @@ static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *n
        struct nvme_command c = { };
 
        c.identify.opcode = nvme_admin_identify;
-       c.identify.cns = cpu_to_le32(NVME_ID_CNS_NS_ACTIVE_LIST);
+       c.identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST;
        c.identify.nsid = cpu_to_le32(nsid);
        return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, 0x1000);
 }
@@ -590,8 +596,9 @@ int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid,
        int error;
 
        /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
-       c.identify.opcode = nvme_admin_identify,
-       c.identify.nsid = cpu_to_le32(nsid),
+       c.identify.opcode = nvme_admin_identify;
+       c.identify.nsid = cpu_to_le32(nsid);
+       c.identify.cns = NVME_ID_CNS_NS;
 
        *id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL);
        if (!*id)
@@ -1251,6 +1258,176 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
        blk_queue_write_cache(q, vwc, vwc);
 }
 
+static void nvme_configure_apst(struct nvme_ctrl *ctrl)
+{
+       /*
+        * APST (Autonomous Power State Transition) lets us program a
+        * table of power state transitions that the controller will
+        * perform automatically.  We configure it with a simple
+        * heuristic: we are willing to spend at most 2% of the time
+        * transitioning between power states.  Therefore, when running
+        * in any given state, we will enter the next lower-power
+        * non-operational state after waiting 50 * (enlat + exlat)
+        * microseconds, as long as that state's total latency is under
+        * the requested maximum latency.
+        *
+        * We will not autonomously enter any non-operational state for
+        * which the total latency exceeds ps_max_latency_us.  Users
+        * can set ps_max_latency_us to zero to turn off APST.
+        */
+
+       unsigned apste;
+       struct nvme_feat_auto_pst *table;
+       int ret;
+
+       /*
+        * If APST isn't supported or if we haven't been initialized yet,
+        * then don't do anything.
+        */
+       if (!ctrl->apsta)
+               return;
+
+       if (ctrl->npss > 31) {
+               dev_warn(ctrl->device, "NPSS is invalid; not using APST\n");
+               return;
+       }
+
+       table = kzalloc(sizeof(*table), GFP_KERNEL);
+       if (!table)
+               return;
+
+       if (ctrl->ps_max_latency_us == 0) {
+               /* Turn off APST. */
+               apste = 0;
+       } else {
+               __le64 target = cpu_to_le64(0);
+               int state;
+
+               /*
+                * Walk through all states from lowest- to highest-power.
+                * According to the spec, lower-numbered states use more
+                * power.  NPSS, despite the name, is the index of the
+                * lowest-power state, not the number of states.
+                */
+               for (state = (int)ctrl->npss; state >= 0; state--) {
+                       u64 total_latency_us, transition_ms;
+
+                       if (target)
+                               table->entries[state] = target;
+
+                       /*
+                        * Is this state a useful non-operational state for
+                        * higher-power states to autonomously transition to?
+                        */
+                       if (!(ctrl->psd[state].flags &
+                             NVME_PS_FLAGS_NON_OP_STATE))
+                               continue;
+
+                       total_latency_us =
+                               (u64)le32_to_cpu(ctrl->psd[state].entry_lat) +
+                               le32_to_cpu(ctrl->psd[state].exit_lat);
+                       if (total_latency_us > ctrl->ps_max_latency_us)
+                               continue;
+
+                       /*
+                        * This state is good.  Use it as the APST idle
+                        * target for higher power states.
+                        */
+                       transition_ms = total_latency_us + 19;
+                       do_div(transition_ms, 20);
+                       if (transition_ms > (1 << 24) - 1)
+                               transition_ms = (1 << 24) - 1;
+
+                       target = cpu_to_le64((state << 3) |
+                                            (transition_ms << 8));
+               }
+
+               apste = 1;
+       }
+
+       ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste,
+                               table, sizeof(*table), NULL);
+       if (ret)
+               dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret);
+
+       kfree(table);
+}
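
As a worked example with made-up latencies, the arithmetic above produces one APST table entry as follows; the encoding packs the target power state into bits 7:3 and the idle time (in milliseconds) into bits 31:8 of the 64-bit entry:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        unsigned state = 4;                     /* hypothetical non-operational state */
        uint64_t enlat_us = 4000, exlat_us = 6000;
        uint64_t total_latency_us = enlat_us + exlat_us;  /* must stay <= ps_max_latency_us */
        uint64_t transition_ms, entry;

        /* same rounding and 24-bit cap as nvme_configure_apst() above */
        transition_ms = (total_latency_us + 19) / 20;
        if (transition_ms > (1 << 24) - 1)
                transition_ms = (1 << 24) - 1;

        entry = ((uint64_t)state << 3) | (transition_ms << 8);

        /* prints: state 4, idle 500 ms, entry 0x1f420 */
        printf("state %u, idle %llu ms, entry 0x%llx\n",
               state, (unsigned long long)transition_ms,
               (unsigned long long)entry);
        return 0;
}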
+
+static void nvme_set_latency_tolerance(struct device *dev, s32 val)
+{
+       struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+       u64 latency;
+
+       switch (val) {
+       case PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT:
+       case PM_QOS_LATENCY_ANY:
+               latency = U64_MAX;
+               break;
+
+       default:
+               latency = val;
+       }
+
+       if (ctrl->ps_max_latency_us != latency) {
+               ctrl->ps_max_latency_us = latency;
+               nvme_configure_apst(ctrl);
+       }
+}
+
+struct nvme_core_quirk_entry {
+       /*
+        * NVMe model and firmware strings are padded with spaces.  For
+        * simplicity, strings in the quirk table are padded with NULLs
+        * instead.
+        */
+       u16 vid;
+       const char *mn;
+       const char *fr;
+       unsigned long quirks;
+};
+
+static const struct nvme_core_quirk_entry core_quirks[] = {
+       /*
+        * Seen on a Samsung "SM951 NVMe SAMSUNG 256GB": using APST causes
+        * the controller to go out to lunch.  It dies when the watchdog
+        * timer reads CSTS and gets 0xffffffff.
+        */
+       {
+               .vid = 0x144d,
+               .fr = "BXW75D0Q",
+               .quirks = NVME_QUIRK_NO_APST,
+       },
+};
+
+/* match is null-terminated but idstr is space-padded. */
+static bool string_matches(const char *idstr, const char *match, size_t len)
+{
+       size_t matchlen;
+
+       if (!match)
+               return true;
+
+       matchlen = strlen(match);
+       WARN_ON_ONCE(matchlen > len);
+
+       if (memcmp(idstr, match, matchlen))
+               return false;
+
+       for (; matchlen < len; matchlen++)
+               if (idstr[matchlen] != ' ')
+                       return false;
+
+       return true;
+}
+
+static bool quirk_matches(const struct nvme_id_ctrl *id,
+                         const struct nvme_core_quirk_entry *q)
+{
+       return q->vid == le16_to_cpu(id->vid) &&
+               string_matches(id->mn, q->mn, sizeof(id->mn)) &&
+               string_matches(id->fr, q->fr, sizeof(id->fr));
+}
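
A stand-alone illustration of the space-padded comparison (the model string used here is hypothetical, and the kernel's WARN_ON_ONCE is dropped):

#include <stdio.h>
#include <string.h>
#include <stdbool.h>

/* user-space copy of string_matches() above */
static bool string_matches(const char *idstr, const char *match, size_t len)
{
        size_t matchlen;

        if (!match)
                return true;

        matchlen = strlen(match);
        if (memcmp(idstr, match, matchlen))
                return false;

        for (; matchlen < len; matchlen++)
                if (idstr[matchlen] != ' ')
                        return false;

        return true;
}

int main(void)
{
        char fr[8] = "BXW75D0Q";                /* exactly fills the 8-byte field */
        char mn[16] = "SM951 PRO      ";        /* space-padded model string */

        /* prints "1 0": the second call fails because "PRO" follows the match */
        printf("%d %d\n",
               (int)string_matches(fr, "BXW75D0Q", sizeof(fr)),
               (int)string_matches(mn, "SM951", sizeof(mn)));
        return 0;
}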
+
 /*
  * Initialize the cached copies of the Identify data and various controller
  * register in our nvme_ctrl structure.  This should be called as soon as
@@ -1262,6 +1439,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
        u64 cap;
        int ret, page_shift;
        u32 max_hw_sectors;
+       u8 prev_apsta;
 
        ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
        if (ret) {
@@ -1285,6 +1463,24 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
                return -EIO;
        }
 
+       if (!ctrl->identified) {
+               /*
+                * Check for quirks.  Quirks can depend on firmware version,
+                * so, in principle, the set of quirks present can change
+                * across a reset.  As a possible future enhancement, we
+                * could re-scan for quirks every time we reinitialize
+                * the device, but we'd have to make sure that the driver
+                * behaves intelligently if the quirks change.
+                */
+
+               int i;
+
+               for (i = 0; i < ARRAY_SIZE(core_quirks); i++) {
+                       if (quirk_matches(id, &core_quirks[i]))
+                               ctrl->quirks |= core_quirks[i].quirks;
+               }
+       }
+
        ctrl->oacs = le16_to_cpu(id->oacs);
        ctrl->vid = le16_to_cpu(id->vid);
        ctrl->oncs = le16_to_cpup(&id->oncs);
@@ -1305,6 +1501,11 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
        ctrl->sgls = le32_to_cpu(id->sgls);
        ctrl->kas = le16_to_cpu(id->kas);
 
+       ctrl->npss = id->npss;
+       prev_apsta = ctrl->apsta;
+       ctrl->apsta = (ctrl->quirks & NVME_QUIRK_NO_APST) ? 0 : id->apsta;
+       memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd));
+
        if (ctrl->ops->is_fabrics) {
                ctrl->icdoff = le16_to_cpu(id->icdoff);
                ctrl->ioccsz = le32_to_cpu(id->ioccsz);
@@ -1328,6 +1529,16 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
        }
 
        kfree(id);
+
+       if (ctrl->apsta && !prev_apsta)
+               dev_pm_qos_expose_latency_tolerance(ctrl->device);
+       else if (!ctrl->apsta && prev_apsta)
+               dev_pm_qos_hide_latency_tolerance(ctrl->device);
+
+       nvme_configure_apst(ctrl);
+
+       ctrl->identified = true;
+
        return ret;
 }
 EXPORT_SYMBOL_GPL(nvme_init_identify);
@@ -1577,6 +1788,29 @@ static ssize_t nvme_sysfs_show_transport(struct device *dev,
 }
 static DEVICE_ATTR(transport, S_IRUGO, nvme_sysfs_show_transport, NULL);
 
+static ssize_t nvme_sysfs_show_state(struct device *dev,
+                                    struct device_attribute *attr,
+                                    char *buf)
+{
+       struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+       static const char *const state_name[] = {
+               [NVME_CTRL_NEW]         = "new",
+               [NVME_CTRL_LIVE]        = "live",
+               [NVME_CTRL_RESETTING]   = "resetting",
+               [NVME_CTRL_RECONNECTING]= "reconnecting",
+               [NVME_CTRL_DELETING]    = "deleting",
+               [NVME_CTRL_DEAD]        = "dead",
+       };
+
+       if ((unsigned)ctrl->state < ARRAY_SIZE(state_name) &&
+           state_name[ctrl->state])
+               return sprintf(buf, "%s\n", state_name[ctrl->state]);
+
+       return sprintf(buf, "unknown state\n");
+}
+
+static DEVICE_ATTR(state, S_IRUGO, nvme_sysfs_show_state, NULL);
+
 static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev,
                                         struct device_attribute *attr,
                                         char *buf)
@@ -1609,6 +1843,7 @@ static struct attribute *nvme_dev_attrs[] = {
        &dev_attr_transport.attr,
        &dev_attr_subsysnqn.attr,
        &dev_attr_address.attr,
+       &dev_attr_state.attr,
        NULL
 };
 
@@ -2065,6 +2300,14 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
        list_add_tail(&ctrl->node, &nvme_ctrl_list);
        spin_unlock(&dev_list_lock);
 
+       /*
+        * Initialize latency tolerance controls.  The sysfs files won't
+        * be visible to userspace unless the device actually supports APST.
+        */
+       ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance;
+       dev_pm_qos_update_user_latency_tolerance(ctrl->device,
+               min(default_ps_max_latency_us, (unsigned long)S32_MAX));
+
        return 0;
 out_release_instance:
        nvme_release_instance(ctrl);
@@ -2090,9 +2333,9 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl)
                 * Revalidating a dead namespace sets capacity to 0. This will
                 * end buffered writers dirtying pages that can't be synced.
                 */
-               if (ns->disk && !test_and_set_bit(NVME_NS_DEAD, &ns->flags))
-                       revalidate_disk(ns->disk);
-
+               if (!ns->disk || test_and_set_bit(NVME_NS_DEAD, &ns->flags))
+                       continue;
+               revalidate_disk(ns->disk);
                blk_set_queue_dying(ns->queue);
                blk_mq_abort_requeue_list(ns->queue);
                blk_mq_start_stopped_hw_queues(ns->queue, true);
index 916d1360805964cbe6a095576ba2049af52384f8..5b7386f69f4de5571112bcc504134c8d99744793 100644 (file)
@@ -480,11 +480,16 @@ EXPORT_SYMBOL_GPL(nvmf_connect_io_queue);
  * being implemented to the common NVMe fabrics library. Part of
  * the overall init sequence of starting up a fabrics driver.
  */
-void nvmf_register_transport(struct nvmf_transport_ops *ops)
+int nvmf_register_transport(struct nvmf_transport_ops *ops)
 {
+       if (!ops->create_ctrl)
+               return -EINVAL;
+
        mutex_lock(&nvmf_transports_mutex);
        list_add_tail(&ops->entry, &nvmf_transports);
        mutex_unlock(&nvmf_transports_mutex);
+
+       return 0;
 }
 EXPORT_SYMBOL_GPL(nvmf_register_transport);
 
index 924145c979f136e167c72448b5998df15479a670..156018182ce43bbf70fe34fa1ff71b6df2e4456b 100644 (file)
@@ -128,7 +128,7 @@ int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val);
 int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val);
 int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl);
 int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid);
-void nvmf_register_transport(struct nvmf_transport_ops *ops);
+int nvmf_register_transport(struct nvmf_transport_ops *ops);
 void nvmf_unregister_transport(struct nvmf_transport_ops *ops);
 void nvmf_free_options(struct nvmf_ctrl_options *opts);
 const char *nvmf_get_subsysnqn(struct nvme_ctrl *ctrl);
index fb51a8de9b29a770c93a34dfca7b3264e8dd0a6a..9690beb15e69ab47bb04345da5f142ec56141035 100644 (file)
@@ -2353,18 +2353,6 @@ __nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
 
        /* sanity checks */
 
-       /* FC-NVME supports 64-byte SQE only */
-       if (ctrl->ctrl.ioccsz != 4) {
-               dev_err(ctrl->ctrl.device, "ioccsz %d is not supported!\n",
-                               ctrl->ctrl.ioccsz);
-               goto out_remove_admin_queue;
-       }
-       /* FC-NVME supports 16-byte CQE only */
-       if (ctrl->ctrl.iorcsz != 1) {
-               dev_err(ctrl->ctrl.device, "iorcsz %d is not supported!\n",
-                               ctrl->ctrl.iorcsz);
-               goto out_remove_admin_queue;
-       }
        /* FC-NVME does not have other data in the capsule */
        if (ctrl->ctrl.icdoff) {
                dev_err(ctrl->ctrl.device, "icdoff %d is not supported!\n",
@@ -2562,8 +2550,7 @@ static int __init nvme_fc_init_module(void)
        if (!nvme_fc_wq)
                return -ENOMEM;
 
-       nvmf_register_transport(&nvme_fc_transport);
-       return 0;
+       return nvmf_register_transport(&nvme_fc_transport);
 }
 
 static void __exit nvme_fc_exit_module(void)
index 14cfc6f7facb240a96630a637ccc60bd45911302..a3da1e90b99dbf1bb04177379c65567c992c2dfd 100644 (file)
@@ -78,6 +78,11 @@ enum nvme_quirks {
         * readiness, which is done by reading the NVME_CSTS_RDY bit.
         */
        NVME_QUIRK_DELAY_BEFORE_CHK_RDY         = (1 << 3),
+
+       /*
+        * APST should not be used.
+        */
+       NVME_QUIRK_NO_APST                      = (1 << 4),
 };
 
 /*
@@ -112,6 +117,7 @@ enum nvme_ctrl_state {
 
 struct nvme_ctrl {
        enum nvme_ctrl_state state;
+       bool identified;
        spinlock_t lock;
        const struct nvme_ctrl_ops *ops;
        struct request_queue *admin_q;
@@ -147,13 +153,19 @@ struct nvme_ctrl {
        u32 vs;
        u32 sgls;
        u16 kas;
+       u8 npss;
+       u8 apsta;
        unsigned int kato;
        bool subsystem;
        unsigned long quirks;
+       struct nvme_id_power_state psd[32];
        struct work_struct scan_work;
        struct work_struct async_event_work;
        struct delayed_work ka_work;
 
+       /* Power saving configuration */
+       u64 ps_max_latency_us;
+
        /* Fabrics only */
        u16 sqsize;
        u32 ioccsz;
index ddc51adb594d0ba3df2e800b9247e2b3cb161847..57a1af52b06e6674a0a3c84564cb31257db37c6e 100644 (file)
@@ -613,10 +613,7 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 
        spin_lock_irq(&nvmeq->q_lock);
        if (unlikely(nvmeq->cq_vector < 0)) {
-               if (ns && !test_bit(NVME_NS_DEAD, &ns->flags))
-                       ret = BLK_MQ_RQ_QUEUE_BUSY;
-               else
-                       ret = BLK_MQ_RQ_QUEUE_ERROR;
+               ret = BLK_MQ_RQ_QUEUE_ERROR;
                spin_unlock_irq(&nvmeq->q_lock);
                goto out_cleanup_iod;
        }
@@ -1739,7 +1736,7 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
        if (dev->ctrl.admin_q)
                blk_put_queue(dev->ctrl.admin_q);
        kfree(dev->queues);
-       kfree(dev->ctrl.opal_dev);
+       free_opal_dev(dev->ctrl.opal_dev);
        kfree(dev);
 }
 
@@ -1789,14 +1786,17 @@ static void nvme_reset_work(struct work_struct *work)
        if (result)
                goto out;
 
-       if ((dev->ctrl.oacs & NVME_CTRL_OACS_SEC_SUPP) && !dev->ctrl.opal_dev) {
-               dev->ctrl.opal_dev =
-                       init_opal_dev(&dev->ctrl, &nvme_sec_submit);
+       if (dev->ctrl.oacs & NVME_CTRL_OACS_SEC_SUPP) {
+               if (!dev->ctrl.opal_dev)
+                       dev->ctrl.opal_dev =
+                               init_opal_dev(&dev->ctrl, &nvme_sec_submit);
+               else if (was_suspend)
+                       opal_unlock_from_suspend(dev->ctrl.opal_dev);
+       } else {
+               free_opal_dev(dev->ctrl.opal_dev);
+               dev->ctrl.opal_dev = NULL;
        }
 
-       if (was_suspend)
-               opal_unlock_from_suspend(dev->ctrl.opal_dev);
-
        result = nvme_setup_io_queues(dev);
        if (result)
                goto out;
@@ -2001,8 +2001,10 @@ static void nvme_remove(struct pci_dev *pdev)
 
        pci_set_drvdata(pdev, NULL);
 
-       if (!pci_device_is_present(pdev))
+       if (!pci_device_is_present(pdev)) {
                nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
+               nvme_dev_disable(dev, false);
+       }
 
        flush_work(&dev->reset_work);
        nvme_uninit_ctrl(&dev->ctrl);
@@ -2121,6 +2123,7 @@ static const struct pci_device_id nvme_id_table[] = {
                .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
        { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
        { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
+       { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
        { 0, }
 };
 MODULE_DEVICE_TABLE(pci, nvme_id_table);
index a75e95d42b3febf5edba65c4ba7ed5181aa08c20..49b2121af689d37509ad965fa2c7a8851b48fefe 100644 (file)
 
 #define NVME_RDMA_MAX_INLINE_SEGMENTS  1
 
-static const char *const nvme_rdma_cm_status_strs[] = {
-       [NVME_RDMA_CM_INVALID_LEN]      = "invalid length",
-       [NVME_RDMA_CM_INVALID_RECFMT]   = "invalid record format",
-       [NVME_RDMA_CM_INVALID_QID]      = "invalid queue ID",
-       [NVME_RDMA_CM_INVALID_HSQSIZE]  = "invalid host SQ size",
-       [NVME_RDMA_CM_INVALID_HRQSIZE]  = "invalid host RQ size",
-       [NVME_RDMA_CM_NO_RSC]           = "resource not found",
-       [NVME_RDMA_CM_INVALID_IRD]      = "invalid IRD",
-       [NVME_RDMA_CM_INVALID_ORD]      = "Invalid ORD",
-};
-
-static const char *nvme_rdma_cm_msg(enum nvme_rdma_cm_status status)
-{
-       size_t index = status;
-
-       if (index < ARRAY_SIZE(nvme_rdma_cm_status_strs) &&
-           nvme_rdma_cm_status_strs[index])
-               return nvme_rdma_cm_status_strs[index];
-       else
-               return "unrecognized reason";
-};
-
 /*
  * We handle AEN commands ourselves and don't even let the
  * block layer know about them.
@@ -155,6 +133,10 @@ struct nvme_rdma_ctrl {
                struct sockaddr addr;
                struct sockaddr_in addr_in;
        };
+       union {
+               struct sockaddr src_addr;
+               struct sockaddr_in src_addr_in;
+       };
 
        struct nvme_ctrl        ctrl;
 };
@@ -567,6 +549,7 @@ static int nvme_rdma_init_queue(struct nvme_rdma_ctrl *ctrl,
                int idx, size_t queue_size)
 {
        struct nvme_rdma_queue *queue;
+       struct sockaddr *src_addr = NULL;
        int ret;
 
        queue = &ctrl->queues[idx];
@@ -589,7 +572,10 @@ static int nvme_rdma_init_queue(struct nvme_rdma_ctrl *ctrl,
        }
 
        queue->cm_error = -ETIMEDOUT;
-       ret = rdma_resolve_addr(queue->cm_id, NULL, &ctrl->addr,
+       if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR)
+               src_addr = &ctrl->src_addr;
+
+       ret = rdma_resolve_addr(queue->cm_id, src_addr, &ctrl->addr,
                        NVME_RDMA_CONNECT_TIMEOUT_MS);
        if (ret) {
                dev_info(ctrl->ctrl.device,
@@ -1905,6 +1891,16 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
                goto out_free_ctrl;
        }
 
+       if (opts->mask & NVMF_OPT_HOST_TRADDR) {
+               ret = nvme_rdma_parse_ipaddr(&ctrl->src_addr_in,
+                               opts->host_traddr);
+               if (ret) {
+                       pr_err("malformed src IP address passed: %s\n",
+                              opts->host_traddr);
+                       goto out_free_ctrl;
+               }
+       }
+
        if (opts->mask & NVMF_OPT_TRSVCID) {
                u16 port;
 
@@ -2016,7 +2012,8 @@ out_free_ctrl:
 static struct nvmf_transport_ops nvme_rdma_transport = {
        .name           = "rdma",
        .required_opts  = NVMF_OPT_TRADDR,
-       .allowed_opts   = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY,
+       .allowed_opts   = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
+                         NVMF_OPT_HOST_TRADDR,
        .create_ctrl    = nvme_rdma_create_ctrl,
 };
 
@@ -2063,8 +2060,7 @@ static int __init nvme_rdma_init_module(void)
                return ret;
        }
 
-       nvmf_register_transport(&nvme_rdma_transport);
-       return 0;
+       return nvmf_register_transport(&nvme_rdma_transport);
 }
 
 static void __exit nvme_rdma_cleanup_module(void)
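With NVMF_OPT_HOST_TRADDR now accepted, the RDMA transport can bind a connection to a chosen source address. A hedged sketch of how that option might be handed to the fabrics control device: the option keys mirror the flag names used in the hunks above (transport=, traddr=, trsvcid=, host_traddr=, nqn=), while the addresses, the NQN and the error handling are placeholders, not a definitive tool:

    /* Illustration only: write a comma-separated option string,
     * including host_traddr, to /dev/nvme-fabrics. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            const char *opts =
                    "transport=rdma,traddr=192.168.1.20,trsvcid=4420,"
                    "host_traddr=192.168.1.10,nqn=nqn.2017-02.example:subsys1";
            int fd = open("/dev/nvme-fabrics", O_RDWR);

            if (fd < 0) {
                    perror("open /dev/nvme-fabrics");
                    return 1;
            }
            if (write(fd, opts, strlen(opts)) < 0)
                    perror("connect");
            close(fd);
            return 0;
    }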
index 95ae52390478fe62fdb59605ee2c7a6d0583a919..94e524fea5687b8de8ebf68d6469676cae4fa08d 100644 (file)
@@ -41,7 +41,7 @@ static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req,
        ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->get_log_page.nsid);
        if (!ns) {
                status = NVME_SC_INVALID_NS;
-               pr_err("nvmet : Counld not find namespace id : %d\n",
+               pr_err("nvmet : Could not find namespace id : %d\n",
                                le32_to_cpu(req->cmd->get_log_page.nsid));
                goto out;
        }
@@ -509,7 +509,7 @@ int nvmet_parse_admin_cmd(struct nvmet_req *req)
                break;
        case nvme_admin_identify:
                req->data_len = 4096;
-               switch (le32_to_cpu(cmd->identify.cns)) {
+               switch (cmd->identify.cns) {
                case NVME_ID_CNS_NS:
                        req->execute = nvmet_execute_identify_ns;
                        return 0;
index fc5ba2f9e15f47fe8bd13795bdb9d6caaa532b93..5267ce20c12d48b062d84bf7d43ad73585694214 100644 (file)
@@ -17,6 +17,7 @@
 #include "nvmet.h"
 
 static struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX];
+static DEFINE_IDA(cntlid_ida);
 
 /*
  * This read/write semaphore is used to synchronize access to configuration
@@ -749,7 +750,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
        if (!ctrl->sqs)
                goto out_free_cqs;
 
-       ret = ida_simple_get(&subsys->cntlid_ida,
+       ret = ida_simple_get(&cntlid_ida,
                             NVME_CNTLID_MIN, NVME_CNTLID_MAX,
                             GFP_KERNEL);
        if (ret < 0) {
@@ -819,7 +820,7 @@ static void nvmet_ctrl_free(struct kref *ref)
        flush_work(&ctrl->async_event_work);
        cancel_work_sync(&ctrl->fatal_err_work);
 
-       ida_simple_remove(&subsys->cntlid_ida, ctrl->cntlid);
+       ida_simple_remove(&cntlid_ida, ctrl->cntlid);
        nvmet_subsys_put(subsys);
 
        kfree(ctrl->sqs);
@@ -918,9 +919,6 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
        mutex_init(&subsys->lock);
        INIT_LIST_HEAD(&subsys->namespaces);
        INIT_LIST_HEAD(&subsys->ctrls);
-
-       ida_init(&subsys->cntlid_ida);
-
        INIT_LIST_HEAD(&subsys->hosts);
 
        return subsys;
@@ -933,7 +931,6 @@ static void nvmet_subsys_free(struct kref *ref)
 
        WARN_ON_ONCE(!list_empty(&subsys->namespaces));
 
-       ida_destroy(&subsys->cntlid_ida);
        kfree(subsys->subsysnqn);
        kfree(subsys);
 }
@@ -976,6 +973,7 @@ static void __exit nvmet_exit(void)
 {
        nvmet_exit_configfs();
        nvmet_exit_discovery();
+       ida_destroy(&cntlid_ida);
 
        BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_entry) != 1024);
        BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_hdr) != 1024);
index 12f39eea569f2fb33cec45884c188d0ad8ae2493..af8aabf0533504971bef38fbdd02c3943b805a5f 100644 (file)
@@ -186,14 +186,14 @@ int nvmet_parse_discovery_cmd(struct nvmet_req *req)
                }
        case nvme_admin_identify:
                req->data_len = 4096;
-               switch (le32_to_cpu(cmd->identify.cns)) {
+               switch (cmd->identify.cns) {
                case NVME_ID_CNS_CTRL:
                        req->execute =
                                nvmet_execute_identify_disc_ctrl;
                        return 0;
                default:
                        pr_err("nvmet: unsupported identify cns %d\n",
-                               le32_to_cpu(cmd->identify.cns));
+                               cmd->identify.cns);
                        return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
                }
        default:
index f4088198cd0d0a15b8b3da63f698612f568f2bb3..8bd022af3df6741ed1b08f10bb7b6dce40fe6925 100644 (file)
@@ -153,8 +153,8 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req)
                goto out;
        }
 
-       pr_info("creating controller %d for NQN %s.\n",
-                       ctrl->cntlid, ctrl->hostnqn);
+       pr_info("creating controller %d for subsystem %s for NQN %s.\n",
+               ctrl->cntlid, ctrl->subsys->subsysnqn, ctrl->hostnqn);
        req->rsp->result.u16 = cpu_to_le16(ctrl->cntlid);
 
 out:
@@ -220,7 +220,7 @@ int nvmet_parse_connect_cmd(struct nvmet_req *req)
 
        req->ns = NULL;
 
-       if (req->cmd->common.opcode != nvme_fabrics_command) {
+       if (cmd->common.opcode != nvme_fabrics_command) {
                pr_err("invalid command 0x%x on unconnected queue.\n",
                        cmd->fabrics.opcode);
                return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
index ba57f9852bde33b0ff3d0655d4c08313632a3a8f..8f483ee7868c56bdc55e174f226f368609b32159 100644 (file)
@@ -1817,16 +1817,14 @@ nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq)
                /* data no longer needed */
                nvmet_fc_free_tgt_pgs(fod);
 
-               if (fcpreq->fcp_error || abort)
-                       nvmet_req_complete(&fod->req, fcpreq->fcp_error);
-
+               nvmet_req_complete(&fod->req, fcpreq->fcp_error);
                return;
        }
 
        switch (fcpreq->op) {
 
        case NVMET_FCOP_WRITEDATA:
-               if (abort || fcpreq->fcp_error ||
+               if (fcpreq->fcp_error ||
                    fcpreq->transferred_length != fcpreq->transfer_length) {
                        nvmet_req_complete(&fod->req,
                                        NVME_SC_FC_TRANSPORT_ERROR);
@@ -1849,7 +1847,7 @@ nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq)
 
        case NVMET_FCOP_READDATA:
        case NVMET_FCOP_READDATA_RSP:
-               if (abort || fcpreq->fcp_error ||
+               if (fcpreq->fcp_error ||
                    fcpreq->transferred_length != fcpreq->transfer_length) {
                        /* data no longer needed */
                        nvmet_fc_free_tgt_pgs(fod);
index f3862e38f5748d8e21b4a55574cac30c7cf2054a..d1f06e7768ff1d7ff6ee787ff6d94eb01576252f 100644 (file)
@@ -724,8 +724,7 @@ static int __init nvme_loop_init_module(void)
        ret = nvmet_register_transport(&nvme_loop_ops);
        if (ret)
                return ret;
-       nvmf_register_transport(&nvme_loop_transport);
-       return 0;
+       return nvmf_register_transport(&nvme_loop_transport);
 }
 
 static void __exit nvme_loop_cleanup_module(void)
index cc7ad06b43a78a029dd76fac575c3f6ee57c9e92..1370eee0a3c0f6295722d22e0c103a2f6cece47b 100644 (file)
@@ -142,7 +142,6 @@ struct nvmet_subsys {
        unsigned int            max_nsid;
 
        struct list_head        ctrls;
-       struct ida              cntlid_ida;
 
        struct list_head        hosts;
        bool                    allow_any_host;
index 60990220bd831074bc3c8fdbd044ee3aed37a2db..9aa1da3778b3ac1d2262bfe9b845b65b9cd942d9 100644 (file)
@@ -1041,6 +1041,9 @@ static int nvmet_rdma_cm_reject(struct rdma_cm_id *cm_id,
 {
        struct nvme_rdma_cm_rej rej;
 
+       pr_debug("rejecting connect request: status %d (%s)\n",
+                status, nvme_rdma_cm_msg(status));
+
        rej.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
        rej.sts = cpu_to_le16(status);
 
@@ -1091,7 +1094,7 @@ nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev,
        queue->idx = ida_simple_get(&nvmet_rdma_queue_ida, 0, 0, GFP_KERNEL);
        if (queue->idx < 0) {
                ret = NVME_RDMA_CM_NO_RSC;
-               goto out_free_queue;
+               goto out_destroy_sq;
        }
 
        ret = nvmet_rdma_alloc_rsps(queue);
@@ -1135,7 +1138,6 @@ out_destroy_sq:
 out_free_queue:
        kfree(queue);
 out_reject:
-       pr_debug("rejecting connect request with status code %d\n", ret);
        nvmet_rdma_cm_reject(cm_id, ret);
        return NULL;
 }
@@ -1188,7 +1190,6 @@ static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
 
        ndev = nvmet_rdma_find_get_device(cm_id);
        if (!ndev) {
-               pr_err("no client data!\n");
                nvmet_rdma_cm_reject(cm_id, NVME_RDMA_CM_NO_RSC);
                return -ECONNREFUSED;
        }
index 912fbc3b4543dd5e87b04341862293d46fd57f30..3e32dc954c3c8c6b05d5883615afc1b21864e401 100644 (file)
@@ -1167,7 +1167,7 @@ void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd)
 
        /* zero out the cmd, except for the embedded scsi_request */
        memset((char *)cmd + sizeof(cmd->req), 0,
-               sizeof(*cmd) - sizeof(cmd->req));
+               sizeof(*cmd) - sizeof(cmd->req) + dev->host->hostt->cmd_size);
 
        cmd->device = dev;
        cmd->sense_buffer = buf;
index 126a5ee00987ee14e21553b035620bff88302252..f94535130a344580884c350bb18baa1a9eee4cbe 100644 (file)
@@ -227,27 +227,31 @@ static int sas_bsg_initialize(struct Scsi_Host *shost, struct sas_rphy *rphy)
                return 0;
        }
 
+       q = blk_alloc_queue(GFP_KERNEL);
+       if (!q)
+               return -ENOMEM;
+       q->cmd_size = sizeof(struct scsi_request);
+
        if (rphy) {
-               q = blk_init_queue(sas_non_host_smp_request, NULL);
+               q->request_fn = sas_non_host_smp_request;
                dev = &rphy->dev;
                name = dev_name(dev);
                release = NULL;
        } else {
-               q = blk_init_queue(sas_host_smp_request, NULL);
+               q->request_fn = sas_host_smp_request;
                dev = &shost->shost_gendev;
                snprintf(namebuf, sizeof(namebuf),
                         "sas_host%d", shost->host_no);
                name = namebuf;
                release = sas_host_release;
        }
-       if (!q)
-               return -ENOMEM;
+       error = blk_init_allocated_queue(q);
+       if (error)
+               goto out_cleanup_queue;
 
        error = bsg_register_queue(q, dev, name, release);
-       if (error) {
-               blk_cleanup_queue(q);
-               return -ENOMEM;
-       }
+       if (error)
+               goto out_cleanup_queue;
 
        if (rphy)
                rphy->q = q;
@@ -261,6 +265,10 @@ static int sas_bsg_initialize(struct Scsi_Host *shost, struct sas_rphy *rphy)
 
        queue_flag_set_unlocked(QUEUE_FLAG_BIDI, q);
        return 0;
+
+out_cleanup_queue:
+       blk_cleanup_queue(q);
+       return error;
 }
 
 static void sas_bsg_remove(struct Scsi_Host *shost, struct sas_rphy *rphy)
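The reworked error paths above converge on a single out_cleanup_queue label instead of unwinding in place. A minimal userspace illustration of that single-exit cleanup idiom, with ordinary malloc/fopen standing in for the block-layer queue:

    #include <stdio.h>
    #include <stdlib.h>

    /* Every failure after the first allocation jumps to one cleanup label. */
    static int setup(const char *path)
    {
            char *buf = malloc(4096);
            FILE *f;
            int error;

            if (!buf)
                    return -1;

            f = fopen(path, "r");
            if (!f) {
                    error = -1;
                    goto out_free_buf;
            }

            /* ... use buf and f ... */

            fclose(f);
            free(buf);
            return 0;

    out_free_buf:
            free(buf);
            return error;
    }

    int main(void)
    {
            return setup("/etc/hostname") ? 1 : 0;
    }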
index 73abd89c0108dc793159fe10d2567970de7d70ef..46e46894e918ab03c6e7821bb715332d276bf346 100644 (file)
@@ -116,7 +116,7 @@ static int receive_chars_getchar(struct uart_port *port)
 
 static int receive_chars_read(struct uart_port *port)
 {
-       int saw_console_brk = 0;
+       static int saw_console_brk;
        int limit = 10000;
 
        while (limit-- > 0) {
@@ -128,6 +128,9 @@ static int receive_chars_read(struct uart_port *port)
                        bytes_read = 0;
 
                        if (stat == CON_BREAK) {
+                               if (saw_console_brk)
+                                       sun_do_break();
+
                                if (uart_handle_break(port))
                                        continue;
                                saw_console_brk = 1;
@@ -151,6 +154,7 @@ static int receive_chars_read(struct uart_port *port)
                if (port->sysrq != 0 &&  *con_read_page) {
                        for (i = 0; i < bytes_read; i++)
                                uart_handle_sysrq_char(port, con_read_page[i]);
+                       saw_console_brk = 0;
                }
 
                if (port->state == NULL)
@@ -398,6 +402,12 @@ static struct uart_driver sunhv_reg = {
 
 static struct uart_port *sunhv_port;
 
+void sunhv_migrate_hvcons_irq(int cpu)
+{
+       /* Migrate hvcons irq to param cpu */
+       irq_force_affinity(sunhv_port->irq, cpumask_of(cpu));
+}
+
 /* Copy 's' into the con_write_page, decoding "\n" into
  * "\r\n" along the way.  We have to return two lengths
  * because the caller needs to know how much to advance
index 73031ec54a7be5ff34be619581cb20cef81cb8f5..1c62845a72c71acd3e2a653dbfd580588c61dec7 100644 (file)
@@ -1043,13 +1043,22 @@ static struct block_device *bd_acquire(struct inode *inode)
 
        spin_lock(&bdev_lock);
        bdev = inode->i_bdev;
-       if (bdev) {
+       if (bdev && !inode_unhashed(bdev->bd_inode)) {
                bdgrab(bdev);
                spin_unlock(&bdev_lock);
                return bdev;
        }
        spin_unlock(&bdev_lock);
 
+       /*
+        * i_bdev references block device inode that was already shut down
+        * (corresponding device got removed).  Remove the reference and look
+        * up block device inode again just in case new device got
+        * reestablished under the same device number.
+        */
+       if (bdev)
+               bd_forget(inode);
+
        bdev = bdget(inode->i_rdev);
        if (bdev) {
                spin_lock(&bdev_lock);
index 9ed8b987185b45b1157993abf4b0fe5a6c0b23a8..3f38eb03649c93873c678677964425a1daf09c26 100644 (file)
@@ -223,6 +223,7 @@ static inline void atomic_dec(atomic_t *v)
 #define atomic_xchg(ptr, v)            (xchg(&(ptr)->counter, (v)))
 #define atomic_cmpxchg(v, old, new)    (cmpxchg(&((v)->counter), (old), (new)))
 
+#ifndef __atomic_add_unless
 static inline int __atomic_add_unless(atomic_t *v, int a, int u)
 {
        int c, old;
@@ -231,5 +232,6 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u)
                c = old;
        return c;
 }
+#endif
 
 #endif /* __ASM_GENERIC_ATOMIC_H */
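The new #ifndef guard lets an architecture provide its own __atomic_add_unless while keeping this generic fallback. Its semantics: add a to *v unless *v equals u, and return the value observed beforehand. A small userspace rendering of the same loop, with C11 atomics standing in for the kernel's cmpxchg():

    #include <stdatomic.h>
    #include <stdio.h>

    static int add_unless(atomic_int *v, int a, int u)
    {
            int c = atomic_load(v);

            /* A failed CAS refreshes c with the current value; then retry. */
            while (c != u && !atomic_compare_exchange_weak(v, &c, c + a))
                    ;
            return c;
    }

    int main(void)
    {
            atomic_int v = 3;

            printf("%d\n", add_unless(&v, 2, 1));   /* 3 != 1: adds, prints 3 */
            printf("%d\n", add_unless(&v, 2, 5));   /* now 5 == 5: skips, prints 5 */
            return 0;
    }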
index 7cf8a6c70a3f71c5eca6100dfda8d67f10b20e3b..8e521194f6fc4ad32138a51c962a365c74debaed 100644 (file)
@@ -183,7 +183,7 @@ static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter,
 
 #define bio_iter_last(bvec, iter) ((iter).bi_size == (bvec).bv_len)
 
-static inline unsigned bio_segments(struct bio *bio)
+static inline unsigned __bio_segments(struct bio *bio, struct bvec_iter *bvec)
 {
        unsigned segs = 0;
        struct bio_vec bv;
@@ -205,12 +205,17 @@ static inline unsigned bio_segments(struct bio *bio)
                break;
        }
 
-       bio_for_each_segment(bv, bio, iter)
+       __bio_for_each_segment(bv, bio, iter, *bvec)
                segs++;
 
        return segs;
 }
 
+static inline unsigned bio_segments(struct bio *bio)
+{
+       return __bio_segments(bio, &bio->bi_iter);
+}
+
 /*
  * get a reference to a bio, so it won't disappear. the intended use is
  * something like:
@@ -384,6 +389,8 @@ extern void bio_put(struct bio *);
 extern void __bio_clone_fast(struct bio *, struct bio *);
 extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *);
 extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs);
+extern struct bio *bio_clone_bioset_partial(struct bio *, gfp_t,
+                                           struct bio_set *, int, int);
 
 extern struct bio_set *fs_bio_set;
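__bio_segments() takes the iterator as a parameter so callers can count segments from any saved position rather than only from the bio's own bi_iter, with bio_segments() reduced to a thin wrapper. A simplified, userspace-only sketch of that idea; the types are stand-ins, not the kernel's bvec machinery:

    #include <stdio.h>

    struct bvec { unsigned len; };
    struct iter { unsigned idx; unsigned remaining; };

    /* Count segments starting from a caller-supplied iterator. */
    static unsigned segments_from(const struct bvec *vecs, unsigned nvecs,
                                  struct iter it)
    {
            unsigned segs = 0;

            while (it.remaining && it.idx < nvecs) {
                    unsigned step = vecs[it.idx].len < it.remaining ?
                                    vecs[it.idx].len : it.remaining;
                    it.remaining -= step;
                    it.idx++;
                    segs++;
            }
            return segs;
    }

    int main(void)
    {
            struct bvec vecs[] = { { 4096 }, { 4096 }, { 4096 } };
            struct iter whole = { 0, 12288 };
            struct iter partial = { 1, 8192 };      /* already advanced past vec 0 */

            printf("whole=%u partial=%u\n",
                   segments_from(vecs, 3, whole),
                   segments_from(vecs, 3, partial));
            return 0;
    }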
 
index 8e4df3d6c8cd9dbd1ddd96e25e1abae9a4aa4b6f..001d30d727c56c4d46e5e572ded575ca50cc85ff 100644 (file)
@@ -33,6 +33,7 @@ struct blk_mq_hw_ctx {
        struct blk_mq_ctx       **ctxs;
        unsigned int            nr_ctx;
 
+       wait_queue_t            dispatch_wait;
        atomic_t                wait_index;
 
        struct blk_mq_tags      *tags;
@@ -160,6 +161,7 @@ enum {
        BLK_MQ_S_STOPPED        = 0,
        BLK_MQ_S_TAG_ACTIVE     = 1,
        BLK_MQ_S_SCHED_RESTART  = 2,
+       BLK_MQ_S_TAG_WAITING    = 3,
 
        BLK_MQ_MAX_DEPTH        = 10240,
 
index bf240a3cbf9990835e3e3106eff701ec86f8c73e..a72fd04aa5e1eac103dd94165edece813663b456 100644 (file)
@@ -29,6 +29,30 @@ enum nvme_rdma_cm_status {
        NVME_RDMA_CM_INVALID_ORD        = 0x08,
 };
 
+static inline const char *nvme_rdma_cm_msg(enum nvme_rdma_cm_status status)
+{
+       switch (status) {
+       case NVME_RDMA_CM_INVALID_LEN:
+               return "invalid length";
+       case NVME_RDMA_CM_INVALID_RECFMT:
+               return "invalid record format";
+       case NVME_RDMA_CM_INVALID_QID:
+               return "invalid queue ID";
+       case NVME_RDMA_CM_INVALID_HSQSIZE:
+               return "invalid host SQ size";
+       case NVME_RDMA_CM_INVALID_HRQSIZE:
+               return "invalid host RQ size";
+       case NVME_RDMA_CM_NO_RSC:
+               return "resource not found";
+       case NVME_RDMA_CM_INVALID_IRD:
+               return "invalid IRD";
+       case NVME_RDMA_CM_INVALID_ORD:
+               return "Invalid ORD";
+       default:
+               return "unrecognized reason";
+       }
+}
+
 /**
  * struct nvme_rdma_cm_req - rdma connect request
  *
index 0b676a02cf3e0899cb27c068bb6d3c1225a2506a..c43d435d422552d029bd569157d9aa352348747f 100644 (file)
@@ -579,6 +579,12 @@ struct nvme_write_zeroes_cmd {
        __le16                  appmask;
 };
 
+/* Features */
+
+struct nvme_feat_auto_pst {
+       __le64 entries[32];
+};
+
 /* Admin commands */
 
 enum nvme_admin_opcode {
@@ -644,7 +650,9 @@ struct nvme_identify {
        __le32                  nsid;
        __u64                   rsvd2[2];
        union nvme_data_ptr     dptr;
-       __le32                  cns;
+       __u8                    cns;
+       __u8                    rsvd3;
+       __le16                  ctrlid;
        __u32                   rsvd11[5];
 };
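The struct nvme_feat_auto_pst added above carries the 32 APST table entries handed to the controller. A hedged sketch of packing one entry, assuming the spec's documented layout (target power state in bits 7:3, idle time before transition, in milliseconds, in bits 31:8); a plain uint64_t stands in for the kernel's __le64:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Pack one autonomous power state transition entry (illustrative). */
    static uint64_t apst_entry(unsigned target_ps, unsigned idle_ms)
    {
            return ((uint64_t)target_ps << 3) | ((uint64_t)idle_ms << 8);
    }

    int main(void)
    {
            /* e.g. ask for PS3 after 100 ms of idleness */
            printf("entry = 0x%" PRIx64 "\n", apst_entry(3, 100));
            return 0;
    }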
 
index deee23d012e7f8f6864845040b5a9a4586689809..04b124fca51e36e635c7039fad0cd7d9741df253 100644 (file)
@@ -27,6 +27,7 @@ typedef int (sec_send_recv)(void *data, u16 spsp, u8 secp, void *buffer,
                size_t len, bool send);
 
 #ifdef CONFIG_BLK_SED_OPAL
+void free_opal_dev(struct opal_dev *dev);
 bool opal_unlock_from_suspend(struct opal_dev *dev);
 struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv);
 int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *ioctl_ptr);
@@ -51,6 +52,10 @@ static inline bool is_sed_ioctl(unsigned int cmd)
        return false;
 }
 #else
+static inline void free_opal_dev(struct opal_dev *dev)
+{
+}
+
 static inline bool is_sed_ioctl(unsigned int cmd)
 {
        return false;
index b95959733ce08a8723e98535153fbb74e495ad7b..3ec16e603e88281eb3e8596a99f10a30af1b7990 100644 (file)
@@ -273,7 +273,8 @@ void panic(const char *fmt, ...)
                extern int stop_a_enabled;
                /* Make sure the user can actually press Stop-A (L1-A) */
                stop_a_enabled = 1;
-               pr_emerg("Press Stop-A (L1-A) to return to the boot prom\n");
+               pr_emerg("Press Stop-A (L1-A) from sun keyboard or send break\n"
+                        "twice on console to return to the boot prom\n");
        }
 #endif
 #if defined(CONFIG_S390)
index 84812a9fb16fbbd1409315ea3752fb9a1e3e39ef..72fab4999c00662a187536ee66c6084eb69a8b11 100644 (file)
@@ -1102,6 +1102,7 @@ void radix_tree_replace_slot(struct radix_tree_root *root,
 {
        replace_slot(root, NULL, slot, item, true);
 }
+EXPORT_SYMBOL(radix_tree_replace_slot);
 
 /**
  * radix_tree_iter_replace - replace item in a slot
index 694a075381b0dc781ad76062594bb21464da6bed..3033be701e9a24c5c8c090ce6f453df070e8cb23 100755 (executable)
@@ -81,6 +81,9 @@ my (@stack, $re, $dre, $x, $xs, $funcre);
        } elsif ($arch eq 'nios2') {
                #25a8:  defffb04        addi    sp,sp,-20
                $re = qr/.*addi.*sp,sp,-(([0-9]{2}|[3-9])[0-9]{2})/o;
+       } elsif ($arch eq 'openrisc') {
+               # c000043c:       9c 21 fe f0     l.addi r1,r1,-272
+               $re = qr/.*l\.addi.*r1,r1,-(([0-9]{2}|[3-9])[0-9]{2})/o;
        } elsif ($arch eq 'parisc' || $arch eq 'parisc64') {
                $re = qr/.*ldo ($x{1,8})\(sp\),sp/o;
        } elsif ($arch eq 'ppc') {