]> git.proxmox.com Git - mirror_ubuntu-artful-kernel.git/commitdiff
Merge tag 'iommu-fix-v4.11-rc0-2' of git://git.kernel.org/pub/scm/linux/kernel/git...
authorLinus Torvalds <torvalds@linux-foundation.org>
Wed, 1 Mar 2017 04:18:44 +0000 (20:18 -0800)
committerLinus Torvalds <torvalds@linux-foundation.org>
Wed, 1 Mar 2017 04:18:44 +0000 (20:18 -0800)
Pull IOMMU fixes from Joerg Roedel:
 "Fix an issue introduced this merge window into the AMD and Intel IOMMU
  drivers that causes an oops when the vendor-specific sysfs-entries are
  accessed.

  The reason for this issue is that I forgot to update the sysfs code in
  the drivers when moving the iommu 'struct device' to the iommu-core"

* tag 'iommu-fix-v4.11-rc0-2' of git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu:
  iommu/amd: Fix crash when accessing AMD-Vi sysfs entries
  iommu/vt-d: Fix crash when accessing VT-d sysfs entries

256 files changed:
Documentation/filesystems/ceph.txt
Documentation/x86/intel_rdt_ui.txt
Makefile
arch/arm/Kconfig
arch/arm/Kconfig-nommu
arch/arm/boot/compressed/decompress.c
arch/arm/common/mcpm_entry.c
arch/arm/include/asm/hardware/cache-uniphier.h
arch/arm/include/asm/memory.h
arch/arm/include/asm/pgtable-nommu.h
arch/arm/kernel/head-nommu.S
arch/arm/kernel/module.c
arch/arm/kernel/setup.c
arch/arm/kernel/smp.c
arch/arm/mach-alpine/platsmp.c
arch/arm/mach-axxia/platsmp.c
arch/arm/mach-bcm/bcm63xx_smp.c
arch/arm/mach-bcm/platsmp-brcmstb.c
arch/arm/mach-bcm/platsmp.c
arch/arm/mach-berlin/platsmp.c
arch/arm/mach-exynos/firmware.c
arch/arm/mach-exynos/mcpm-exynos.c
arch/arm/mach-exynos/platsmp.c
arch/arm/mach-exynos/pm.c
arch/arm/mach-exynos/suspend.c
arch/arm/mach-hisi/platmcpm.c
arch/arm/mach-hisi/platsmp.c
arch/arm/mach-imx/platsmp.c
arch/arm/mach-imx/pm-imx6.c
arch/arm/mach-imx/src.c
arch/arm/mach-mediatek/platsmp.c
arch/arm/mach-mvebu/pm.c
arch/arm/mach-mvebu/pmsu.c
arch/arm/mach-mvebu/system-controller.c
arch/arm/mach-omap2/control.c
arch/arm/mach-omap2/omap-mpuss-lowpower.c
arch/arm/mach-omap2/omap-smp.c
arch/arm/mach-prima2/platsmp.c
arch/arm/mach-prima2/pm.c
arch/arm/mach-pxa/palmz72.c
arch/arm/mach-pxa/pxa25x.c
arch/arm/mach-pxa/pxa27x.c
arch/arm/mach-pxa/pxa3xx.c
arch/arm/mach-realview/platsmp-dt.c
arch/arm/mach-rockchip/platsmp.c
arch/arm/mach-rockchip/pm.c
arch/arm/mach-s3c24xx/mach-jive.c
arch/arm/mach-s3c24xx/pm-s3c2410.c
arch/arm/mach-s3c24xx/pm-s3c2416.c
arch/arm/mach-s3c64xx/pm.c
arch/arm/mach-s5pv210/pm.c
arch/arm/mach-sa1100/pm.c
arch/arm/mach-shmobile/platsmp-apmu.c
arch/arm/mach-shmobile/platsmp-scu.c
arch/arm/mach-socfpga/platsmp.c
arch/arm/mach-spear/platsmp.c
arch/arm/mach-sti/platsmp.c
arch/arm/mach-sunxi/platsmp.c
arch/arm/mach-tango/platsmp.c
arch/arm/mach-tango/pm.c
arch/arm/mach-tegra/reset.c
arch/arm/mach-ux500/platsmp.c
arch/arm/mach-vexpress/dcscb.c
arch/arm/mach-vexpress/platsmp.c
arch/arm/mach-vexpress/tc2_pm.c
arch/arm/mach-zx/platsmp.c
arch/arm/mach-zynq/platsmp.c
arch/arm/mm/Kconfig
arch/arm/mm/Makefile
arch/arm/mm/cache-uniphier.c
arch/arm/mm/dma-mapping.c
arch/arm/mm/dump.c
arch/arm/mm/flush.c
arch/arm/mm/init.c
arch/arm/mm/mmu.c
arch/arm/mm/nommu.c
arch/arm/mm/physaddr.c [new file with mode: 0644]
arch/x86/kernel/apic/apic.c
arch/x86/kernel/apic/vector.c
arch/x86/kernel/cpu/mcheck/mce_amd.c
arch/x86/kernel/cpu/mcheck/therm_throt.c
arch/x86/kernel/cpu/mcheck/threshold.c
arch/x86/kernel/irq.c
arch/x86/kernel/irq_work.c
arch/x86/kernel/setup.c
arch/x86/kernel/smp.c
arch/x86/kernel/vmlinux.lds.S
drivers/block/rbd.c
drivers/block/rbd_types.h
drivers/ide/palm_bk3710.c
drivers/mtd/devices/lart.c
drivers/net/ethernet/apm/xgene/xgene_enet_main.c
drivers/net/ethernet/mellanox/mlx4/en_clock.c
drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
drivers/net/ethernet/neterion/s2io.c
drivers/net/ethernet/neterion/vxge/vxge-ethtool.c
drivers/net/ethernet/qlogic/qed/qed.h
drivers/net/ethernet/qlogic/qed/qed_dev.c
drivers/net/ethernet/qlogic/qed/qed_mcp.c
drivers/net/ethernet/qlogic/qed/qed_sriov.c
drivers/net/ethernet/qlogic/qed/qed_sriov.h
drivers/net/ethernet/stmicro/stmmac/common.h
drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c
drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c
drivers/net/ethernet/stmicro/stmmac/dwmac100_dma.c
drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
drivers/net/phy/phy.c
drivers/net/vxlan.c
fs/ceph/addr.c
fs/ceph/cache.c
fs/ceph/caps.c
fs/ceph/debugfs.c
fs/ceph/dir.c
fs/ceph/export.c
fs/ceph/file.c
fs/ceph/inode.c
fs/ceph/ioctl.c
fs/ceph/mds_client.c
fs/ceph/mds_client.h
fs/ceph/super.c
fs/ceph/super.h
fs/lockd/svc.c
fs/nfs/callback_xdr.c
fs/nfsd/export.c
fs/nfsd/nfs2acl.c
fs/nfsd/nfs3acl.c
fs/nfsd/nfs3proc.c
fs/nfsd/nfs4callback.c
fs/nfsd/nfs4idmap.c
fs/nfsd/nfs4proc.c
fs/nfsd/nfs4state.c
fs/nfsd/nfs4xdr.c
fs/nfsd/nfsctl.c
fs/nfsd/nfsd.h
fs/nfsd/nfsproc.c
fs/nfsd/nfssvc.c
fs/nfsd/state.h
fs/nfsd/vfs.c
fs/nfsd/vfs.h
include/linux/ceph/osd_client.h
include/linux/ceph/osdmap.h
include/linux/ceph/rados.h
include/linux/compiler-gcc.h
include/linux/crush/crush.h
include/linux/crush/mapper.h
include/linux/refcount.h
include/linux/sunrpc/cache.h
include/linux/sunrpc/rpc_rdma.h
include/linux/sunrpc/svc.h
include/linux/sunrpc/svc_rdma.h
include/linux/sunrpc/svc_xprt.h
include/uapi/linux/netfilter.h
include/uapi/linux/netfilter/xt_hashlimit.h
include/uapi/linux/nfsd/export.h
kernel/events/core.c
kernel/sched/core.c
lib/Kconfig
lib/Kconfig.debug
lib/Makefile
lib/refcount.c [new file with mode: 0644]
lib/rhashtable.c
lib/test_parman.c
net/ceph/cls_lock_client.c
net/ceph/crush/crush.c
net/ceph/crush/mapper.c
net/ceph/crypto.c
net/ceph/osd_client.c
net/ceph/osdmap.c
net/ceph/snapshot.c
net/ipv4/fib_frontend.c
net/ipv4/route.c
net/ipv6/ip6_vti.c
net/ipv6/ip6mr.c
net/l2tp/l2tp_ip.c
net/netfilter/nf_conntrack_expect.c
net/netfilter/nft_ct.c
net/netfilter/nft_set_bitmap.c
net/rds/ib.c
net/rds/tcp.c
net/rxrpc/key.c
net/rxrpc/recvmsg.c
net/sched/act_api.c
net/sctp/protocol.c
net/sctp/socket.c
net/sunrpc/auth_gss/svcauth_gss.c
net/sunrpc/cache.c
net/sunrpc/svc.c
net/sunrpc/svcsock.c
net/sunrpc/xprtrdma/svc_rdma_backchannel.c
net/sunrpc/xprtrdma/svc_rdma_marshal.c
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
net/sunrpc/xprtrdma/svc_rdma_sendto.c
net/sunrpc/xprtrdma/svc_rdma_transport.c
net/tipc/node.c
net/xfrm/xfrm_policy.c
tools/build/Makefile
tools/build/Makefile.include
tools/lib/traceevent/event-parse.c
tools/lib/traceevent/event-parse.h
tools/objtool/arch.h
tools/objtool/arch/x86/decode.c
tools/objtool/builtin-check.c
tools/perf/Documentation/perf-annotate.txt
tools/perf/Documentation/perf-diff.txt
tools/perf/Documentation/perf-record.txt
tools/perf/Documentation/perf-report.txt
tools/perf/Documentation/perf-stat.txt
tools/perf/Makefile.config
tools/perf/Makefile.perf
tools/perf/builtin-annotate.c
tools/perf/builtin-diff.c
tools/perf/builtin-mem.c
tools/perf/builtin-record.c
tools/perf/builtin-report.c
tools/perf/builtin-sched.c
tools/perf/builtin-stat.c
tools/perf/builtin-top.c
tools/perf/builtin-trace.c
tools/perf/pmu-events/json.c
tools/perf/tests/attr.c
tools/perf/tests/builtin-test.c
tools/perf/tests/code-reading.c
tools/perf/tests/fdarray.c
tools/perf/tests/llvm.c
tools/perf/tests/parse-events.c
tools/perf/tests/perf-record.c
tools/perf/tests/python-use.c
tools/perf/tests/thread-map.c
tools/perf/tests/topology.c
tools/perf/tests/vmlinux-kallsyms.c
tools/perf/ui/browsers/map.c
tools/perf/ui/hist.c
tools/perf/util/annotate.c
tools/perf/util/cpumap.c
tools/perf/util/cpumap.h
tools/perf/util/debug.c
tools/perf/util/debug.h
tools/perf/util/dso.c
tools/perf/util/env.c
tools/perf/util/header.c
tools/perf/util/hist.c
tools/perf/util/parse-events.c
tools/perf/util/parse-events.h
tools/perf/util/parse-events.y
tools/perf/util/pmu.c
tools/perf/util/probe-event.c
tools/perf/util/scripting-engines/trace-event-python.c
tools/perf/util/session.c
tools/perf/util/setup.py
tools/perf/util/sort.c
tools/perf/util/stat.c
tools/perf/util/symbol-elf.c

index f5306ee40ea98216602d2214e8944a6aa11edadf..0b302a11718a43fd7ed44725390a5a4ceacb2229 100644 (file)
@@ -98,11 +98,10 @@ Mount Options
        size.
 
   rsize=X
-       Specify the maximum read size in bytes.  By default there is no
-       maximum.
+       Specify the maximum read size in bytes.  Default: 64 MB.
 
   rasize=X
-       Specify the maximum readahead.
+       Specify the maximum readahead.  Default: 8 MB.
 
   mount_timeout=X
        Specify the timeout value for mount (in seconds), in the case
index d918d268cd72bd6e8c7ec839dc9235e75233a72b..51cf6fa5591f9e99d95e73212e19372821ae4d7f 100644 (file)
@@ -212,3 +212,117 @@ Finally we move core 4-7 over to the new group and make sure that the
 kernel and the tasks running there get 50% of the cache.
 
 # echo C0 > p0/cpus
+
+4) Locking between applications
+
+Certain operations on the resctrl filesystem, composed of read/writes
+to/from multiple files, must be atomic.
+
+As an example, the allocation of an exclusive reservation of L3 cache
+involves:
+
+  1. Read the cbmmasks from each directory
+  2. Find a contiguous set of bits in the global CBM bitmask that is clear
+     in any of the directory cbmmasks
+  3. Create a new directory
+  4. Set the bits found in step 2 to the new directory "schemata" file
+
+If two applications attempt to allocate space concurrently then they can
+end up allocating the same bits so the reservations are shared instead of
+exclusive.
+
+To coordinate atomic operations on the resctrlfs and to avoid the problem
+above, the following locking procedure is recommended:
+
+Locking is based on flock, which is available in libc and also as a shell
+script command
+
+Write lock:
+
+ A) Take flock(LOCK_EX) on /sys/fs/resctrl
+ B) Read/write the directory structure.
+ C) funlock
+
+Read lock:
+
+ A) Take flock(LOCK_SH) on /sys/fs/resctrl
+ B) If success read the directory structure.
+ C) funlock
+
+Example with bash:
+
+# Atomically read directory structure
+$ flock -s /sys/fs/resctrl/ find /sys/fs/resctrl
+
+# Read directory contents and create new subdirectory
+
+$ cat create-dir.sh
+find /sys/fs/resctrl/ > output.txt
+mask = function-of(output.txt)
+mkdir /sys/fs/resctrl/newres/
+echo mask > /sys/fs/resctrl/newres/schemata
+
+$ flock /sys/fs/resctrl/ ./create-dir.sh
+
+Example with C:
+
+/*
+ * Example code to take advisory locks
+ * before accessing resctrl filesystem
+ */
+#include <sys/file.h>
+#include <stdlib.h>
+
+void resctrl_take_shared_lock(int fd)
+{
+       int ret;
+
+       /* take shared lock on resctrl filesystem */
+       ret = flock(fd, LOCK_SH);
+       if (ret) {
+               perror("flock");
+               exit(-1);
+       }
+}
+
+void resctrl_take_exclusive_lock(int fd)
+{
+       int ret;
+
+       /* take exclusive lock on resctrl filesystem */
+       ret = flock(fd, LOCK_EX);
+       if (ret) {
+               perror("flock");
+               exit(-1);
+       }
+}
+
+void resctrl_release_lock(int fd)
+{
+       int ret;
+
+       /* release lock on resctrl filesystem */
+       ret = flock(fd, LOCK_UN);
+       if (ret) {
+               perror("flock");
+               exit(-1);
+       }
+}
+
+void main(void)
+{
+       int fd, ret;
+
+       fd = open("/sys/fs/resctrl", O_DIRECTORY);
+       if (fd == -1) {
+               perror("open");
+               exit(-1);
+       }
+       resctrl_take_shared_lock(fd);
+       /* code to read directory contents */
+       resctrl_release_lock(fd);
+
+       resctrl_take_exclusive_lock(fd);
+       /* code to read and write directory contents */
+       resctrl_release_lock(fd);
+}
index b83109b5d217cc2652aba6cf99c6b4f92a5b8cea..4cb6b0a1152b5f57f783f0afa64207e956112c42 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -910,6 +910,18 @@ mod_sign_cmd = true
 endif
 export mod_sign_cmd
 
+ifdef CONFIG_STACK_VALIDATION
+  has_libelf := $(call try-run,\
+               echo "int main() {}" | $(HOSTCC) -xc -o /dev/null -lelf -,1,0)
+  ifeq ($(has_libelf),1)
+    objtool_target := tools/objtool FORCE
+  else
+    $(warning "Cannot use CONFIG_STACK_VALIDATION, please install libelf-dev, libelf-devel or elfutils-libelf-devel")
+    SKIP_STACK_VALIDATION := 1
+    export SKIP_STACK_VALIDATION
+  endif
+endif
+
 
 ifeq ($(KBUILD_EXTMOD),)
 core-y         += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ block/
@@ -1037,18 +1049,6 @@ prepare0: archprepare gcc-plugins
 # All the preparing..
 prepare: prepare0 prepare-objtool
 
-ifdef CONFIG_STACK_VALIDATION
-  has_libelf := $(call try-run,\
-               echo "int main() {}" | $(HOSTCC) -xc -o /dev/null -lelf -,1,0)
-  ifeq ($(has_libelf),1)
-    objtool_target := tools/objtool FORCE
-  else
-    $(warning "Cannot use CONFIG_STACK_VALIDATION, please install libelf-dev, libelf-devel or elfutils-libelf-devel")
-    SKIP_STACK_VALIDATION := 1
-    export SKIP_STACK_VALIDATION
-  endif
-endif
-
 PHONY += prepare-objtool
 prepare-objtool: $(objtool_target)
 
index fda6a46d27cfe2bda02257a8a8b30d316cd14612..0d4e71b42c77da986a2dc471a1f335be5e958527 100644 (file)
@@ -2,6 +2,7 @@ config ARM
        bool
        default y
        select ARCH_CLOCKSOURCE_DATA
+       select ARCH_HAS_DEBUG_VIRTUAL
        select ARCH_HAS_DEVMEM_IS_ALLOWED
        select ARCH_HAS_ELF_RANDOMIZE
        select ARCH_HAS_SET_MEMORY
index aed66d5df7f1546ac48cf93ee9c4f2aeb91d5b76..b7576349528c12a8ef7533b6b66bc9a68b8dc9f9 100644 (file)
@@ -34,8 +34,7 @@ config PROCESSOR_ID
          used instead of the auto-probing which utilizes the register.
 
 config REMAP_VECTORS_TO_RAM
-       bool 'Install vectors to the beginning of RAM' if DRAM_BASE
-       depends on DRAM_BASE
+       bool 'Install vectors to the beginning of RAM'
        help
          The kernel needs to change the hardware exception vectors.
          In nommu mode, the hardware exception vectors are normally
index a0765e7ed6c7dd2166b2cb95874fc076532d526c..ea7832702a8f4473ffbc90e5c1c58271fab1dd9a 100644 (file)
@@ -32,6 +32,7 @@ extern void error(char *);
 
 /* Not needed, but used in some headers pulled in by decompressors */
 extern char * strstr(const char * s1, const char *s2);
+extern size_t strlen(const char *s);
 
 #ifdef CONFIG_KERNEL_GZIP
 #include "../../../../lib/decompress_inflate.c"
index a923524d1040734d1f5b94b4a6e2f5d59cab99d1..cf062472e07bcb4be470bf35ab029df3438dbc7e 100644 (file)
@@ -144,7 +144,7 @@ extern unsigned long mcpm_entry_vectors[MAX_NR_CLUSTERS][MAX_CPUS_PER_CLUSTER];
 
 void mcpm_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr)
 {
-       unsigned long val = ptr ? virt_to_phys(ptr) : 0;
+       unsigned long val = ptr ? __pa_symbol(ptr) : 0;
        mcpm_entry_vectors[cluster][cpu] = val;
        sync_cache_w(&mcpm_entry_vectors[cluster][cpu]);
 }
@@ -299,8 +299,8 @@ void mcpm_cpu_power_down(void)
         * the kernel as if the power_up method just had deasserted reset
         * on the CPU.
         */
-       phys_reset = (phys_reset_t)(unsigned long)virt_to_phys(cpu_reset);
-       phys_reset(virt_to_phys(mcpm_entry_point));
+       phys_reset = (phys_reset_t)(unsigned long)__pa_symbol(cpu_reset);
+       phys_reset(__pa_symbol(mcpm_entry_point));
 
        /* should never get here */
        BUG();
@@ -388,8 +388,8 @@ static int __init nocache_trampoline(unsigned long _arg)
        __mcpm_outbound_leave_critical(cluster, CLUSTER_DOWN);
        __mcpm_cpu_down(cpu, cluster);
 
-       phys_reset = (phys_reset_t)(unsigned long)virt_to_phys(cpu_reset);
-       phys_reset(virt_to_phys(mcpm_entry_point));
+       phys_reset = (phys_reset_t)(unsigned long)__pa_symbol(cpu_reset);
+       phys_reset(__pa_symbol(mcpm_entry_point));
        BUG();
 }
 
@@ -449,7 +449,7 @@ int __init mcpm_sync_init(
        sync_cache_w(&mcpm_sync);
 
        if (power_up_setup) {
-               mcpm_power_up_setup_phys = virt_to_phys(power_up_setup);
+               mcpm_power_up_setup_phys = __pa_symbol(power_up_setup);
                sync_cache_w(&mcpm_power_up_setup_phys);
        }
 
index eaa60da7dac31fc8977776d9929dde934ccd52b6..0ef42ae75b6ca992d2e13a7a9725e4dcbda89439 100644 (file)
@@ -16,7 +16,7 @@
 #ifndef __CACHE_UNIPHIER_H
 #define __CACHE_UNIPHIER_H
 
-#include <linux/types.h>
+#include <linux/errno.h>
 
 #ifdef CONFIG_CACHE_UNIPHIER
 int uniphier_cache_init(void);
index 76cbd9c674dff7815e7f25a293a8b03e80e3b442..1f54e4e98c1e10cd77fd23a465268946c33d15c8 100644 (file)
 #define IOREMAP_MAX_ORDER      24
 #endif
 
+#define VECTORS_BASE           UL(0xffff0000)
+
 #else /* CONFIG_MMU */
 
+#ifndef __ASSEMBLY__
+extern unsigned long vectors_base;
+#define VECTORS_BASE           vectors_base
+#endif
+
 /*
  * The limitation of user task size can grow up to the end of free ram region.
  * It is difficult to define and perhaps will never meet the original meaning
 
 #endif /* !CONFIG_MMU */
 
+#ifdef CONFIG_XIP_KERNEL
+#define KERNEL_START           _sdata
+#else
+#define KERNEL_START           _stext
+#endif
+#define KERNEL_END             _end
+
 /*
  * We fix the TCM memories max 32 KiB ITCM resp DTCM at these
  * locations
@@ -206,7 +220,7 @@ extern const void *__pv_table_begin, *__pv_table_end;
        : "r" (x), "I" (__PV_BITS_31_24)                \
        : "cc")
 
-static inline phys_addr_t __virt_to_phys(unsigned long x)
+static inline phys_addr_t __virt_to_phys_nodebug(unsigned long x)
 {
        phys_addr_t t;
 
@@ -238,7 +252,7 @@ static inline unsigned long __phys_to_virt(phys_addr_t x)
 #define PHYS_OFFSET    PLAT_PHYS_OFFSET
 #define PHYS_PFN_OFFSET        ((unsigned long)(PHYS_OFFSET >> PAGE_SHIFT))
 
-static inline phys_addr_t __virt_to_phys(unsigned long x)
+static inline phys_addr_t __virt_to_phys_nodebug(unsigned long x)
 {
        return (phys_addr_t)x - PAGE_OFFSET + PHYS_OFFSET;
 }
@@ -254,6 +268,16 @@ static inline unsigned long __phys_to_virt(phys_addr_t x)
        ((((unsigned long)(kaddr) - PAGE_OFFSET) >> PAGE_SHIFT) + \
         PHYS_PFN_OFFSET)
 
+#define __pa_symbol_nodebug(x) __virt_to_phys_nodebug((x))
+
+#ifdef CONFIG_DEBUG_VIRTUAL
+extern phys_addr_t __virt_to_phys(unsigned long x);
+extern phys_addr_t __phys_addr_symbol(unsigned long x);
+#else
+#define __virt_to_phys(x)      __virt_to_phys_nodebug(x)
+#define __phys_addr_symbol(x)  __pa_symbol_nodebug(x)
+#endif
+
 /*
  * These are *only* valid on the kernel direct mapped RAM memory.
  * Note: Drivers should NOT use these.  They are the wrong
@@ -276,6 +300,7 @@ static inline void *phys_to_virt(phys_addr_t x)
  * Drivers should NOT use these either.
  */
 #define __pa(x)                        __virt_to_phys((unsigned long)(x))
+#define __pa_symbol(x)         __phys_addr_symbol(RELOC_HIDE((unsigned long)(x), 0))
 #define __va(x)                        ((void *)__phys_to_virt((phys_addr_t)(x)))
 #define pfn_to_kaddr(pfn)      __va((phys_addr_t)(pfn) << PAGE_SHIFT)
 
index add094d09e3e2cec57c85cfc13349655cb93b9c1..302240c19a5aa688e7bdab1ece506dfbeaccea4e 100644 (file)
@@ -63,9 +63,9 @@ typedef pte_t *pte_addr_t;
 /*
  * Mark the prot value as uncacheable and unbufferable.
  */
-#define pgprot_noncached(prot) __pgprot(0)
-#define pgprot_writecombine(prot) __pgprot(0)
-#define pgprot_dmacoherent(prot) __pgprot(0)
+#define pgprot_noncached(prot) (prot)
+#define pgprot_writecombine(prot) (prot)
+#define pgprot_dmacoherent(prot) (prot)
 
 
 /*
index 6b4eb27b875863ffeb37a131fe445ee38f28a0f4..2e21e08de7478b5e19f1aee89521356e52d8c041 100644 (file)
@@ -151,11 +151,6 @@ __after_proc_init:
 #endif
 #ifdef CONFIG_CPU_ICACHE_DISABLE
        bic     r0, r0, #CR_I
-#endif
-#ifdef CONFIG_CPU_HIGH_VECTOR
-       orr     r0, r0, #CR_V
-#else
-       bic     r0, r0, #CR_V
 #endif
        mcr     p15, 0, r0, c1, c0, 0           @ write control reg
 #elif defined (CONFIG_CPU_V7M)
index 4f14b5ce6535f7a19215660ebb4b3e62bd6ea5ed..80254b47dc3420ec11cb6611f645d5b1faf55b66 100644 (file)
@@ -155,8 +155,17 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
                       break;
 
                case R_ARM_PREL31:
-                       offset = *(u32 *)loc + sym->st_value - loc;
-                       *(u32 *)loc = offset & 0x7fffffff;
+                       offset = (*(s32 *)loc << 1) >> 1; /* sign extend */
+                       offset += sym->st_value - loc;
+                       if (offset >= 0x40000000 || offset < -0x40000000) {
+                               pr_err("%s: section %u reloc %u sym '%s': relocation %u out of range (%#lx -> %#x)\n",
+                                      module->name, relindex, i, symname,
+                                      ELF32_R_TYPE(rel->r_info), loc,
+                                      sym->st_value);
+                               return -ENOEXEC;
+                       }
+                       *(u32 *)loc &= 0x80000000;
+                       *(u32 *)loc |= offset & 0x7fffffff;
                        break;
 
                case R_ARM_MOVW_ABS_NC:
index 34e3f3c45634d96182cd94f2672375dd5de32643..f4e54503afa9587106fad19c0932273f81995993 100644 (file)
@@ -81,7 +81,7 @@ __setup("fpe=", fpe_setup);
 extern void init_default_cache_policy(unsigned long);
 extern void paging_init(const struct machine_desc *desc);
 extern void early_paging_init(const struct machine_desc *);
-extern void sanity_check_meminfo(void);
+extern void adjust_lowmem_bounds(void);
 extern enum reboot_mode reboot_mode;
 extern void setup_dma_zone(const struct machine_desc *desc);
 
@@ -1093,8 +1093,14 @@ void __init setup_arch(char **cmdline_p)
        setup_dma_zone(mdesc);
        xen_early_init();
        efi_init();
-       sanity_check_meminfo();
+       /*
+        * Make sure the calculation for lowmem/highmem is set appropriately
+        * before reserving/allocating any memory
+        */
+       adjust_lowmem_bounds();
        arm_memblock_init(mdesc);
+       /* Memory may have been removed so recalculate the bounds. */
+       adjust_lowmem_bounds();
 
        early_ioremap_reset();
 
index c6514ce0fcbc870893f2d2e9fc49044afbf7588f..5a07c5a4b8943c68487cc8669eee0c20408670c2 100644 (file)
@@ -251,7 +251,7 @@ void __cpu_die(unsigned int cpu)
                pr_err("CPU%u: cpu didn't die\n", cpu);
                return;
        }
-       pr_notice("CPU%u: shutdown\n", cpu);
+       pr_debug("CPU%u: shutdown\n", cpu);
 
        /*
         * platform_cpu_kill() is generally expected to do the powering off
index dd77ea25e7ca9f6ff0cdb215fd5d190fbcfa6d11..6dc6d491f88a03d02d643a59dda67c1c2b51cce1 100644 (file)
@@ -27,7 +27,7 @@ static int alpine_boot_secondary(unsigned int cpu, struct task_struct *idle)
 {
        phys_addr_t addr;
 
-       addr = virt_to_phys(secondary_startup);
+       addr = __pa_symbol(secondary_startup);
 
        if (addr > (phys_addr_t)(uint32_t)(-1)) {
                pr_err("FAIL: resume address over 32bit (%pa)", &addr);
index ffbd71d45008a00639d8a496605150a3755caac0..502e3df69f696e4a740cd09def122a923386e7c9 100644 (file)
@@ -25,7 +25,7 @@
 static void write_release_addr(u32 release_phys)
 {
        u32 *virt = (u32 *) phys_to_virt(release_phys);
-       writel_relaxed(virt_to_phys(secondary_startup), virt);
+       writel_relaxed(__pa_symbol(secondary_startup), virt);
        /* Make sure this store is visible to other CPUs */
        smp_wmb();
        __cpuc_flush_dcache_area(virt, sizeof(u32));
index 9b6727ed68cd711632f6dd9fb53d9519f0d02683..f5fb10b4376f7391bc93f5c3a44edad3612c9864 100644 (file)
@@ -135,7 +135,7 @@ static int bcm63138_smp_boot_secondary(unsigned int cpu,
        }
 
        /* Write the secondary init routine to the BootLUT reset vector */
-       val = virt_to_phys(secondary_startup);
+       val = __pa_symbol(secondary_startup);
        writel_relaxed(val, bootlut_base + BOOTLUT_RESET_VECT);
 
        /* Power up the core, will jump straight to its reset vector when we
index 40dc8448445e6e2741c8c9a91ac5cd7768809bd2..12379960e982de5ea427bfee05a28d05a78af680 100644 (file)
@@ -151,7 +151,7 @@ static void brcmstb_cpu_boot(u32 cpu)
         * Set the reset vector to point to the secondary_startup
         * routine
         */
-       cpu_set_boot_addr(cpu, virt_to_phys(secondary_startup));
+       cpu_set_boot_addr(cpu, __pa_symbol(secondary_startup));
 
        /* Unhalt the cpu */
        cpu_rst_cfg_set(cpu, 0);
index 3ac3a9bc663c5889a373a798883205894589d488..582886d0d02f7243d5117a59d88c356ceb4fd11a 100644 (file)
@@ -116,7 +116,7 @@ static int nsp_write_lut(unsigned int cpu)
                return -ENOMEM;
        }
 
-       secondary_startup_phy = virt_to_phys(secondary_startup);
+       secondary_startup_phy = __pa_symbol(secondary_startup);
        BUG_ON(secondary_startup_phy > (phys_addr_t)U32_MAX);
 
        writel_relaxed(secondary_startup_phy, sku_rom_lut);
@@ -189,7 +189,7 @@ static int kona_boot_secondary(unsigned int cpu, struct task_struct *idle)
         * Secondary cores will start in secondary_startup(),
         * defined in "arch/arm/kernel/head.S"
         */
-       boot_func = virt_to_phys(secondary_startup);
+       boot_func = __pa_symbol(secondary_startup);
        BUG_ON(boot_func & BOOT_ADDR_CPUID_MASK);
        BUG_ON(boot_func > (phys_addr_t)U32_MAX);
 
index 93f90688db18301b081d045a1089fe14e63ac2c1..7586b7aec272c0c5b3786d166d026c4f96af4afd 100644 (file)
@@ -15,6 +15,7 @@
 
 #include <asm/cacheflush.h>
 #include <asm/cp15.h>
+#include <asm/memory.h>
 #include <asm/smp_plat.h>
 #include <asm/smp_scu.h>
 
@@ -75,7 +76,7 @@ static void __init berlin_smp_prepare_cpus(unsigned int max_cpus)
        if (!cpu_ctrl)
                goto unmap_scu;
 
-       vectors_base = ioremap(CONFIG_VECTORS_BASE, SZ_32K);
+       vectors_base = ioremap(VECTORS_BASE, SZ_32K);
        if (!vectors_base)
                goto unmap_scu;
 
@@ -92,7 +93,7 @@ static void __init berlin_smp_prepare_cpus(unsigned int max_cpus)
         * Write the secondary startup address into the SW reset address
         * vector. This is used by boot_inst.
         */
-       writel(virt_to_phys(secondary_startup), vectors_base + SW_RESET_ADDR);
+       writel(__pa_symbol(secondary_startup), vectors_base + SW_RESET_ADDR);
 
        iounmap(vectors_base);
 unmap_scu:
index fd6da5419b5107e99346ae12c6c9d6d381ae0be6..e81a78b125d98af5f76568bb72f701400c45c2a9 100644 (file)
@@ -41,7 +41,7 @@ static int exynos_do_idle(unsigned long mode)
        case FW_DO_IDLE_AFTR:
                if (read_cpuid_part() == ARM_CPU_PART_CORTEX_A9)
                        exynos_save_cp15();
-               writel_relaxed(virt_to_phys(exynos_cpu_resume_ns),
+               writel_relaxed(__pa_symbol(exynos_cpu_resume_ns),
                               sysram_ns_base_addr + 0x24);
                writel_relaxed(EXYNOS_AFTR_MAGIC, sysram_ns_base_addr + 0x20);
                if (soc_is_exynos3250()) {
@@ -135,7 +135,7 @@ static int exynos_suspend(void)
                exynos_save_cp15();
 
        writel(EXYNOS_SLEEP_MAGIC, sysram_ns_base_addr + EXYNOS_BOOT_FLAG);
-       writel(virt_to_phys(exynos_cpu_resume_ns),
+       writel(__pa_symbol(exynos_cpu_resume_ns),
                sysram_ns_base_addr + EXYNOS_BOOT_ADDR);
 
        return cpu_suspend(0, exynos_cpu_suspend);
index 038fd8c993d0e6aa58acd7698be6c29cd287d4cc..b42622562ea79871b5c0ccaa960011cd6eef8c6d 100644 (file)
@@ -221,7 +221,7 @@ static void exynos_mcpm_setup_entry_point(void)
         */
        __raw_writel(0xe59f0000, ns_sram_base_addr);     /* ldr r0, [pc, #0] */
        __raw_writel(0xe12fff10, ns_sram_base_addr + 4); /* bx  r0 */
-       __raw_writel(virt_to_phys(mcpm_entry_point), ns_sram_base_addr + 8);
+       __raw_writel(__pa_symbol(mcpm_entry_point), ns_sram_base_addr + 8);
 }
 
 static struct syscore_ops exynos_mcpm_syscore_ops = {
index a5d68411a037994cfcf7f3c2b62c0afb5d91617f..5a03bffe7226030fe528eaf9c4c683b92822b4fd 100644 (file)
@@ -353,7 +353,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
 
                smp_rmb();
 
-               boot_addr = virt_to_phys(exynos4_secondary_startup);
+               boot_addr = __pa_symbol(exynos4_secondary_startup);
 
                ret = exynos_set_boot_addr(core_id, boot_addr);
                if (ret)
@@ -413,7 +413,7 @@ static void __init exynos_smp_prepare_cpus(unsigned int max_cpus)
 
                mpidr = cpu_logical_map(i);
                core_id = MPIDR_AFFINITY_LEVEL(mpidr, 0);
-               boot_addr = virt_to_phys(exynos4_secondary_startup);
+               boot_addr = __pa_symbol(exynos4_secondary_startup);
 
                ret = exynos_set_boot_addr(core_id, boot_addr);
                if (ret)
index 487295f4a56bc07ab2412ca73538f539421234d6..1a7e5b5d08d83234ff58a6a75d6e0c777f0be9c6 100644 (file)
@@ -132,7 +132,7 @@ static void exynos_set_wakeupmask(long mask)
 
 static void exynos_cpu_set_boot_vector(long flags)
 {
-       writel_relaxed(virt_to_phys(exynos_cpu_resume),
+       writel_relaxed(__pa_symbol(exynos_cpu_resume),
                       exynos_boot_vector_addr());
        writel_relaxed(flags, exynos_boot_vector_flag());
 }
@@ -238,7 +238,7 @@ static int exynos_cpu0_enter_aftr(void)
 
 abort:
        if (cpu_online(1)) {
-               unsigned long boot_addr = virt_to_phys(exynos_cpu_resume);
+               unsigned long boot_addr = __pa_symbol(exynos_cpu_resume);
 
                /*
                 * Set the boot vector to something non-zero
@@ -330,7 +330,7 @@ cpu1_aborted:
 
 static void exynos_pre_enter_aftr(void)
 {
-       unsigned long boot_addr = virt_to_phys(exynos_cpu_resume);
+       unsigned long boot_addr = __pa_symbol(exynos_cpu_resume);
 
        (void)exynos_set_boot_addr(1, boot_addr);
 }
index adf4e8f182bd650eb0ce7e3ab148492fae53b58e..748cfb8d521247c2073b8cc064d963c1f7ab9eeb 100644 (file)
@@ -301,7 +301,7 @@ static void exynos_pm_prepare(void)
        exynos_pm_enter_sleep_mode();
 
        /* ensure at least INFORM0 has the resume address */
-       pmu_raw_writel(virt_to_phys(exynos_cpu_resume), S5P_INFORM0);
+       pmu_raw_writel(__pa_symbol(exynos_cpu_resume), S5P_INFORM0);
 }
 
 static void exynos3250_pm_prepare(void)
@@ -318,7 +318,7 @@ static void exynos3250_pm_prepare(void)
        exynos_pm_enter_sleep_mode();
 
        /* ensure at least INFORM0 has the resume address */
-       pmu_raw_writel(virt_to_phys(exynos_cpu_resume), S5P_INFORM0);
+       pmu_raw_writel(__pa_symbol(exynos_cpu_resume), S5P_INFORM0);
 }
 
 static void exynos5420_pm_prepare(void)
@@ -343,7 +343,7 @@ static void exynos5420_pm_prepare(void)
 
        /* ensure at least INFORM0 has the resume address */
        if (IS_ENABLED(CONFIG_EXYNOS5420_MCPM))
-               pmu_raw_writel(virt_to_phys(mcpm_entry_point), S5P_INFORM0);
+               pmu_raw_writel(__pa_symbol(mcpm_entry_point), S5P_INFORM0);
 
        tmp = pmu_raw_readl(EXYNOS_L2_OPTION(0));
        tmp &= ~EXYNOS_L2_USE_RETENTION;
index 4b653a8cb75ce82f2a5ff163bec263591c1a2983..a6c117622d67619ae4eccc7f720b785878935585 100644 (file)
@@ -327,7 +327,7 @@ static int __init hip04_smp_init(void)
         */
        writel_relaxed(hip04_boot_method[0], relocation);
        writel_relaxed(0xa5a5a5a5, relocation + 4);     /* magic number */
-       writel_relaxed(virt_to_phys(secondary_startup), relocation + 8);
+       writel_relaxed(__pa_symbol(secondary_startup), relocation + 8);
        writel_relaxed(0, relocation + 12);
        iounmap(relocation);
 
index e1d67648d5d02ed8d96b3ce6314691459ea2e0fe..91bb02dec20f15a63f438f3ab65fe5dc62a40af7 100644 (file)
@@ -28,7 +28,7 @@ void hi3xxx_set_cpu_jump(int cpu, void *jump_addr)
        cpu = cpu_logical_map(cpu);
        if (!cpu || !ctrl_base)
                return;
-       writel_relaxed(virt_to_phys(jump_addr), ctrl_base + ((cpu - 1) << 2));
+       writel_relaxed(__pa_symbol(jump_addr), ctrl_base + ((cpu - 1) << 2));
 }
 
 int hi3xxx_get_cpu_jump(int cpu)
@@ -118,7 +118,7 @@ static int hix5hd2_boot_secondary(unsigned int cpu, struct task_struct *idle)
 {
        phys_addr_t jumpaddr;
 
-       jumpaddr = virt_to_phys(secondary_startup);
+       jumpaddr = __pa_symbol(secondary_startup);
        hix5hd2_set_scu_boot_addr(HIX5HD2_BOOT_ADDRESS, jumpaddr);
        hix5hd2_set_cpu(cpu, true);
        arch_send_wakeup_ipi_mask(cpumask_of(cpu));
@@ -156,7 +156,7 @@ static int hip01_boot_secondary(unsigned int cpu, struct task_struct *idle)
        struct device_node *node;
 
 
-       jumpaddr = virt_to_phys(secondary_startup);
+       jumpaddr = __pa_symbol(secondary_startup);
        hip01_set_boot_addr(HIP01_BOOT_ADDRESS, jumpaddr);
 
        node = of_find_compatible_node(NULL, NULL, "hisilicon,hip01-sysctrl");
index 711dbbd5baddaac8953d2d90ac1232776efd9f16..c2d1b329fba13da1511af56d0305a01608d55511 100644 (file)
@@ -117,7 +117,7 @@ static void __init ls1021a_smp_prepare_cpus(unsigned int max_cpus)
        dcfg_base = of_iomap(np, 0);
        BUG_ON(!dcfg_base);
 
-       paddr = virt_to_phys(secondary_startup);
+       paddr = __pa_symbol(secondary_startup);
        writel_relaxed(cpu_to_be32(paddr), dcfg_base + DCFG_CCSR_SCRATCHRW1);
 
        iounmap(dcfg_base);
index 1515e498d348c6ea1636149e35665b4fd2e2b239..e61b1d1027e12d5c5284f74f4480dbba70b47b05 100644 (file)
@@ -499,7 +499,7 @@ static int __init imx6q_suspend_init(const struct imx6_pm_socdata *socdata)
        memset(suspend_ocram_base, 0, sizeof(*pm_info));
        pm_info = suspend_ocram_base;
        pm_info->pbase = ocram_pbase;
-       pm_info->resume_addr = virt_to_phys(v7_cpu_resume);
+       pm_info->resume_addr = __pa_symbol(v7_cpu_resume);
        pm_info->pm_info_size = sizeof(*pm_info);
 
        /*
index 70b083fe934a8f7763cd4cfc4dca0fea73f66d3d..495d85d0fe7ef64daf560a9ef53914b4f82fde08 100644 (file)
@@ -99,7 +99,7 @@ void imx_enable_cpu(int cpu, bool enable)
 void imx_set_cpu_jump(int cpu, void *jump_addr)
 {
        cpu = cpu_logical_map(cpu);
-       writel_relaxed(virt_to_phys(jump_addr),
+       writel_relaxed(__pa_symbol(jump_addr),
                       src_base + SRC_GPR1 + cpu * 8);
 }
 
index b821e34474b6fc5a950e1739365b2442cac0a2c5..726eb69bb655decfbd7a817bdddcb8946c761dc4 100644 (file)
@@ -122,7 +122,7 @@ static void __init __mtk_smp_prepare_cpus(unsigned int max_cpus, int trustzone)
         * write the address of slave startup address into the system-wide
         * jump register
         */
-       writel_relaxed(virt_to_phys(secondary_startup_arm),
+       writel_relaxed(__pa_symbol(secondary_startup_arm),
                        mtk_smp_base + mtk_smp_info->jump_reg);
 }
 
index 2990c5269b18586be26dbf3405f0aa696d4d0973..c487be61d6d8c9f00ceb6e131f9ab364d4e56fdd 100644 (file)
@@ -110,7 +110,7 @@ static void mvebu_pm_store_armadaxp_bootinfo(u32 *store_addr)
 {
        phys_addr_t resume_pc;
 
-       resume_pc = virt_to_phys(armada_370_xp_cpu_resume);
+       resume_pc = __pa_symbol(armada_370_xp_cpu_resume);
 
        /*
         * The bootloader expects the first two words to be a magic
index f39bd51bce18b85784e3269a61ddf82a7119d3f8..27a78c80e5b17352aafc4924d04694217bfa5c15 100644 (file)
@@ -112,7 +112,7 @@ static const struct of_device_id of_pmsu_table[] = {
 
 void mvebu_pmsu_set_cpu_boot_addr(int hw_cpu, void *boot_addr)
 {
-       writel(virt_to_phys(boot_addr), pmsu_mp_base +
+       writel(__pa_symbol(boot_addr), pmsu_mp_base +
                PMSU_BOOT_ADDR_REDIRECT_OFFSET(hw_cpu));
 }
 
index 76cbc82a7407e48d03317db872ddad26b8e24de4..04d9ebe6a90a0ccfe427fae9922ef9198bab3a37 100644 (file)
@@ -153,7 +153,7 @@ void mvebu_system_controller_set_cpu_boot_addr(void *boot_addr)
        if (of_machine_is_compatible("marvell,armada375"))
                mvebu_armada375_smp_wa_init();
 
-       writel(virt_to_phys(boot_addr), system_controller_base +
+       writel(__pa_symbol(boot_addr), system_controller_base +
               mvebu_sc->resume_boot_addr);
 }
 #endif
index 1662071bb2cc8361023aa3a066ab963bbd411540..bd8089ff929f61847bf4fc49d4efe0c6fe9c6136 100644 (file)
@@ -315,15 +315,15 @@ void omap3_save_scratchpad_contents(void)
        scratchpad_contents.boot_config_ptr = 0x0;
        if (cpu_is_omap3630())
                scratchpad_contents.public_restore_ptr =
-                       virt_to_phys(omap3_restore_3630);
+                       __pa_symbol(omap3_restore_3630);
        else if (omap_rev() != OMAP3430_REV_ES3_0 &&
                                        omap_rev() != OMAP3430_REV_ES3_1 &&
                                        omap_rev() != OMAP3430_REV_ES3_1_2)
                scratchpad_contents.public_restore_ptr =
-                       virt_to_phys(omap3_restore);
+                       __pa_symbol(omap3_restore);
        else
                scratchpad_contents.public_restore_ptr =
-                       virt_to_phys(omap3_restore_es3);
+                       __pa_symbol(omap3_restore_es3);
 
        if (omap_type() == OMAP2_DEVICE_TYPE_GP)
                scratchpad_contents.secure_ram_restore_ptr = 0x0;
@@ -395,7 +395,7 @@ void omap3_save_scratchpad_contents(void)
        sdrc_block_contents.flags = 0x0;
        sdrc_block_contents.block_size = 0x0;
 
-       arm_context_addr = virt_to_phys(omap3_arm_context);
+       arm_context_addr = __pa_symbol(omap3_arm_context);
 
        /* Copy all the contents to the scratchpad location */
        scratchpad_address = OMAP2_L4_IO_ADDRESS(OMAP343X_SCRATCHPAD);
index 7d62ad48c7c9dd1dfb290cdcf5d61479b081d7a9..113ab2dd2ee91ccf9c238813bd6c4d7561ff7d97 100644 (file)
@@ -273,7 +273,7 @@ int omap4_enter_lowpower(unsigned int cpu, unsigned int power_state)
        cpu_clear_prev_logic_pwrst(cpu);
        pwrdm_set_next_pwrst(pm_info->pwrdm, power_state);
        pwrdm_set_logic_retst(pm_info->pwrdm, cpu_logic_state);
-       set_cpu_wakeup_addr(cpu, virt_to_phys(omap_pm_ops.resume));
+       set_cpu_wakeup_addr(cpu, __pa_symbol(omap_pm_ops.resume));
        omap_pm_ops.scu_prepare(cpu, power_state);
        l2x0_pwrst_prepare(cpu, save_state);
 
@@ -325,7 +325,7 @@ int omap4_hotplug_cpu(unsigned int cpu, unsigned int power_state)
 
        pwrdm_clear_all_prev_pwrst(pm_info->pwrdm);
        pwrdm_set_next_pwrst(pm_info->pwrdm, power_state);
-       set_cpu_wakeup_addr(cpu, virt_to_phys(omap_pm_ops.hotplug_restart));
+       set_cpu_wakeup_addr(cpu, __pa_symbol(omap_pm_ops.hotplug_restart));
        omap_pm_ops.scu_prepare(cpu, power_state);
 
        /*
@@ -467,13 +467,13 @@ void __init omap4_mpuss_early_init(void)
        sar_base = omap4_get_sar_ram_base();
 
        if (cpu_is_omap443x())
-               startup_pa = virt_to_phys(omap4_secondary_startup);
+               startup_pa = __pa_symbol(omap4_secondary_startup);
        else if (cpu_is_omap446x())
-               startup_pa = virt_to_phys(omap4460_secondary_startup);
+               startup_pa = __pa_symbol(omap4460_secondary_startup);
        else if ((__boot_cpu_mode & MODE_MASK) == HYP_MODE)
-               startup_pa = virt_to_phys(omap5_secondary_hyp_startup);
+               startup_pa = __pa_symbol(omap5_secondary_hyp_startup);
        else
-               startup_pa = virt_to_phys(omap5_secondary_startup);
+               startup_pa = __pa_symbol(omap5_secondary_startup);
 
        if (cpu_is_omap44xx())
                writel_relaxed(startup_pa, sar_base +
index b4de3da6dffa5e0593dad0ca305895fc9c341e6a..003353b0b7944d9363fb6446e683314823cd82f9 100644 (file)
@@ -316,9 +316,9 @@ static void __init omap4_smp_prepare_cpus(unsigned int max_cpus)
         * A barrier is added to ensure that write buffer is drained
         */
        if (omap_secure_apis_support())
-               omap_auxcoreboot_addr(virt_to_phys(cfg.startup_addr));
+               omap_auxcoreboot_addr(__pa_symbol(cfg.startup_addr));
        else
-               writel_relaxed(virt_to_phys(cfg.startup_addr),
+               writel_relaxed(__pa_symbol(cfg.startup_addr),
                               base + OMAP_AUX_CORE_BOOT_1);
 }
 
index 0875b99add1870dab41b3b6b029a9a1ef61f708d..75ef5d4be554ce9f8564f347e52da1e6766bf5ac 100644 (file)
@@ -65,7 +65,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle)
         * waiting for. This would wake up the secondary core from WFE
         */
 #define SIRFSOC_CPU1_JUMPADDR_OFFSET 0x2bc
-       __raw_writel(virt_to_phys(sirfsoc_secondary_startup),
+       __raw_writel(__pa_symbol(sirfsoc_secondary_startup),
                clk_base + SIRFSOC_CPU1_JUMPADDR_OFFSET);
 
 #define SIRFSOC_CPU1_WAKEMAGIC_OFFSET 0x2b8
index 83e94c95e314414a6d85e145c9e204f186f9cc75..b0bcf1ff02dd058687a00f173090bc3183a22b63 100644 (file)
@@ -54,7 +54,7 @@ static void sirfsoc_set_sleep_mode(u32 mode)
 
 static int sirfsoc_pre_suspend_power_off(void)
 {
-       u32 wakeup_entry = virt_to_phys(cpu_resume);
+       u32 wakeup_entry = __pa_symbol(cpu_resume);
 
        sirfsoc_rtc_iobrg_writel(wakeup_entry, sirfsoc_pwrc_base +
                SIRFSOC_PWRC_SCRATCH_PAD1);
index 9c308de158c6fa2c0cf07a331c68292f8d4788b6..29630061e7007f8365be536d836b617ebd57ae01 100644 (file)
@@ -249,7 +249,7 @@ static int palmz72_pm_suspend(void)
        store_ptr = *PALMZ72_SAVE_DWORD;
 
        /* Setting PSPR to a proper value */
-       PSPR = virt_to_phys(&palmz72_resume_info);
+       PSPR = __pa_symbol(&palmz72_resume_info);
 
        return 0;
 }
index c725baf119e1135e0b8f796c18e3684dc5b74862..ba431fad5c47fd4456871083c5f81092da6814da 100644 (file)
@@ -85,7 +85,7 @@ static void pxa25x_cpu_pm_enter(suspend_state_t state)
 static int pxa25x_cpu_pm_prepare(void)
 {
        /* set resume return address */
-       PSPR = virt_to_phys(cpu_resume);
+       PSPR = __pa_symbol(cpu_resume);
        return 0;
 }
 
index c0185c5c5a08b4bcc03ff110909bf7dca46c4732..9b69be4e9fe33156837fb7d520476d47cd1e23d1 100644 (file)
@@ -168,7 +168,7 @@ static int pxa27x_cpu_pm_valid(suspend_state_t state)
 static int pxa27x_cpu_pm_prepare(void)
 {
        /* set resume return address */
-       PSPR = virt_to_phys(cpu_resume);
+       PSPR = __pa_symbol(cpu_resume);
        return 0;
 }
 
index 87acc96388c7347949c685e55a5e3b2832a2daf2..0cc9f124c9ac3769c73d52973a34c959eb56ad0a 100644 (file)
@@ -123,7 +123,7 @@ static void pxa3xx_cpu_pm_suspend(void)
        PSPR = 0x5c014000;
 
        /* overwrite with the resume address */
-       *p = virt_to_phys(cpu_resume);
+       *p = __pa_symbol(cpu_resume);
 
        cpu_suspend(0, pxa3xx_finish_suspend);
 
index 70ca99eb52c6ce8d537d0148bd71bc8bbf04282e..c242423bf8db5a5e64d9818f01c42e0b23b699fd 100644 (file)
@@ -76,7 +76,7 @@ static void __init realview_smp_prepare_cpus(unsigned int max_cpus)
        }
        /* Put the boot address in this magic register */
        regmap_write(map, REALVIEW_SYS_FLAGSSET_OFFSET,
-                    virt_to_phys(versatile_secondary_startup));
+                    __pa_symbol(versatile_secondary_startup));
 }
 
 static const struct smp_operations realview_dt_smp_ops __initconst = {
index 4d827a069d49c7a9a5dcae3ce9b2f81d807b3578..3abafdbdd7f4a24d7afed4aa53f196aab838ac92 100644 (file)
@@ -156,7 +156,7 @@ static int rockchip_boot_secondary(unsigned int cpu, struct task_struct *idle)
                 */
                mdelay(1); /* ensure the cpus other than cpu0 to startup */
 
-               writel(virt_to_phys(secondary_startup), sram_base_addr + 8);
+               writel(__pa_symbol(secondary_startup), sram_base_addr + 8);
                writel(0xDEADBEAF, sram_base_addr + 4);
                dsb_sev();
        }
@@ -195,7 +195,7 @@ static int __init rockchip_smp_prepare_sram(struct device_node *node)
        }
 
        /* set the boot function for the sram code */
-       rockchip_boot_fn = virt_to_phys(secondary_startup);
+       rockchip_boot_fn = __pa_symbol(secondary_startup);
 
        /* copy the trampoline to sram, that runs during startup of the core */
        memcpy(sram_base_addr, &rockchip_secondary_trampoline, trampoline_sz);
index bee8c80519299269cde5ba9c852243d6e5a47b05..0592534e0b88c47c203686faafc8ba5650cd7d54 100644 (file)
@@ -62,7 +62,7 @@ static inline u32 rk3288_l2_config(void)
 static void rk3288_config_bootdata(void)
 {
        rkpm_bootdata_cpusp = rk3288_bootram_phy + (SZ_4K - 8);
-       rkpm_bootdata_cpu_code = virt_to_phys(cpu_resume);
+       rkpm_bootdata_cpu_code = __pa_symbol(cpu_resume);
 
        rkpm_bootdata_l2ctlr_f  = 1;
        rkpm_bootdata_l2ctlr = rk3288_l2_config();
index 895aca225952d62f137aae798f603fa2006fc60e..f5b5c49b56ac0b561576ac64acef9c1846fa6020 100644 (file)
@@ -484,7 +484,7 @@ static int jive_pm_suspend(void)
         * correct address to resume from. */
 
        __raw_writel(0x2BED, S3C2412_INFORM0);
-       __raw_writel(virt_to_phys(s3c_cpu_resume), S3C2412_INFORM1);
+       __raw_writel(__pa_symbol(s3c_cpu_resume), S3C2412_INFORM1);
 
        return 0;
 }
index 20e481d8a33a60ff10ab3a3962bf78d5b0241c0c..a4588daeddb0f6ab94b85f837c2ccdb0791c5816 100644 (file)
@@ -45,7 +45,7 @@ static void s3c2410_pm_prepare(void)
 {
        /* ensure at least GSTATUS3 has the resume address */
 
-       __raw_writel(virt_to_phys(s3c_cpu_resume), S3C2410_GSTATUS3);
+       __raw_writel(__pa_symbol(s3c_cpu_resume), S3C2410_GSTATUS3);
 
        S3C_PMDBG("GSTATUS3 0x%08x\n", __raw_readl(S3C2410_GSTATUS3));
        S3C_PMDBG("GSTATUS4 0x%08x\n", __raw_readl(S3C2410_GSTATUS4));
index c0e328e37bd63927fd48b6cc415fef6bc86bdd44..b5bbf0d5985c818947e9b7a44c2a966e0c6c995b 100644 (file)
@@ -48,7 +48,7 @@ static void s3c2416_pm_prepare(void)
         * correct address to resume from.
         */
        __raw_writel(0x2BED, S3C2412_INFORM0);
-       __raw_writel(virt_to_phys(s3c_cpu_resume), S3C2412_INFORM1);
+       __raw_writel(__pa_symbol(s3c_cpu_resume), S3C2412_INFORM1);
 }
 
 static int s3c2416_pm_add(struct device *dev, struct subsys_interface *sif)
index b0be382ff6bb7fa7debce8a9f7cbc2b2384e2bc1..2f579be8fe677adc72032ca91a647ac046f02a4a 100644 (file)
@@ -304,7 +304,7 @@ static void s3c64xx_pm_prepare(void)
                              wake_irqs, ARRAY_SIZE(wake_irqs));
 
        /* store address of resume. */
-       __raw_writel(virt_to_phys(s3c_cpu_resume), S3C64XX_INFORM0);
+       __raw_writel(__pa_symbol(s3c_cpu_resume), S3C64XX_INFORM0);
 
        /* ensure previous wakeup state is cleared before sleeping */
        __raw_writel(__raw_readl(S3C64XX_WAKEUP_STAT), S3C64XX_WAKEUP_STAT);
index 7d69666de5ba2dd6b3c5c705d608ba4ad182767b..07cee14a363b05eb1ca99418eb8d3c0074d0b04d 100644 (file)
@@ -69,7 +69,7 @@ static void s5pv210_pm_prepare(void)
        __raw_writel(s5pv210_irqwake_intmask, S5P_WAKEUP_MASK);
 
        /* ensure at least INFORM0 has the resume address */
-       __raw_writel(virt_to_phys(s5pv210_cpu_resume), S5P_INFORM0);
+       __raw_writel(__pa_symbol(s5pv210_cpu_resume), S5P_INFORM0);
 
        tmp = __raw_readl(S5P_SLEEP_CFG);
        tmp &= ~(S5P_SLEEP_CFG_OSC_EN | S5P_SLEEP_CFG_USBOSC_EN);
index 34853d5dfda28b9e5d5f79205c30c1a71facf5d8..9a7079f565bd394c4ae1436e8a2b14cd52acba34 100644 (file)
@@ -73,7 +73,7 @@ static int sa11x0_pm_enter(suspend_state_t state)
        RCSR = RCSR_HWR | RCSR_SWR | RCSR_WDR | RCSR_SMR;
 
        /* set resume return address */
-       PSPR = virt_to_phys(cpu_resume);
+       PSPR = __pa_symbol(cpu_resume);
 
        /* go zzz */
        cpu_suspend(0, sa1100_finish_suspend);
index e19266844e16126b855bd9ab1f82e3ae5fa7a5f4..3ca2c13346f0cbc35d291cfdb28c966b71aba67e 100644 (file)
@@ -190,7 +190,7 @@ static void apmu_parse_dt(void (*fn)(struct resource *res, int cpu, int bit))
 static void __init shmobile_smp_apmu_setup_boot(void)
 {
        /* install boot code shared by all CPUs */
-       shmobile_boot_fn = virt_to_phys(shmobile_smp_boot);
+       shmobile_boot_fn = __pa_symbol(shmobile_smp_boot);
 }
 
 void __init shmobile_smp_apmu_prepare_cpus(unsigned int max_cpus,
@@ -204,7 +204,7 @@ void __init shmobile_smp_apmu_prepare_cpus(unsigned int max_cpus,
 int shmobile_smp_apmu_boot_secondary(unsigned int cpu, struct task_struct *idle)
 {
        /* For this particular CPU register boot vector */
-       shmobile_smp_hook(cpu, virt_to_phys(secondary_startup), 0);
+       shmobile_smp_hook(cpu, __pa_symbol(secondary_startup), 0);
 
        return apmu_wrap(cpu, apmu_power_on);
 }
@@ -308,7 +308,7 @@ int shmobile_smp_apmu_cpu_kill(unsigned int cpu)
 #if defined(CONFIG_SUSPEND)
 static int shmobile_smp_apmu_do_suspend(unsigned long cpu)
 {
-       shmobile_smp_hook(cpu, virt_to_phys(cpu_resume), 0);
+       shmobile_smp_hook(cpu, __pa_symbol(cpu_resume), 0);
        shmobile_smp_apmu_cpu_shutdown(cpu);
        cpu_do_idle(); /* WFI selects Core Standby */
        return 1;
index d1ecaf37d1422d214d56177f42da9af9529454fb..f1a1efde4beb19c2b520188aa9dac6fed428c224 100644 (file)
@@ -24,7 +24,7 @@ static void __iomem *shmobile_scu_base;
 static int shmobile_scu_cpu_prepare(unsigned int cpu)
 {
        /* For this particular CPU register SCU SMP boot vector */
-       shmobile_smp_hook(cpu, virt_to_phys(shmobile_boot_scu),
+       shmobile_smp_hook(cpu, __pa_symbol(shmobile_boot_scu),
                          shmobile_scu_base_phys);
        return 0;
 }
@@ -33,7 +33,7 @@ void __init shmobile_smp_scu_prepare_cpus(phys_addr_t scu_base_phys,
                                          unsigned int max_cpus)
 {
        /* install boot code shared by all CPUs */
-       shmobile_boot_fn = virt_to_phys(shmobile_smp_boot);
+       shmobile_boot_fn = __pa_symbol(shmobile_smp_boot);
 
        /* enable SCU and cache coherency on booting CPU */
        shmobile_scu_base_phys = scu_base_phys;
index 07945748b57141f2c216d729a6f895d36227be7a..0ee76772b50743f099637e18b4859d5736644b23 100644 (file)
@@ -40,7 +40,7 @@ static int socfpga_boot_secondary(unsigned int cpu, struct task_struct *idle)
 
                memcpy(phys_to_virt(0), &secondary_trampoline, trampoline_size);
 
-               writel(virt_to_phys(secondary_startup),
+               writel(__pa_symbol(secondary_startup),
                       sys_manager_base_addr + (socfpga_cpu1start_addr & 0x000000ff));
 
                flush_cache_all();
@@ -63,7 +63,7 @@ static int socfpga_a10_boot_secondary(unsigned int cpu, struct task_struct *idle
                       SOCFPGA_A10_RSTMGR_MODMPURST);
                memcpy(phys_to_virt(0), &secondary_trampoline, trampoline_size);
 
-               writel(virt_to_phys(secondary_startup),
+               writel(__pa_symbol(secondary_startup),
                       sys_manager_base_addr + (socfpga_cpu1start_addr & 0x00000fff));
 
                flush_cache_all();
index 8d1e2d55178684cd96c49828eb5390e24286ff7c..39038a03836acb8f3288488f063a99d5ef0f814c 100644 (file)
@@ -117,7 +117,7 @@ static void __init spear13xx_smp_prepare_cpus(unsigned int max_cpus)
         * (presently it is in SRAM). The BootMonitor waits until it receives a
         * soft interrupt, and then the secondary CPU branches to this address.
         */
-       __raw_writel(virt_to_phys(spear13xx_secondary_startup), SYS_LOCATION);
+       __raw_writel(__pa_symbol(spear13xx_secondary_startup), SYS_LOCATION);
 }
 
 const struct smp_operations spear13xx_smp_ops __initconst = {
index ea5a2277ee46b4132edfea00fd6e6c07a91be630..231f19e174365229f034c9897b6e61984b92d84c 100644 (file)
@@ -103,7 +103,7 @@ static void __init sti_smp_prepare_cpus(unsigned int max_cpus)
        u32 __iomem *cpu_strt_ptr;
        u32 release_phys;
        int cpu;
-       unsigned long entry_pa = virt_to_phys(sti_secondary_startup);
+       unsigned long entry_pa = __pa_symbol(sti_secondary_startup);
 
        np = of_find_compatible_node(NULL, NULL, "arm,cortex-a9-scu");
 
index 6642267812c96fc3f2b0e1f694b5bf3667dd20ed..8fb5088464db3dc932d367ff8272e303d1401b48 100644 (file)
@@ -80,7 +80,7 @@ static int sun6i_smp_boot_secondary(unsigned int cpu,
        spin_lock(&cpu_lock);
 
        /* Set CPU boot address */
-       writel(virt_to_phys(secondary_startup),
+       writel(__pa_symbol(secondary_startup),
               cpucfg_membase + CPUCFG_PRIVATE0_REG);
 
        /* Assert the CPU core in reset */
@@ -162,7 +162,7 @@ static int sun8i_smp_boot_secondary(unsigned int cpu,
        spin_lock(&cpu_lock);
 
        /* Set CPU boot address */
-       writel(virt_to_phys(secondary_startup),
+       writel(__pa_symbol(secondary_startup),
               cpucfg_membase + CPUCFG_PRIVATE0_REG);
 
        /* Assert the CPU core in reset */
index 98c62a4a8623df12f3e9a4d2fdd6118af7befd75..2f0c6c050fed742de088c13b74e70f1537ca70ae 100644 (file)
@@ -5,7 +5,7 @@
 
 static int tango_boot_secondary(unsigned int cpu, struct task_struct *idle)
 {
-       tango_set_aux_boot_addr(virt_to_phys(secondary_startup));
+       tango_set_aux_boot_addr(__pa_symbol(secondary_startup));
        tango_start_aux_core(cpu);
        return 0;
 }
index b05c6d6f99d072b5ef006004475776d14513745c..406c0814eb6e6195bf61990d73febbec8109bdd7 100644 (file)
@@ -5,7 +5,7 @@
 
 static int tango_pm_powerdown(unsigned long arg)
 {
-       tango_suspend(virt_to_phys(cpu_resume));
+       tango_suspend(__pa_symbol(cpu_resume));
 
        return -EIO; /* tango_suspend has failed */
 }
index 6fd9db54887eeebd400e425a216bce2cce9399b2..dc558892753c69c3c12829d27f03282f9ae1e49b 100644 (file)
@@ -94,14 +94,14 @@ void __init tegra_cpu_reset_handler_init(void)
        __tegra_cpu_reset_handler_data[TEGRA_RESET_MASK_PRESENT] =
                *((u32 *)cpu_possible_mask);
        __tegra_cpu_reset_handler_data[TEGRA_RESET_STARTUP_SECONDARY] =
-               virt_to_phys((void *)secondary_startup);
+               __pa_symbol((void *)secondary_startup);
 #endif
 
 #ifdef CONFIG_PM_SLEEP
        __tegra_cpu_reset_handler_data[TEGRA_RESET_STARTUP_LP1] =
                TEGRA_IRAM_LPx_RESUME_AREA;
        __tegra_cpu_reset_handler_data[TEGRA_RESET_STARTUP_LP2] =
-               virt_to_phys((void *)tegra_resume);
+               __pa_symbol((void *)tegra_resume);
 #endif
 
        tegra_cpu_reset_handler_enable();
index e0ee139fdebfeffbd51918948d3e94909ccc89b6..9b124c22035f4a336456eaaa5afc92799f4c68d2 100644 (file)
@@ -79,7 +79,7 @@ static int ux500_boot_secondary(unsigned int cpu, struct task_struct *idle)
         * backup ram register at offset 0x1FF0, which is what boot rom code
         * is waiting for. This will wake up the secondary core from WFE.
         */
-       writel(virt_to_phys(secondary_startup),
+       writel(__pa_symbol(secondary_startup),
               backupram + UX500_CPU1_JUMPADDR_OFFSET);
        writel(0xA1FEED01,
               backupram + UX500_CPU1_WAKEMAGIC_OFFSET);
index 5cedcf572104bcdf82b1b75ab61f89f4cdf19651..ee2a0faafaa19309ca660543b71819cee82fc72c 100644 (file)
@@ -166,7 +166,7 @@ static int __init dcscb_init(void)
         * Future entries into the kernel can now go
         * through the cluster entry vectors.
         */
-       vexpress_flags_set(virt_to_phys(mcpm_entry_point));
+       vexpress_flags_set(__pa_symbol(mcpm_entry_point));
 
        return 0;
 }
index 98e29dee91e865f7c76408c845e99ff2af14ca92..742499bac6d09f27eaed8580e2df4b06e63a3252 100644 (file)
@@ -79,7 +79,7 @@ static void __init vexpress_smp_dt_prepare_cpus(unsigned int max_cpus)
         * until it receives a soft interrupt, and then the
         * secondary CPU branches to this address.
         */
-       vexpress_flags_set(virt_to_phys(versatile_secondary_startup));
+       vexpress_flags_set(__pa_symbol(versatile_secondary_startup));
 }
 
 const struct smp_operations vexpress_smp_dt_ops __initconst = {
index 1aa4ccece69f97cb06dacb98f146d91bde6560d2..9b5f3c427086cd28f2e3781df87b560de7161167 100644 (file)
@@ -54,7 +54,7 @@ static int tc2_pm_cpu_powerup(unsigned int cpu, unsigned int cluster)
        if (cluster >= TC2_CLUSTERS || cpu >= tc2_nr_cpus[cluster])
                return -EINVAL;
        ve_spc_set_resume_addr(cluster, cpu,
-                              virt_to_phys(mcpm_entry_point));
+                              __pa_symbol(mcpm_entry_point));
        ve_spc_cpu_wakeup_irq(cluster, cpu, true);
        return 0;
 }
@@ -159,7 +159,7 @@ static int tc2_pm_wait_for_powerdown(unsigned int cpu, unsigned int cluster)
 
 static void tc2_pm_cpu_suspend_prepare(unsigned int cpu, unsigned int cluster)
 {
-       ve_spc_set_resume_addr(cluster, cpu, virt_to_phys(mcpm_entry_point));
+       ve_spc_set_resume_addr(cluster, cpu, __pa_symbol(mcpm_entry_point));
 }
 
 static void tc2_pm_cpu_is_up(unsigned int cpu, unsigned int cluster)
index 0297f92084e048234ed557443983735079d7ac2a..afb9a82dedc314b9b63f3bb7f0cfc8c81c8a6697 100644 (file)
@@ -76,7 +76,7 @@ void __init zx_smp_prepare_cpus(unsigned int max_cpus)
         * until it receives a soft interrupt, and then the
         * secondary CPU branches to this address.
         */
-       __raw_writel(virt_to_phys(zx_secondary_startup),
+       __raw_writel(__pa_symbol(zx_secondary_startup),
                     aonsysctrl_base + AON_SYS_CTRL_RESERVED1);
 
        iounmap(aonsysctrl_base);
@@ -94,7 +94,7 @@ void __init zx_smp_prepare_cpus(unsigned int max_cpus)
 
        /* Map the first 4 KB IRAM for suspend usage */
        sys_iram = __arm_ioremap_exec(ZX_IRAM_BASE, PAGE_SIZE, false);
-       zx_secondary_startup_pa = virt_to_phys(zx_secondary_startup);
+       zx_secondary_startup_pa = __pa_symbol(zx_secondary_startup);
        fncpy(sys_iram, &zx_resume_jump, zx_suspend_iram_sz);
 }
 
index 7cd9865bdeb7bed3c46c575fb5fea5a022ac251b..caa6d5fe9078326ea65d29d8d6359e1a82306969 100644 (file)
@@ -89,7 +89,7 @@ EXPORT_SYMBOL(zynq_cpun_start);
 
 static int zynq_boot_secondary(unsigned int cpu, struct task_struct *idle)
 {
-       return zynq_cpun_start(virt_to_phys(secondary_startup), cpu);
+       return zynq_cpun_start(__pa_symbol(secondary_startup), cpu);
 }
 
 /*
index 35e3a56e5d865bc38fed7a86ce91563c08e3dfa7..c6c4c9c8824b0f4b3c354538306d1c195ad7e90d 100644 (file)
@@ -29,6 +29,7 @@ config CPU_ARM720T
        select CPU_COPY_V4WT if MMU
        select CPU_CP15_MMU
        select CPU_PABRT_LEGACY
+       select CPU_THUMB_CAPABLE
        select CPU_TLB_V4WT if MMU
        help
          A 32-bit RISC processor with 8kByte Cache, Write Buffer and
@@ -46,6 +47,7 @@ config CPU_ARM740T
        select CPU_CACHE_V4
        select CPU_CP15_MPU
        select CPU_PABRT_LEGACY
+       select CPU_THUMB_CAPABLE
        help
          A 32-bit RISC processor with 8KB cache or 4KB variants,
          write buffer and MPU(Protection Unit) built around
@@ -79,6 +81,7 @@ config CPU_ARM920T
        select CPU_COPY_V4WB if MMU
        select CPU_CP15_MMU
        select CPU_PABRT_LEGACY
+       select CPU_THUMB_CAPABLE
        select CPU_TLB_V4WBI if MMU
        help
          The ARM920T is licensed to be produced by numerous vendors,
@@ -97,6 +100,7 @@ config CPU_ARM922T
        select CPU_COPY_V4WB if MMU
        select CPU_CP15_MMU
        select CPU_PABRT_LEGACY
+       select CPU_THUMB_CAPABLE
        select CPU_TLB_V4WBI if MMU
        help
          The ARM922T is a version of the ARM920T, but with smaller
@@ -116,6 +120,7 @@ config CPU_ARM925T
        select CPU_COPY_V4WB if MMU
        select CPU_CP15_MMU
        select CPU_PABRT_LEGACY
+       select CPU_THUMB_CAPABLE
        select CPU_TLB_V4WBI if MMU
        help
          The ARM925T is a mix between the ARM920T and ARM926T, but with
@@ -134,6 +139,7 @@ config CPU_ARM926T
        select CPU_COPY_V4WB if MMU
        select CPU_CP15_MMU
        select CPU_PABRT_LEGACY
+       select CPU_THUMB_CAPABLE
        select CPU_TLB_V4WBI if MMU
        help
          This is a variant of the ARM920.  It has slightly different
@@ -170,6 +176,7 @@ config CPU_ARM940T
        select CPU_CACHE_VIVT
        select CPU_CP15_MPU
        select CPU_PABRT_LEGACY
+       select CPU_THUMB_CAPABLE
        help
          ARM940T is a member of the ARM9TDMI family of general-
          purpose microprocessors with MPU and separate 4KB
@@ -188,6 +195,7 @@ config CPU_ARM946E
        select CPU_CACHE_VIVT
        select CPU_CP15_MPU
        select CPU_PABRT_LEGACY
+       select CPU_THUMB_CAPABLE
        help
          ARM946E-S is a member of the ARM9E-S family of high-
          performance, 32-bit system-on-chip processor solutions.
@@ -206,6 +214,7 @@ config CPU_ARM1020
        select CPU_COPY_V4WB if MMU
        select CPU_CP15_MMU
        select CPU_PABRT_LEGACY
+       select CPU_THUMB_CAPABLE
        select CPU_TLB_V4WBI if MMU
        help
          The ARM1020 is the 32K cached version of the ARM10 processor,
@@ -225,6 +234,7 @@ config CPU_ARM1020E
        select CPU_COPY_V4WB if MMU
        select CPU_CP15_MMU
        select CPU_PABRT_LEGACY
+       select CPU_THUMB_CAPABLE
        select CPU_TLB_V4WBI if MMU
 
 # ARM1022E
@@ -236,6 +246,7 @@ config CPU_ARM1022
        select CPU_COPY_V4WB if MMU # can probably do better
        select CPU_CP15_MMU
        select CPU_PABRT_LEGACY
+       select CPU_THUMB_CAPABLE
        select CPU_TLB_V4WBI if MMU
        help
          The ARM1022E is an implementation of the ARMv5TE architecture
@@ -254,6 +265,7 @@ config CPU_ARM1026
        select CPU_COPY_V4WB if MMU # can probably do better
        select CPU_CP15_MMU
        select CPU_PABRT_LEGACY
+       select CPU_THUMB_CAPABLE
        select CPU_TLB_V4WBI if MMU
        help
          The ARM1026EJ-S is an implementation of the ARMv5TEJ architecture
@@ -302,6 +314,7 @@ config CPU_XSCALE
        select CPU_CACHE_VIVT
        select CPU_CP15_MMU
        select CPU_PABRT_LEGACY
+       select CPU_THUMB_CAPABLE
        select CPU_TLB_V4WBI if MMU
 
 # XScale Core Version 3
@@ -312,6 +325,7 @@ config CPU_XSC3
        select CPU_CACHE_VIVT
        select CPU_CP15_MMU
        select CPU_PABRT_LEGACY
+       select CPU_THUMB_CAPABLE
        select CPU_TLB_V4WBI if MMU
        select IO_36
 
@@ -324,6 +338,7 @@ config CPU_MOHAWK
        select CPU_COPY_V4WB if MMU
        select CPU_CP15_MMU
        select CPU_PABRT_LEGACY
+       select CPU_THUMB_CAPABLE
        select CPU_TLB_V4WBI if MMU
 
 # Feroceon
@@ -335,6 +350,7 @@ config CPU_FEROCEON
        select CPU_COPY_FEROCEON if MMU
        select CPU_CP15_MMU
        select CPU_PABRT_LEGACY
+       select CPU_THUMB_CAPABLE
        select CPU_TLB_FEROCEON if MMU
 
 config CPU_FEROCEON_OLD_ID
@@ -367,6 +383,7 @@ config CPU_V6
        select CPU_CP15_MMU
        select CPU_HAS_ASID if MMU
        select CPU_PABRT_V6
+       select CPU_THUMB_CAPABLE
        select CPU_TLB_V6 if MMU
 
 # ARMv6k
@@ -381,6 +398,7 @@ config CPU_V6K
        select CPU_CP15_MMU
        select CPU_HAS_ASID if MMU
        select CPU_PABRT_V6
+       select CPU_THUMB_CAPABLE
        select CPU_TLB_V6 if MMU
 
 # ARMv7
@@ -396,6 +414,7 @@ config CPU_V7
        select CPU_CP15_MPU if !MMU
        select CPU_HAS_ASID if MMU
        select CPU_PABRT_V7
+       select CPU_THUMB_CAPABLE
        select CPU_TLB_V7 if MMU
 
 # ARMv7M
@@ -410,11 +429,17 @@ config CPU_V7M
 
 config CPU_THUMBONLY
        bool
+       select CPU_THUMB_CAPABLE
        # There are no CPUs available with MMU that don't implement an ARM ISA:
        depends on !MMU
        help
          Select this if your CPU doesn't support the 32 bit ARM instructions.
 
+config CPU_THUMB_CAPABLE
+       bool
+       help
+         Select this if your CPU can support Thumb mode.
+
 # Figure out what processor architecture version we should be using.
 # This defines the compiler instruction set which depends on the machine type.
 config CPU_32v3
@@ -655,11 +680,7 @@ config ARCH_DMA_ADDR_T_64BIT
 
 config ARM_THUMB
        bool "Support Thumb user binaries" if !CPU_THUMBONLY
-       depends on CPU_ARM720T || CPU_ARM740T || CPU_ARM920T || CPU_ARM922T || \
-               CPU_ARM925T || CPU_ARM926T || CPU_ARM940T || CPU_ARM946E || \
-               CPU_ARM1020 || CPU_ARM1020E || CPU_ARM1022 || CPU_ARM1026 || \
-               CPU_XSCALE || CPU_XSC3 || CPU_MOHAWK || CPU_V6 || CPU_V6K || \
-               CPU_V7 || CPU_FEROCEON || CPU_V7M
+       depends on CPU_THUMB_CAPABLE
        default y
        help
          Say Y if you want to include kernel support for running user space
index e8698241ece904180372e1cd06101037363347ff..b3dea80715b47d3930f25d5b403aafcec3a6fb16 100644 (file)
@@ -14,6 +14,7 @@ endif
 
 obj-$(CONFIG_ARM_PTDUMP)       += dump.o
 obj-$(CONFIG_MODULES)          += proc-syms.o
+obj-$(CONFIG_DEBUG_VIRTUAL)    += physaddr.o
 
 obj-$(CONFIG_ALIGNMENT_TRAP)   += alignment.o
 obj-$(CONFIG_HIGHMEM)          += highmem.o
index dfe97b40991609fc6f66629614b7a7dea0d8f044..f57b080b6fd4055bfe9e106f44cbf8352c96c5be 100644 (file)
@@ -15,6 +15,7 @@
 
 #define pr_fmt(fmt)            "uniphier: " fmt
 
+#include <linux/bitops.h>
 #include <linux/init.h>
 #include <linux/io.h>
 #include <linux/log2.h>
@@ -71,8 +72,7 @@
  * @ctrl_base: virtual base address of control registers
  * @rev_base: virtual base address of revision registers
  * @op_base: virtual base address of operation registers
- * @way_present_mask: each bit specifies if the way is present
- * @way_locked_mask: each bit specifies if the way is locked
+ * @way_mask: each bit specifies if the way is present
  * @nsets: number of associativity sets
  * @line_size: line size in bytes
  * @range_op_max_size: max size that can be handled by a single range operation
@@ -83,8 +83,7 @@ struct uniphier_cache_data {
        void __iomem *rev_base;
        void __iomem *op_base;
        void __iomem *way_ctrl_base;
-       u32 way_present_mask;
-       u32 way_locked_mask;
+       u32 way_mask;
        u32 nsets;
        u32 line_size;
        u32 range_op_max_size;
@@ -234,17 +233,13 @@ static void __uniphier_cache_enable(struct uniphier_cache_data *data, bool on)
        writel_relaxed(val, data->ctrl_base + UNIPHIER_SSCC);
 }
 
-static void __init __uniphier_cache_set_locked_ways(
-                                       struct uniphier_cache_data *data,
-                                       u32 way_mask)
+static void __init __uniphier_cache_set_active_ways(
+                                       struct uniphier_cache_data *data)
 {
        unsigned int cpu;
 
-       data->way_locked_mask = way_mask & data->way_present_mask;
-
        for_each_possible_cpu(cpu)
-               writel_relaxed(~data->way_locked_mask & data->way_present_mask,
-                              data->way_ctrl_base + 4 * cpu);
+               writel_relaxed(data->way_mask, data->way_ctrl_base + 4 * cpu);
 }
 
 static void uniphier_cache_maint_range(unsigned long start, unsigned long end,
@@ -307,7 +302,7 @@ static void __init uniphier_cache_enable(void)
 
        list_for_each_entry(data, &uniphier_cache_list, list) {
                __uniphier_cache_enable(data, true);
-               __uniphier_cache_set_locked_ways(data, 0);
+               __uniphier_cache_set_active_ways(data);
        }
 }
 
@@ -382,8 +377,8 @@ static int __init __uniphier_cache_init(struct device_node *np,
                goto err;
        }
 
-       data->way_present_mask =
-               ((u32)1 << cache_size / data->nsets / data->line_size) - 1;
+       data->way_mask = GENMASK(cache_size / data->nsets / data->line_size - 1,
+                                0);
 
        data->ctrl_base = of_iomap(np, 0);
        if (!data->ctrl_base) {
index e309a5e2c9350788e6e651140d2c1efa03c7e667..63eabb06f9f17551695e89efc0ed59e0ce6ba186 100644 (file)
@@ -870,6 +870,9 @@ static int __arm_dma_mmap(struct device *dev, struct vm_area_struct *vma,
                                      vma->vm_end - vma->vm_start,
                                      vma->vm_page_prot);
        }
+#else
+       ret = vm_iomap_memory(vma, vma->vm_start,
+                             (vma->vm_end - vma->vm_start));
 #endif /* CONFIG_MMU */
 
        return ret;
index 9fe8e241335c6edcb0db5077f5d4621aefb68944..21192d6eda401a76787f377795c330199f881e48 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/seq_file.h>
 
 #include <asm/fixmap.h>
+#include <asm/memory.h>
 #include <asm/pgtable.h>
 
 struct addr_marker {
@@ -31,8 +32,8 @@ static struct addr_marker address_markers[] = {
        { 0,                    "vmalloc() Area" },
        { VMALLOC_END,          "vmalloc() End" },
        { FIXADDR_START,        "Fixmap Area" },
-       { CONFIG_VECTORS_BASE,  "Vectors" },
-       { CONFIG_VECTORS_BASE + PAGE_SIZE * 2, "Vectors End" },
+       { VECTORS_BASE, "Vectors" },
+       { VECTORS_BASE + PAGE_SIZE * 2, "Vectors End" },
        { -1,                   NULL },
 };
 
index 3cced8455727953a2525571c5a62b5ad884e8bee..f1e6190aa7eaedf5adacca20c09cb7c3702bee96 100644 (file)
@@ -327,6 +327,12 @@ void flush_dcache_page(struct page *page)
        if (page == ZERO_PAGE(0))
                return;
 
+       if (!cache_ops_need_broadcast() && cache_is_vipt_nonaliasing()) {
+               if (test_bit(PG_dcache_clean, &page->flags))
+                       clear_bit(PG_dcache_clean, &page->flags);
+               return;
+       }
+
        mapping = page_mapping(page);
 
        if (!cache_ops_need_broadcast() &&
index 4be0bee4c35700aacbaaa3994d5ae4a7ce92bfb5..bf4d3bc41a7a85e5144eeecb1231418f9870f54e 100644 (file)
@@ -27,6 +27,7 @@
 #include <asm/cp15.h>
 #include <asm/mach-types.h>
 #include <asm/memblock.h>
+#include <asm/memory.h>
 #include <asm/prom.h>
 #include <asm/sections.h>
 #include <asm/setup.h>
@@ -227,41 +228,59 @@ phys_addr_t __init arm_memblock_steal(phys_addr_t size, phys_addr_t align)
        return phys;
 }
 
-void __init arm_memblock_init(const struct machine_desc *mdesc)
+static void __init arm_initrd_init(void)
 {
-       /* Register the kernel text, kernel data and initrd with memblock. */
-#ifdef CONFIG_XIP_KERNEL
-       memblock_reserve(__pa(_sdata), _end - _sdata);
-#else
-       memblock_reserve(__pa(_stext), _end - _stext);
-#endif
 #ifdef CONFIG_BLK_DEV_INITRD
+       phys_addr_t start;
+       unsigned long size;
+
        /* FDT scan will populate initrd_start */
        if (initrd_start && !phys_initrd_size) {
                phys_initrd_start = __virt_to_phys(initrd_start);
                phys_initrd_size = initrd_end - initrd_start;
        }
+
        initrd_start = initrd_end = 0;
-       if (phys_initrd_size &&
-           !memblock_is_region_memory(phys_initrd_start, phys_initrd_size)) {
+
+       if (!phys_initrd_size)
+               return;
+
+       /*
+        * Round the memory region to page boundaries as per free_initrd_mem()
+        * This allows us to detect whether the pages overlapping the initrd
+        * are in use, but more importantly, reserves the entire set of pages
+        * as we don't want these pages allocated for other purposes.
+        */
+       start = round_down(phys_initrd_start, PAGE_SIZE);
+       size = phys_initrd_size + (phys_initrd_start - start);
+       size = round_up(size, PAGE_SIZE);
+
+       if (!memblock_is_region_memory(start, size)) {
                pr_err("INITRD: 0x%08llx+0x%08lx is not a memory region - disabling initrd\n",
-                      (u64)phys_initrd_start, phys_initrd_size);
-               phys_initrd_start = phys_initrd_size = 0;
+                      (u64)start, size);
+               return;
        }
-       if (phys_initrd_size &&
-           memblock_is_region_reserved(phys_initrd_start, phys_initrd_size)) {
+
+       if (memblock_is_region_reserved(start, size)) {
                pr_err("INITRD: 0x%08llx+0x%08lx overlaps in-use memory region - disabling initrd\n",
-                      (u64)phys_initrd_start, phys_initrd_size);
-               phys_initrd_start = phys_initrd_size = 0;
+                      (u64)start, size);
+               return;
        }
-       if (phys_initrd_size) {
-               memblock_reserve(phys_initrd_start, phys_initrd_size);
 
-               /* Now convert initrd to virtual addresses */
-               initrd_start = __phys_to_virt(phys_initrd_start);
-               initrd_end = initrd_start + phys_initrd_size;
-       }
+       memblock_reserve(start, size);
+
+       /* Now convert initrd to virtual addresses */
+       initrd_start = __phys_to_virt(phys_initrd_start);
+       initrd_end = initrd_start + phys_initrd_size;
 #endif
+}
+
+void __init arm_memblock_init(const struct machine_desc *mdesc)
+{
+       /* Register the kernel text, kernel data and initrd with memblock. */
+       memblock_reserve(__pa(KERNEL_START), KERNEL_END - KERNEL_START);
+
+       arm_initrd_init();
 
        arm_mm_memblock_reserve();
 
@@ -521,8 +540,7 @@ void __init mem_init(void)
                        "      .data : 0x%p" " - 0x%p" "   (%4td kB)\n"
                        "       .bss : 0x%p" " - 0x%p" "   (%4td kB)\n",
 
-                       MLK(UL(CONFIG_VECTORS_BASE), UL(CONFIG_VECTORS_BASE) +
-                               (PAGE_SIZE)),
+                       MLK(VECTORS_BASE, VECTORS_BASE + PAGE_SIZE),
 #ifdef CONFIG_HAVE_TCM
                        MLK(DTCM_OFFSET, (unsigned long) dtcm_end),
                        MLK(ITCM_OFFSET, (unsigned long) itcm_end),
index 4001dd15818d79aea7e586d19e70943c8b689975..4e016d7f37b3af6568282aa1a909a14406247bb2 100644 (file)
@@ -1152,13 +1152,12 @@ early_param("vmalloc", early_vmalloc);
 
 phys_addr_t arm_lowmem_limit __initdata = 0;
 
-void __init sanity_check_meminfo(void)
+void __init adjust_lowmem_bounds(void)
 {
        phys_addr_t memblock_limit = 0;
-       int highmem = 0;
        u64 vmalloc_limit;
        struct memblock_region *reg;
-       bool should_use_highmem = false;
+       phys_addr_t lowmem_limit = 0;
 
        /*
         * Let's use our own (unoptimized) equivalent of __pa() that is
@@ -1172,43 +1171,18 @@ void __init sanity_check_meminfo(void)
        for_each_memblock(memory, reg) {
                phys_addr_t block_start = reg->base;
                phys_addr_t block_end = reg->base + reg->size;
-               phys_addr_t size_limit = reg->size;
 
-               if (reg->base >= vmalloc_limit)
-                       highmem = 1;
-               else
-                       size_limit = vmalloc_limit - reg->base;
-
-
-               if (!IS_ENABLED(CONFIG_HIGHMEM) || cache_is_vipt_aliasing()) {
-
-                       if (highmem) {
-                               pr_notice("Ignoring RAM at %pa-%pa (!CONFIG_HIGHMEM)\n",
-                                         &block_start, &block_end);
-                               memblock_remove(reg->base, reg->size);
-                               should_use_highmem = true;
-                               continue;
-                       }
-
-                       if (reg->size > size_limit) {
-                               phys_addr_t overlap_size = reg->size - size_limit;
-
-                               pr_notice("Truncating RAM at %pa-%pa",
-                                         &block_start, &block_end);
-                               block_end = vmalloc_limit;
-                               pr_cont(" to -%pa", &block_end);
-                               memblock_remove(vmalloc_limit, overlap_size);
-                               should_use_highmem = true;
-                       }
-               }
-
-               if (!highmem) {
-                       if (block_end > arm_lowmem_limit) {
-                               if (reg->size > size_limit)
-                                       arm_lowmem_limit = vmalloc_limit;
-                               else
-                                       arm_lowmem_limit = block_end;
-                       }
+               if (reg->base < vmalloc_limit) {
+                       if (block_end > lowmem_limit)
+                               /*
+                                * Compare as u64 to ensure vmalloc_limit does
+                                * not get truncated. block_end should always
+                                * fit in phys_addr_t so there should be no
+                                * issue with assignment.
+                                */
+                               lowmem_limit = min_t(u64,
+                                                        vmalloc_limit,
+                                                        block_end);
 
                        /*
                         * Find the first non-pmd-aligned page, and point
@@ -1227,14 +1201,13 @@ void __init sanity_check_meminfo(void)
                                if (!IS_ALIGNED(block_start, PMD_SIZE))
                                        memblock_limit = block_start;
                                else if (!IS_ALIGNED(block_end, PMD_SIZE))
-                                       memblock_limit = arm_lowmem_limit;
+                                       memblock_limit = lowmem_limit;
                        }
 
                }
        }
 
-       if (should_use_highmem)
-               pr_notice("Consider using a HIGHMEM enabled kernel.\n");
+       arm_lowmem_limit = lowmem_limit;
 
        high_memory = __va(arm_lowmem_limit - 1) + 1;
 
@@ -1248,6 +1221,18 @@ void __init sanity_check_meminfo(void)
        if (!memblock_limit)
                memblock_limit = arm_lowmem_limit;
 
+       if (!IS_ENABLED(CONFIG_HIGHMEM) || cache_is_vipt_aliasing()) {
+               if (memblock_end_of_DRAM() > arm_lowmem_limit) {
+                       phys_addr_t end = memblock_end_of_DRAM();
+
+                       pr_notice("Ignoring RAM at %pa-%pa\n",
+                                 &memblock_limit, &end);
+                       pr_notice("Consider using a HIGHMEM enabled kernel.\n");
+
+                       memblock_remove(memblock_limit, end - memblock_limit);
+               }
+       }
+
        memblock_set_current_limit(memblock_limit);
 }
 
@@ -1437,11 +1422,7 @@ static void __init kmap_init(void)
 static void __init map_lowmem(void)
 {
        struct memblock_region *reg;
-#ifdef CONFIG_XIP_KERNEL
-       phys_addr_t kernel_x_start = round_down(__pa(_sdata), SECTION_SIZE);
-#else
-       phys_addr_t kernel_x_start = round_down(__pa(_stext), SECTION_SIZE);
-#endif
+       phys_addr_t kernel_x_start = round_down(__pa(KERNEL_START), SECTION_SIZE);
        phys_addr_t kernel_x_end = round_up(__pa(__init_end), SECTION_SIZE);
 
        /* Map all the lowmem memory banks. */
index 2740967727e2057ef8897e4d4335d38acaeaac2c..3b5c7aaf9c76c522f8c6cbd7890c5105b3a3e1d4 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/kernel.h>
 
 #include <asm/cacheflush.h>
+#include <asm/cp15.h>
 #include <asm/sections.h>
 #include <asm/page.h>
 #include <asm/setup.h>
@@ -22,6 +23,8 @@
 
 #include "mm.h"
 
+unsigned long vectors_base;
+
 #ifdef CONFIG_ARM_MPU
 struct mpu_rgn_info mpu_rgn_info;
 
@@ -85,7 +88,7 @@ static unsigned long irbar_read(void)
 }
 
 /* MPU initialisation functions */
-void __init sanity_check_meminfo_mpu(void)
+void __init adjust_lowmem_bounds_mpu(void)
 {
        phys_addr_t phys_offset = PHYS_OFFSET;
        phys_addr_t aligned_region_size, specified_mem_size, rounded_mem_size;
@@ -274,19 +277,64 @@ void __init mpu_setup(void)
        }
 }
 #else
-static void sanity_check_meminfo_mpu(void) {}
+static void adjust_lowmem_bounds_mpu(void) {}
 static void __init mpu_setup(void) {}
 #endif /* CONFIG_ARM_MPU */
 
+#ifdef CONFIG_CPU_CP15
+#ifdef CONFIG_CPU_HIGH_VECTOR
+static unsigned long __init setup_vectors_base(void)
+{
+       unsigned long reg = get_cr();
+
+       set_cr(reg | CR_V);
+       return 0xffff0000;
+}
+#else /* CONFIG_CPU_HIGH_VECTOR */
+/* Write exception base address to VBAR */
+static inline void set_vbar(unsigned long val)
+{
+       asm("mcr p15, 0, %0, c12, c0, 0" : : "r" (val) : "cc");
+}
+
+/*
+ * Security extensions, bits[7:4], permitted values,
+ * 0b0000 - not implemented, 0b0001/0b0010 - implemented
+ */
+static inline bool security_extensions_enabled(void)
+{
+       return !!cpuid_feature_extract(CPUID_EXT_PFR1, 4);
+}
+
+static unsigned long __init setup_vectors_base(void)
+{
+       unsigned long base = 0, reg = get_cr();
+
+       set_cr(reg & ~CR_V);
+       if (security_extensions_enabled()) {
+               if (IS_ENABLED(CONFIG_REMAP_VECTORS_TO_RAM))
+                       base = CONFIG_DRAM_BASE;
+               set_vbar(base);
+       } else if (IS_ENABLED(CONFIG_REMAP_VECTORS_TO_RAM)) {
+               if (CONFIG_DRAM_BASE != 0)
+                       pr_err("Security extensions not enabled, vectors cannot be remapped to RAM, vectors base will be 0x00000000\n");
+       }
+
+       return base;
+}
+#endif /* CONFIG_CPU_HIGH_VECTOR */
+#endif /* CONFIG_CPU_CP15 */
+
 void __init arm_mm_memblock_reserve(void)
 {
 #ifndef CONFIG_CPU_V7M
+       vectors_base = IS_ENABLED(CONFIG_CPU_CP15) ? setup_vectors_base() : 0;
        /*
         * Register the exception vector page.
         * some architectures which the DRAM is the exception vector to trap,
         * alloc_page breaks with error, although it is not NULL, but "0."
         */
-       memblock_reserve(CONFIG_VECTORS_BASE, 2 * PAGE_SIZE);
+       memblock_reserve(vectors_base, 2 * PAGE_SIZE);
 #else /* ifndef CONFIG_CPU_V7M */
        /*
         * There is no dedicated vector page on V7-M. So nothing needs to be
@@ -295,10 +343,10 @@ void __init arm_mm_memblock_reserve(void)
 #endif
 }
 
-void __init sanity_check_meminfo(void)
+void __init adjust_lowmem_bounds(void)
 {
        phys_addr_t end;
-       sanity_check_meminfo_mpu();
+       adjust_lowmem_bounds_mpu();
        end = memblock_end_of_DRAM();
        high_memory = __va(end - 1) + 1;
        memblock_set_current_limit(end);
@@ -310,7 +358,7 @@ void __init sanity_check_meminfo(void)
  */
 void __init paging_init(const struct machine_desc *mdesc)
 {
-       early_trap_init((void *)CONFIG_VECTORS_BASE);
+       early_trap_init((void *)vectors_base);
        mpu_setup();
        bootmem_init();
 }
diff --git a/arch/arm/mm/physaddr.c b/arch/arm/mm/physaddr.c
new file mode 100644 (file)
index 0000000..02e60f4
--- /dev/null
@@ -0,0 +1,57 @@
+#include <linux/bug.h>
+#include <linux/export.h>
+#include <linux/types.h>
+#include <linux/mmdebug.h>
+#include <linux/mm.h>
+
+#include <asm/sections.h>
+#include <asm/memory.h>
+#include <asm/fixmap.h>
+#include <asm/dma.h>
+
+#include "mm.h"
+
+static inline bool __virt_addr_valid(unsigned long x)
+{
+       /*
+        * high_memory does not get immediately defined, and there
+        * are early callers of __pa() against PAGE_OFFSET
+        */
+       if (!high_memory && x >= PAGE_OFFSET)
+               return true;
+
+       if (high_memory && x >= PAGE_OFFSET && x < (unsigned long)high_memory)
+               return true;
+
+       /*
+        * MAX_DMA_ADDRESS is a virtual address that may not correspond to an
+        * actual physical address. Enough code relies on __pa(MAX_DMA_ADDRESS)
+        * that we just need to work around it and always return true.
+        */
+       if (x == MAX_DMA_ADDRESS)
+               return true;
+
+       return false;
+}
+
+phys_addr_t __virt_to_phys(unsigned long x)
+{
+       WARN(!__virt_addr_valid(x),
+            "virt_to_phys used for non-linear address: %pK (%pS)\n",
+            (void *)x, (void *)x);
+
+       return __virt_to_phys_nodebug(x);
+}
+EXPORT_SYMBOL(__virt_to_phys);
+
+phys_addr_t __phys_addr_symbol(unsigned long x)
+{
+       /* This is bounds checking against the kernel image only.
+        * __pa_symbol should only be used on kernel symbol addresses.
+        */
+       VIRTUAL_BUG_ON(x < (unsigned long)KERNEL_START ||
+                      x > (unsigned long)KERNEL_END);
+
+       return __pa_symbol_nodebug(x);
+}
+EXPORT_SYMBOL(__phys_addr_symbol);
index 8567c851172c78535e5d6b2fee9c96003155169d..4261b3282ad99dd87799683e33b2945bcfb20746 100644 (file)
@@ -1865,14 +1865,14 @@ static void __smp_spurious_interrupt(u8 vector)
                "should never happen.\n", vector, smp_processor_id());
 }
 
-__visible void smp_spurious_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs)
 {
        entering_irq();
        __smp_spurious_interrupt(~regs->orig_ax);
        exiting_irq();
 }
 
-__visible void smp_trace_spurious_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_trace_spurious_interrupt(struct pt_regs *regs)
 {
        u8 vector = ~regs->orig_ax;
 
@@ -1923,14 +1923,14 @@ static void __smp_error_interrupt(struct pt_regs *regs)
 
 }
 
-__visible void smp_error_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs)
 {
        entering_irq();
        __smp_error_interrupt(regs);
        exiting_irq();
 }
 
-__visible void smp_trace_error_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_trace_error_interrupt(struct pt_regs *regs)
 {
        entering_irq();
        trace_error_apic_entry(ERROR_APIC_VECTOR);
index 5d30c5e42bb13939b9164ac575d9202a26d772a7..f3557a1eb562fbe6e46b2e3db3289ca8535b1b6c 100644 (file)
@@ -559,7 +559,7 @@ void send_cleanup_vector(struct irq_cfg *cfg)
                __send_cleanup_vector(data);
 }
 
-asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
+asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void)
 {
        unsigned vector, me;
 
index 9e5427df3243430a752e4f1425d906de45a458e7..524cc5780a779630d3203d834b0a508097340c67 100644 (file)
@@ -816,14 +816,14 @@ static inline void __smp_deferred_error_interrupt(void)
        deferred_error_int_vector();
 }
 
-asmlinkage __visible void smp_deferred_error_interrupt(void)
+asmlinkage __visible void __irq_entry smp_deferred_error_interrupt(void)
 {
        entering_irq();
        __smp_deferred_error_interrupt();
        exiting_ack_irq();
 }
 
-asmlinkage __visible void smp_trace_deferred_error_interrupt(void)
+asmlinkage __visible void __irq_entry smp_trace_deferred_error_interrupt(void)
 {
        entering_irq();
        trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR);
index 85469f84c9214027aab98273ac952a94291173b1..d7cc190ae45719bf84d721e781dc7fde00c85e14 100644 (file)
@@ -396,14 +396,16 @@ static inline void __smp_thermal_interrupt(void)
        smp_thermal_vector();
 }
 
-asmlinkage __visible void smp_thermal_interrupt(struct pt_regs *regs)
+asmlinkage __visible void __irq_entry
+smp_thermal_interrupt(struct pt_regs *regs)
 {
        entering_irq();
        __smp_thermal_interrupt();
        exiting_ack_irq();
 }
 
-asmlinkage __visible void smp_trace_thermal_interrupt(struct pt_regs *regs)
+asmlinkage __visible void __irq_entry
+smp_trace_thermal_interrupt(struct pt_regs *regs)
 {
        entering_irq();
        trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
index 9beb092d68a514b7572ae7c906084a80411dbef8..bb0e75eed10a10a1b19daf82eeef12aefd8be2db 100644 (file)
@@ -23,14 +23,14 @@ static inline void __smp_threshold_interrupt(void)
        mce_threshold_vector();
 }
 
-asmlinkage __visible void smp_threshold_interrupt(void)
+asmlinkage __visible void __irq_entry smp_threshold_interrupt(void)
 {
        entering_irq();
        __smp_threshold_interrupt();
        exiting_ack_irq();
 }
 
-asmlinkage __visible void smp_trace_threshold_interrupt(void)
+asmlinkage __visible void __irq_entry smp_trace_threshold_interrupt(void)
 {
        entering_irq();
        trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR);
index 7c6e9ffe4424d9d4cc8c88b3ed1166dd299409e1..4d8183b5f11323789f133de97bd32a6c8c403524 100644 (file)
@@ -264,7 +264,7 @@ void __smp_x86_platform_ipi(void)
                x86_platform_ipi_callback();
 }
 
-__visible void smp_x86_platform_ipi(struct pt_regs *regs)
+__visible void __irq_entry smp_x86_platform_ipi(struct pt_regs *regs)
 {
        struct pt_regs *old_regs = set_irq_regs(regs);
 
@@ -315,7 +315,7 @@ __visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs)
 }
 #endif
 
-__visible void smp_trace_x86_platform_ipi(struct pt_regs *regs)
+__visible void __irq_entry smp_trace_x86_platform_ipi(struct pt_regs *regs)
 {
        struct pt_regs *old_regs = set_irq_regs(regs);
 
index 3512ba607361403e587f417cbce2775cdec428a1..275487872be2b35e7c6d01335fb08170b2cf7fa0 100644 (file)
@@ -9,6 +9,7 @@
 #include <linux/hardirq.h>
 #include <asm/apic.h>
 #include <asm/trace/irq_vectors.h>
+#include <linux/interrupt.h>
 
 static inline void __smp_irq_work_interrupt(void)
 {
@@ -16,14 +17,14 @@ static inline void __smp_irq_work_interrupt(void)
        irq_work_run();
 }
 
-__visible void smp_irq_work_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_irq_work_interrupt(struct pt_regs *regs)
 {
        ipi_entering_ack_irq();
        __smp_irq_work_interrupt();
        exiting_irq();
 }
 
-__visible void smp_trace_irq_work_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_trace_irq_work_interrupt(struct pt_regs *regs)
 {
        ipi_entering_ack_irq();
        trace_irq_work_entry(IRQ_WORK_VECTOR);
index 69780edf0dde90c8c5b04211525dcd1137d73e73..4bf0c8926a1c061bb22c1973962a0d668bfa3d98 100644 (file)
@@ -575,7 +575,9 @@ static void __init reserve_crashkernel(void)
        /* 0 means: find the address automatically */
        if (crash_base <= 0) {
                /*
-                *  kexec want bzImage is below CRASH_KERNEL_ADDR_MAX
+                * Set CRASH_ADDR_LOW_MAX upper bound for crash memory,
+                * as old kexec-tools loads bzImage below that, unless
+                * "crashkernel=size[KMG],high" is specified.
                 */
                crash_base = memblock_find_in_range(CRASH_ALIGN,
                                                    high ? CRASH_ADDR_HIGH_MAX
index 68f8cc222f255aa1cf5266e2d84d7ceeb2417977..d3c66a15bbde00e254461922a5a0cb8f1018c166 100644 (file)
@@ -259,7 +259,7 @@ static inline void __smp_reschedule_interrupt(void)
        scheduler_ipi();
 }
 
-__visible void smp_reschedule_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_reschedule_interrupt(struct pt_regs *regs)
 {
        ack_APIC_irq();
        __smp_reschedule_interrupt();
@@ -268,7 +268,7 @@ __visible void smp_reschedule_interrupt(struct pt_regs *regs)
         */
 }
 
-__visible void smp_trace_reschedule_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_trace_reschedule_interrupt(struct pt_regs *regs)
 {
        /*
         * Need to call irq_enter() before calling the trace point.
@@ -292,14 +292,15 @@ static inline void __smp_call_function_interrupt(void)
        inc_irq_stat(irq_call_count);
 }
 
-__visible void smp_call_function_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_call_function_interrupt(struct pt_regs *regs)
 {
        ipi_entering_ack_irq();
        __smp_call_function_interrupt();
        exiting_irq();
 }
 
-__visible void smp_trace_call_function_interrupt(struct pt_regs *regs)
+__visible void __irq_entry
+smp_trace_call_function_interrupt(struct pt_regs *regs)
 {
        ipi_entering_ack_irq();
        trace_call_function_entry(CALL_FUNCTION_VECTOR);
@@ -314,14 +315,16 @@ static inline void __smp_call_function_single_interrupt(void)
        inc_irq_stat(irq_call_count);
 }
 
-__visible void smp_call_function_single_interrupt(struct pt_regs *regs)
+__visible void __irq_entry
+smp_call_function_single_interrupt(struct pt_regs *regs)
 {
        ipi_entering_ack_irq();
        __smp_call_function_single_interrupt();
        exiting_irq();
 }
 
-__visible void smp_trace_call_function_single_interrupt(struct pt_regs *regs)
+__visible void __irq_entry
+smp_trace_call_function_single_interrupt(struct pt_regs *regs)
 {
        ipi_entering_ack_irq();
        trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR);
index e79f15f108a8d43d8d5644f7fc967f32c3282b47..ad0118fbce90d79e80211a923fced106c39f9316 100644 (file)
@@ -346,6 +346,7 @@ SECTIONS
        /DISCARD/ : {
                *(.eh_frame)
                *(__func_stack_frame_non_standard)
+               *(__unreachable)
        }
 }
 
index 362cecc77130260459d81d18d8853f39a7eb35eb..4d680772379828423d8605b1cae8c5da271ec5b8 100644 (file)
@@ -123,9 +123,11 @@ static int atomic_dec_return_safe(atomic_t *v)
 #define RBD_FEATURE_LAYERING   (1<<0)
 #define RBD_FEATURE_STRIPINGV2 (1<<1)
 #define RBD_FEATURE_EXCLUSIVE_LOCK (1<<2)
+#define RBD_FEATURE_DATA_POOL (1<<7)
 #define RBD_FEATURES_ALL       (RBD_FEATURE_LAYERING |         \
                                 RBD_FEATURE_STRIPINGV2 |       \
-                                RBD_FEATURE_EXCLUSIVE_LOCK)
+                                RBD_FEATURE_EXCLUSIVE_LOCK |   \
+                                RBD_FEATURE_DATA_POOL)
 
 /* Features supported by this (client software) implementation. */
 
@@ -144,10 +146,9 @@ struct rbd_image_header {
        /* These six fields never change for a given rbd image */
        char *object_prefix;
        __u8 obj_order;
-       __u8 crypt_type;
-       __u8 comp_type;
        u64 stripe_unit;
        u64 stripe_count;
+       s64 data_pool_id;
        u64 features;           /* Might be changeable someday? */
 
        /* The remaining fields need to be updated occasionally */
@@ -230,7 +231,7 @@ enum obj_req_flags {
 };
 
 struct rbd_obj_request {
-       const char              *object_name;
+       u64                     object_no;
        u64                     offset;         /* object start byte */
        u64                     length;         /* bytes from offset */
        unsigned long           flags;
@@ -438,7 +439,6 @@ static DEFINE_SPINLOCK(rbd_client_list_lock);
 
 static struct kmem_cache       *rbd_img_request_cache;
 static struct kmem_cache       *rbd_obj_request_cache;
-static struct kmem_cache       *rbd_segment_name_cache;
 
 static int rbd_major;
 static DEFINE_IDA(rbd_dev_id_ida);
@@ -972,6 +972,30 @@ static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
        return true;
 }
 
+/*
+ * returns the size of an object in the image
+ */
+static u32 rbd_obj_bytes(struct rbd_image_header *header)
+{
+       return 1U << header->obj_order;
+}
+
+static void rbd_init_layout(struct rbd_device *rbd_dev)
+{
+       if (rbd_dev->header.stripe_unit == 0 ||
+           rbd_dev->header.stripe_count == 0) {
+               rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
+               rbd_dev->header.stripe_count = 1;
+       }
+
+       rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
+       rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
+       rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
+       rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
+                         rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
+       RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
+}
+
 /*
  * Fill an rbd image header with information from the given format 1
  * on-disk header.
@@ -992,15 +1016,11 @@ static int rbd_header_from_disk(struct rbd_device *rbd_dev,
        /* Allocate this now to avoid having to handle failure below */
 
        if (first_time) {
-               size_t len;
-
-               len = strnlen(ondisk->object_prefix,
-                               sizeof (ondisk->object_prefix));
-               object_prefix = kmalloc(len + 1, GFP_KERNEL);
+               object_prefix = kstrndup(ondisk->object_prefix,
+                                        sizeof(ondisk->object_prefix),
+                                        GFP_KERNEL);
                if (!object_prefix)
                        return -ENOMEM;
-               memcpy(object_prefix, ondisk->object_prefix, len);
-               object_prefix[len] = '\0';
        }
 
        /* Allocate the snapshot context and fill it in */
@@ -1051,12 +1071,7 @@ static int rbd_header_from_disk(struct rbd_device *rbd_dev,
        if (first_time) {
                header->object_prefix = object_prefix;
                header->obj_order = ondisk->options.order;
-               header->crypt_type = ondisk->options.crypt_type;
-               header->comp_type = ondisk->options.comp_type;
-               /* The rest aren't used for format 1 images */
-               header->stripe_unit = 0;
-               header->stripe_count = 0;
-               header->features = 0;
+               rbd_init_layout(rbd_dev);
        } else {
                ceph_put_snap_context(header->snapc);
                kfree(header->snap_names);
@@ -1232,42 +1247,9 @@ static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
        rbd_dev->mapping.features = 0;
 }
 
-static void rbd_segment_name_free(const char *name)
-{
-       /* The explicit cast here is needed to drop the const qualifier */
-
-       kmem_cache_free(rbd_segment_name_cache, (void *)name);
-}
-
-static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
-{
-       char *name;
-       u64 segment;
-       int ret;
-       char *name_format;
-
-       name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO);
-       if (!name)
-               return NULL;
-       segment = offset >> rbd_dev->header.obj_order;
-       name_format = "%s.%012llx";
-       if (rbd_dev->image_format == 2)
-               name_format = "%s.%016llx";
-       ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format,
-                       rbd_dev->header.object_prefix, segment);
-       if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) {
-               pr_err("error formatting segment name for #%llu (%d)\n",
-                       segment, ret);
-               rbd_segment_name_free(name);
-               name = NULL;
-       }
-
-       return name;
-}
-
 static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
 {
-       u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
+       u64 segment_size = rbd_obj_bytes(&rbd_dev->header);
 
        return offset & (segment_size - 1);
 }
@@ -1275,7 +1257,7 @@ static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
 static u64 rbd_segment_length(struct rbd_device *rbd_dev,
                                u64 offset, u64 length)
 {
-       u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
+       u64 segment_size = rbd_obj_bytes(&rbd_dev->header);
 
        offset &= segment_size - 1;
 
@@ -1286,14 +1268,6 @@ static u64 rbd_segment_length(struct rbd_device *rbd_dev,
        return length;
 }
 
-/*
- * returns the size of an object in the image
- */
-static u64 rbd_obj_bytes(struct rbd_image_header *header)
-{
-       return 1 << header->obj_order;
-}
-
 /*
  * bio helpers
  */
@@ -1623,7 +1597,9 @@ static void rbd_obj_request_submit(struct rbd_obj_request *obj_request)
 {
        struct ceph_osd_request *osd_req = obj_request->osd_req;
 
-       dout("%s %p osd_req %p\n", __func__, obj_request, osd_req);
+       dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__,
+            obj_request, obj_request->object_no, obj_request->offset,
+            obj_request->length, osd_req);
        if (obj_request_img_data_test(obj_request)) {
                WARN_ON(obj_request->callback != rbd_img_obj_callback);
                rbd_img_request_get(obj_request->img_request);
@@ -1631,44 +1607,6 @@ static void rbd_obj_request_submit(struct rbd_obj_request *obj_request)
        ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
 }
 
-static void rbd_obj_request_end(struct rbd_obj_request *obj_request)
-{
-       dout("%s %p\n", __func__, obj_request);
-       ceph_osdc_cancel_request(obj_request->osd_req);
-}
-
-/*
- * Wait for an object request to complete.  If interrupted, cancel the
- * underlying osd request.
- *
- * @timeout: in jiffies, 0 means "wait forever"
- */
-static int __rbd_obj_request_wait(struct rbd_obj_request *obj_request,
-                                 unsigned long timeout)
-{
-       long ret;
-
-       dout("%s %p\n", __func__, obj_request);
-       ret = wait_for_completion_interruptible_timeout(
-                                       &obj_request->completion,
-                                       ceph_timeout_jiffies(timeout));
-       if (ret <= 0) {
-               if (ret == 0)
-                       ret = -ETIMEDOUT;
-               rbd_obj_request_end(obj_request);
-       } else {
-               ret = 0;
-       }
-
-       dout("%s %p ret %d\n", __func__, obj_request, (int)ret);
-       return ret;
-}
-
-static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
-{
-       return __rbd_obj_request_wait(obj_request, 0);
-}
-
 static void rbd_img_request_complete(struct rbd_img_request *img_request)
 {
 
@@ -1955,8 +1893,8 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
                rbd_osd_call_callback(obj_request);
                break;
        default:
-               rbd_warn(NULL, "%s: unsupported op %hu",
-                       obj_request->object_name, (unsigned short) opcode);
+               rbd_warn(NULL, "unexpected OSD op: object_no %016llx opcode %d",
+                        obj_request->object_no, opcode);
                break;
        }
 
@@ -1980,6 +1918,40 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
        osd_req->r_data_offset = obj_request->offset;
 }
 
+static struct ceph_osd_request *
+__rbd_osd_req_create(struct rbd_device *rbd_dev,
+                    struct ceph_snap_context *snapc,
+                    int num_ops, unsigned int flags,
+                    struct rbd_obj_request *obj_request)
+{
+       struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+       struct ceph_osd_request *req;
+       const char *name_format = rbd_dev->image_format == 1 ?
+                                     RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
+
+       req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
+       if (!req)
+               return NULL;
+
+       req->r_flags = flags;
+       req->r_callback = rbd_osd_req_callback;
+       req->r_priv = obj_request;
+
+       req->r_base_oloc.pool = rbd_dev->layout.pool_id;
+       if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
+                       rbd_dev->header.object_prefix, obj_request->object_no))
+               goto err_req;
+
+       if (ceph_osdc_alloc_messages(req, GFP_NOIO))
+               goto err_req;
+
+       return req;
+
+err_req:
+       ceph_osdc_put_request(req);
+       return NULL;
+}
+
 /*
  * Create an osd request.  A read request has one osd op (read).
  * A write request has either one (watch) or two (hint+write) osd ops.
@@ -1993,8 +1965,6 @@ static struct ceph_osd_request *rbd_osd_req_create(
                                        struct rbd_obj_request *obj_request)
 {
        struct ceph_snap_context *snapc = NULL;
-       struct ceph_osd_client *osdc;
-       struct ceph_osd_request *osd_req;
 
        if (obj_request_img_data_test(obj_request) &&
                (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) {
@@ -2009,35 +1979,9 @@ static struct ceph_osd_request *rbd_osd_req_create(
 
        rbd_assert(num_ops == 1 || ((op_type == OBJ_OP_WRITE) && num_ops == 2));
 
-       /* Allocate and initialize the request, for the num_ops ops */
-
-       osdc = &rbd_dev->rbd_client->client->osdc;
-       osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false,
-                                         GFP_NOIO);
-       if (!osd_req)
-               goto fail;
-
-       if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
-               osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
-       else
-               osd_req->r_flags = CEPH_OSD_FLAG_READ;
-
-       osd_req->r_callback = rbd_osd_req_callback;
-       osd_req->r_priv = obj_request;
-
-       osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id;
-       if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
-                            obj_request->object_name))
-               goto fail;
-
-       if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO))
-               goto fail;
-
-       return osd_req;
-
-fail:
-       ceph_osdc_put_request(osd_req);
-       return NULL;
+       return __rbd_osd_req_create(rbd_dev, snapc, num_ops,
+           (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) ?
+           CEPH_OSD_FLAG_WRITE : CEPH_OSD_FLAG_READ, obj_request);
 }
 
 /*
@@ -2050,10 +1994,6 @@ static struct ceph_osd_request *
 rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
 {
        struct rbd_img_request *img_request;
-       struct ceph_snap_context *snapc;
-       struct rbd_device *rbd_dev;
-       struct ceph_osd_client *osdc;
-       struct ceph_osd_request *osd_req;
        int num_osd_ops = 3;
 
        rbd_assert(obj_request_img_data_test(obj_request));
@@ -2065,77 +2005,34 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
        if (img_request_discard_test(img_request))
                num_osd_ops = 2;
 
-       /* Allocate and initialize the request, for all the ops */
-
-       snapc = img_request->snapc;
-       rbd_dev = img_request->rbd_dev;
-       osdc = &rbd_dev->rbd_client->client->osdc;
-       osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops,
-                                               false, GFP_NOIO);
-       if (!osd_req)
-               goto fail;
-
-       osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
-       osd_req->r_callback = rbd_osd_req_callback;
-       osd_req->r_priv = obj_request;
-
-       osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id;
-       if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
-                            obj_request->object_name))
-               goto fail;
-
-       if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO))
-               goto fail;
-
-       return osd_req;
-
-fail:
-       ceph_osdc_put_request(osd_req);
-       return NULL;
+       return __rbd_osd_req_create(img_request->rbd_dev,
+                                   img_request->snapc, num_osd_ops,
+                                   CEPH_OSD_FLAG_WRITE, obj_request);
 }
 
-
 static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
 {
        ceph_osdc_put_request(osd_req);
 }
 
-/* object_name is assumed to be a non-null pointer and NUL-terminated */
-
-static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
-                                               u64 offset, u64 length,
-                                               enum obj_request_type type)
+static struct rbd_obj_request *
+rbd_obj_request_create(enum obj_request_type type)
 {
        struct rbd_obj_request *obj_request;
-       size_t size;
-       char *name;
 
        rbd_assert(obj_request_type_valid(type));
 
-       size = strlen(object_name) + 1;
-       name = kmalloc(size, GFP_NOIO);
-       if (!name)
-               return NULL;
-
        obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
-       if (!obj_request) {
-               kfree(name);
+       if (!obj_request)
                return NULL;
-       }
 
-       obj_request->object_name = memcpy(name, object_name, size);
-       obj_request->offset = offset;
-       obj_request->length = length;
-       obj_request->flags = 0;
        obj_request->which = BAD_WHICH;
        obj_request->type = type;
        INIT_LIST_HEAD(&obj_request->links);
        init_completion(&obj_request->completion);
        kref_init(&obj_request->kref);
 
-       dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
-               offset, length, (int)type, obj_request);
-
+       dout("%s %p\n", __func__, obj_request);
        return obj_request;
 }
 
@@ -2170,8 +2067,6 @@ static void rbd_obj_request_destroy(struct kref *kref)
                break;
        }
 
-       kfree(obj_request->object_name);
-       obj_request->object_name = NULL;
        kmem_cache_free(rbd_obj_request_cache, obj_request);
 }
 
@@ -2546,22 +2441,18 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request,
 
        while (resid) {
                struct ceph_osd_request *osd_req;
-               const char *object_name;
-               u64 offset;
-               u64 length;
+               u64 object_no = img_offset >> rbd_dev->header.obj_order;
+               u64 offset = rbd_segment_offset(rbd_dev, img_offset);
+               u64 length = rbd_segment_length(rbd_dev, img_offset, resid);
 
-               object_name = rbd_segment_name(rbd_dev, img_offset);
-               if (!object_name)
-                       goto out_unwind;
-               offset = rbd_segment_offset(rbd_dev, img_offset);
-               length = rbd_segment_length(rbd_dev, img_offset, resid);
-               obj_request = rbd_obj_request_create(object_name,
-                                               offset, length, type);
-               /* object request has its own copy of the object name */
-               rbd_segment_name_free(object_name);
+               obj_request = rbd_obj_request_create(type);
                if (!obj_request)
                        goto out_unwind;
 
+               obj_request->object_no = object_no;
+               obj_request->offset = offset;
+               obj_request->length = length;
+
                /*
                 * set obj_request->img_request before creating the
                 * osd_request so that it gets the right snapc
@@ -2771,7 +2662,7 @@ static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
         * child image to which the original request was to be sent.
         */
        img_offset = obj_request->img_offset - obj_request->offset;
-       length = (u64)1 << rbd_dev->header.obj_order;
+       length = rbd_obj_bytes(&rbd_dev->header);
 
        /*
         * There is no defined parent data beyond the parent
@@ -2900,11 +2791,12 @@ static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
        size_t size;
        int ret;
 
-       stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
-                                             OBJ_REQUEST_PAGES);
+       stat_request = rbd_obj_request_create(OBJ_REQUEST_PAGES);
        if (!stat_request)
                return -ENOMEM;
 
+       stat_request->object_no = obj_request->object_no;
+
        stat_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
                                                   stat_request);
        if (!stat_request->osd_req) {
@@ -3983,17 +3875,17 @@ out:
  * returned in the outbound buffer, or a negative error code.
  */
 static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
-                            const char *object_name,
-                            const char *class_name,
+                            struct ceph_object_id *oid,
+                            struct ceph_object_locator *oloc,
                             const char *method_name,
                             const void *outbound,
                             size_t outbound_size,
                             void *inbound,
                             size_t inbound_size)
 {
-       struct rbd_obj_request *obj_request;
-       struct page **pages;
-       u32 page_count;
+       struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+       struct page *req_page = NULL;
+       struct page *reply_page;
        int ret;
 
        /*
@@ -4003,61 +3895,35 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
         * method.  Currently if this is present it will be a
         * snapshot id.
         */
-       page_count = (u32)calc_pages_for(0, inbound_size);
-       pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
-       if (IS_ERR(pages))
-               return PTR_ERR(pages);
-
-       ret = -ENOMEM;
-       obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
-                                                       OBJ_REQUEST_PAGES);
-       if (!obj_request)
-               goto out;
+       if (outbound) {
+               if (outbound_size > PAGE_SIZE)
+                       return -E2BIG;
 
-       obj_request->pages = pages;
-       obj_request->page_count = page_count;
-
-       obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
-                                                 obj_request);
-       if (!obj_request->osd_req)
-               goto out;
-
-       osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
-                                       class_name, method_name);
-       if (outbound_size) {
-               struct ceph_pagelist *pagelist;
-
-               pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
-               if (!pagelist)
-                       goto out;
+               req_page = alloc_page(GFP_KERNEL);
+               if (!req_page)
+                       return -ENOMEM;
 
-               ceph_pagelist_init(pagelist);
-               ceph_pagelist_append(pagelist, outbound, outbound_size);
-               osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
-                                               pagelist);
+               memcpy(page_address(req_page), outbound, outbound_size);
        }
-       osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
-                                       obj_request->pages, inbound_size,
-                                       0, false, false);
-
-       rbd_obj_request_submit(obj_request);
-       ret = rbd_obj_request_wait(obj_request);
-       if (ret)
-               goto out;
 
-       ret = obj_request->result;
-       if (ret < 0)
-               goto out;
+       reply_page = alloc_page(GFP_KERNEL);
+       if (!reply_page) {
+               if (req_page)
+                       __free_page(req_page);
+               return -ENOMEM;
+       }
 
-       rbd_assert(obj_request->xferred < (u64)INT_MAX);
-       ret = (int)obj_request->xferred;
-       ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
-out:
-       if (obj_request)
-               rbd_obj_request_put(obj_request);
-       else
-               ceph_release_page_vector(pages, page_count);
+       ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
+                            CEPH_OSD_FLAG_READ, req_page, outbound_size,
+                            reply_page, &inbound_size);
+       if (!ret) {
+               memcpy(inbound, page_address(reply_page), inbound_size);
+               ret = inbound_size;
+       }
 
+       if (req_page)
+               __free_page(req_page);
+       __free_page(reply_page);
        return ret;
 }
 
@@ -4256,63 +4122,46 @@ static void rbd_free_disk(struct rbd_device *rbd_dev)
 }
 
 static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
-                               const char *object_name,
-                               u64 offset, u64 length, void *buf)
+                            struct ceph_object_id *oid,
+                            struct ceph_object_locator *oloc,
+                            void *buf, int buf_len)
 
 {
-       struct rbd_obj_request *obj_request;
-       struct page **pages = NULL;
-       u32 page_count;
-       size_t size;
+       struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+       struct ceph_osd_request *req;
+       struct page **pages;
+       int num_pages = calc_pages_for(0, buf_len);
        int ret;
 
-       page_count = (u32) calc_pages_for(offset, length);
-       pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
-       if (IS_ERR(pages))
-               return PTR_ERR(pages);
-
-       ret = -ENOMEM;
-       obj_request = rbd_obj_request_create(object_name, offset, length,
-                                                       OBJ_REQUEST_PAGES);
-       if (!obj_request)
-               goto out;
-
-       obj_request->pages = pages;
-       obj_request->page_count = page_count;
-
-       obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
-                                                 obj_request);
-       if (!obj_request->osd_req)
-               goto out;
+       req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
+       if (!req)
+               return -ENOMEM;
 
-       osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
-                                       offset, length, 0, 0);
-       osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
-                                       obj_request->pages,
-                                       obj_request->length,
-                                       obj_request->offset & ~PAGE_MASK,
-                                       false, false);
+       ceph_oid_copy(&req->r_base_oid, oid);
+       ceph_oloc_copy(&req->r_base_oloc, oloc);
+       req->r_flags = CEPH_OSD_FLAG_READ;
 
-       rbd_obj_request_submit(obj_request);
-       ret = rbd_obj_request_wait(obj_request);
+       ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
        if (ret)
-               goto out;
+               goto out_req;
 
-       ret = obj_request->result;
-       if (ret < 0)
-               goto out;
+       pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
+       if (IS_ERR(pages)) {
+               ret = PTR_ERR(pages);
+               goto out_req;
+       }
 
-       rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
-       size = (size_t) obj_request->xferred;
-       ceph_copy_from_page_vector(pages, buf, 0, size);
-       rbd_assert(size <= (size_t)INT_MAX);
-       ret = (int)size;
-out:
-       if (obj_request)
-               rbd_obj_request_put(obj_request);
-       else
-               ceph_release_page_vector(pages, page_count);
+       osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
+       osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
+                                        true);
+
+       ceph_osdc_start_request(osdc, req, false);
+       ret = ceph_osdc_wait_request(osdc, req);
+       if (ret >= 0)
+               ceph_copy_from_page_vector(pages, buf, 0, ret);
 
+out_req:
+       ceph_osdc_put_request(req);
        return ret;
 }
 
@@ -4348,8 +4197,8 @@ static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
                if (!ondisk)
                        return -ENOMEM;
 
-               ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_oid.name,
-                                      0, size, ondisk);
+               ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
+                                       &rbd_dev->header_oloc, ondisk, size);
                if (ret < 0)
                        goto out;
                if ((size_t)ret < size) {
@@ -4781,7 +4630,7 @@ static const struct attribute_group *rbd_attr_groups[] = {
 
 static void rbd_dev_release(struct device *dev);
 
-static struct device_type rbd_device_type = {
+static const struct device_type rbd_device_type = {
        .name           = "rbd",
        .groups         = rbd_attr_groups,
        .release        = rbd_dev_release,
@@ -4876,8 +4725,9 @@ static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
        INIT_LIST_HEAD(&rbd_dev->node);
        init_rwsem(&rbd_dev->header_rwsem);
 
+       rbd_dev->header.data_pool_id = CEPH_NOPOOL;
        ceph_oid_init(&rbd_dev->header_oid);
-       ceph_oloc_init(&rbd_dev->header_oloc);
+       rbd_dev->header_oloc.pool = spec->pool_id;
 
        mutex_init(&rbd_dev->watch_mutex);
        rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
@@ -4899,12 +4749,6 @@ static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
        rbd_dev->rbd_client = rbdc;
        rbd_dev->spec = spec;
 
-       rbd_dev->layout.stripe_unit = 1 << RBD_MAX_OBJ_ORDER;
-       rbd_dev->layout.stripe_count = 1;
-       rbd_dev->layout.object_size = 1 << RBD_MAX_OBJ_ORDER;
-       rbd_dev->layout.pool_id = spec->pool_id;
-       RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
-
        return rbd_dev;
 }
 
@@ -4970,10 +4814,10 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
                __le64 size;
        } __attribute__ ((packed)) size_buf = { 0 };
 
-       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
-                               "rbd", "get_size",
-                               &snapid, sizeof (snapid),
-                               &size_buf, sizeof (size_buf));
+       ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+                                 &rbd_dev->header_oloc, "get_size",
+                                 &snapid, sizeof(snapid),
+                                 &size_buf, sizeof(size_buf));
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
        if (ret < 0)
                return ret;
@@ -5010,9 +4854,9 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
        if (!reply_buf)
                return -ENOMEM;
 
-       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
-                               "rbd", "get_object_prefix", NULL, 0,
-                               reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
+       ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+                                 &rbd_dev->header_oloc, "get_object_prefix",
+                                 NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
        if (ret < 0)
                goto out;
@@ -5045,10 +4889,10 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
        u64 unsup;
        int ret;
 
-       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
-                               "rbd", "get_features",
-                               &snapid, sizeof (snapid),
-                               &features_buf, sizeof (features_buf));
+       ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+                                 &rbd_dev->header_oloc, "get_features",
+                                 &snapid, sizeof(snapid),
+                                 &features_buf, sizeof(features_buf));
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
        if (ret < 0)
                return ret;
@@ -5107,10 +4951,9 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
        }
 
        snapid = cpu_to_le64(rbd_dev->spec->snap_id);
-       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
-                               "rbd", "get_parent",
-                               &snapid, sizeof (snapid),
-                               reply_buf, size);
+       ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+                                 &rbd_dev->header_oloc, "get_parent",
+                                 &snapid, sizeof(snapid), reply_buf, size);
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
        if (ret < 0)
                goto out_err;
@@ -5210,9 +5053,9 @@ static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
        u64 stripe_count;
        int ret;
 
-       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
-                               "rbd", "get_stripe_unit_count", NULL, 0,
-                               (char *)&striping_info_buf, size);
+       ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+                               &rbd_dev->header_oloc, "get_stripe_unit_count",
+                               NULL, 0, &striping_info_buf, size);
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
        if (ret < 0)
                return ret;
@@ -5226,7 +5069,7 @@ static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
         * out, and only fail if the image has non-default values.
         */
        ret = -EINVAL;
-       obj_size = (u64)1 << rbd_dev->header.obj_order;
+       obj_size = rbd_obj_bytes(&rbd_dev->header);
        p = &striping_info_buf;
        stripe_unit = ceph_decode_64(&p);
        if (stripe_unit != obj_size) {
@@ -5247,8 +5090,27 @@ static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
        return 0;
 }
 
+static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
+{
+       __le64 data_pool_id;
+       int ret;
+
+       ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+                                 &rbd_dev->header_oloc, "get_data_pool",
+                                 NULL, 0, &data_pool_id, sizeof(data_pool_id));
+       if (ret < 0)
+               return ret;
+       if (ret < sizeof(data_pool_id))
+               return -EBADMSG;
+
+       rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
+       WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
+       return 0;
+}
+
 static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
 {
+       CEPH_DEFINE_OID_ONSTACK(oid);
        size_t image_id_size;
        char *image_id;
        void *p;
@@ -5276,10 +5138,10 @@ static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
        if (!reply_buf)
                goto out;
 
-       ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
-                               "rbd", "dir_get_name",
-                               image_id, image_id_size,
-                               reply_buf, size);
+       ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
+       ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
+                                 "dir_get_name", image_id, image_id_size,
+                                 reply_buf, size);
        if (ret < 0)
                goto out;
        p = reply_buf;
@@ -5458,9 +5320,9 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
        if (!reply_buf)
                return -ENOMEM;
 
-       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
-                               "rbd", "get_snapcontext", NULL, 0,
-                               reply_buf, size);
+       ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+                                 &rbd_dev->header_oloc, "get_snapcontext",
+                                 NULL, 0, reply_buf, size);
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
        if (ret < 0)
                goto out;
@@ -5523,10 +5385,9 @@ static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
                return ERR_PTR(-ENOMEM);
 
        snapid = cpu_to_le64(snap_id);
-       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
-                               "rbd", "get_snapshot_name",
-                               &snapid, sizeof (snapid),
-                               reply_buf, size);
+       ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+                                 &rbd_dev->header_oloc, "get_snapshot_name",
+                                 &snapid, sizeof(snapid), reply_buf, size);
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
        if (ret < 0) {
                snap_name = ERR_PTR(ret);
@@ -5833,7 +5694,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
 {
        int ret;
        size_t size;
-       char *object_name;
+       CEPH_DEFINE_OID_ONSTACK(oid);
        void *response;
        char *image_id;
 
@@ -5853,12 +5714,12 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
         * First, see if the format 2 image id file exists, and if
         * so, get the image's persistent id from it.
         */
-       size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
-       object_name = kmalloc(size, GFP_NOIO);
-       if (!object_name)
-               return -ENOMEM;
-       sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
-       dout("rbd id object name is %s\n", object_name);
+       ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
+                              rbd_dev->spec->image_name);
+       if (ret)
+               return ret;
+
+       dout("rbd id object name is %s\n", oid.name);
 
        /* Response will be an encoded string, which includes a length */
 
@@ -5871,9 +5732,9 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
 
        /* If it doesn't exist we'll assume it's a format 1 image */
 
-       ret = rbd_obj_method_sync(rbd_dev, object_name,
-                               "rbd", "get_id", NULL, 0,
-                               response, RBD_IMAGE_ID_LEN_MAX);
+       ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
+                                 "get_id", NULL, 0,
+                                 response, RBD_IMAGE_ID_LEN_MAX);
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
        if (ret == -ENOENT) {
                image_id = kstrdup("", GFP_KERNEL);
@@ -5896,8 +5757,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
        }
 out:
        kfree(response);
-       kfree(object_name);
-
+       ceph_oid_destroy(&oid);
        return ret;
 }
 
@@ -5944,14 +5804,20 @@ static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
                if (ret < 0)
                        goto out_err;
        }
-       /* No support for crypto and compression type format 2 images */
 
+       if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
+               ret = rbd_dev_v2_data_pool(rbd_dev);
+               if (ret)
+                       goto out_err;
+       }
+
+       rbd_init_layout(rbd_dev);
        return 0;
+
 out_err:
        rbd_dev->header.features = 0;
        kfree(rbd_dev->header.object_prefix);
        rbd_dev->header.object_prefix = NULL;
-
        return ret;
 }
 
@@ -6077,8 +5943,6 @@ static int rbd_dev_header_name(struct rbd_device *rbd_dev)
        /* Record the header object name for this rbd image. */
 
        rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
-
-       rbd_dev->header_oloc.pool = rbd_dev->layout.pool_id;
        if (rbd_dev->image_format == 1)
                ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
                                       spec->image_name, RBD_SUFFIX);
@@ -6471,27 +6335,16 @@ static int rbd_slab_init(void)
        if (!rbd_obj_request_cache)
                goto out_err;
 
-       rbd_assert(!rbd_segment_name_cache);
-       rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
-                                       CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL);
-       if (rbd_segment_name_cache)
-               return 0;
-out_err:
-       kmem_cache_destroy(rbd_obj_request_cache);
-       rbd_obj_request_cache = NULL;
+       return 0;
 
+out_err:
        kmem_cache_destroy(rbd_img_request_cache);
        rbd_img_request_cache = NULL;
-
        return -ENOMEM;
 }
 
 static void rbd_slab_exit(void)
 {
-       rbd_assert(rbd_segment_name_cache);
-       kmem_cache_destroy(rbd_segment_name_cache);
-       rbd_segment_name_cache = NULL;
-
        rbd_assert(rbd_obj_request_cache);
        kmem_cache_destroy(rbd_obj_request_cache);
        rbd_obj_request_cache = NULL;
index 94f367db27b0b816e9585da18f0063a1523b23ca..62ff50d3e7a6f1f4da2d10249512f9c2cff1859f 100644 (file)
@@ -25,8 +25,8 @@
  */
 
 #define RBD_HEADER_PREFIX      "rbd_header."
-#define RBD_DATA_PREFIX        "rbd_data."
 #define RBD_ID_PREFIX          "rbd_id."
+#define RBD_V2_DATA_FORMAT     "%s.%016llx"
 
 #define RBD_LOCK_NAME          "rbd_lock"
 #define RBD_LOCK_TAG           "internal"
@@ -42,13 +42,14 @@ enum rbd_notify_op {
 /*
  * For format version 1, rbd image 'foo' consists of objects
  *   foo.rbd           - image metadata
- *   rb.<idhi>.<idlo>.00000000
- *   rb.<idhi>.<idlo>.00000001
+ *   rb.<idhi>.<idlo>.<extra>.000000000000
+ *   rb.<idhi>.<idlo>.<extra>.000000000001
  *   ...               - data
  * There is no notion of a persistent image id in rbd format 1.
  */
 
 #define RBD_SUFFIX             ".rbd"
+#define RBD_V1_DATA_FORMAT     "%s.%012llx"
 
 #define RBD_DIRECTORY           "rbd_directory"
 #define RBD_INFO                "rbd_info"
@@ -57,9 +58,6 @@ enum rbd_notify_op {
 #define RBD_MIN_OBJ_ORDER       16
 #define RBD_MAX_OBJ_ORDER       30
 
-#define RBD_COMP_NONE          0
-#define RBD_CRYPT_NONE         0
-
 #define RBD_HEADER_TEXT                "<<< Rados Block Device Image >>>\n"
 #define RBD_HEADER_SIGNATURE   "RBD"
 #define RBD_HEADER_VERSION     "001.005"
index 46427ea01753b4c84f9670939347f06cc65bbfbe..157f2d1fb7e1908f527c2127e707199f9364593d 100644 (file)
@@ -300,7 +300,7 @@ static const struct ide_port_ops palm_bk3710_ports_ops = {
        .cable_detect           = palm_bk3710_cable_detect,
 };
 
-static struct ide_port_info palm_bk3710_port_info = {
+static struct ide_port_info palm_bk3710_port_info __initdata = {
        .init_dma               = palm_bk3710_init_dma,
        .port_ops               = &palm_bk3710_ports_ops,
        .dma_ops                = &sff_dma_ops,
index 82bd00af5cc3841ba6cd44ef51024b5792ba09b5..268aae45b5149de12ff77e59df776790724dede8 100644 (file)
@@ -75,18 +75,18 @@ static char module_name[] = "lart";
 
 /* blob */
 #define NUM_BLOB_BLOCKS                FLASH_NUMBLOCKS_16m_PARAM
-#define BLOB_START                     0x00000000
-#define BLOB_LEN                       (NUM_BLOB_BLOCKS * FLASH_BLOCKSIZE_PARAM)
+#define PART_BLOB_START                0x00000000
+#define PART_BLOB_LEN          (NUM_BLOB_BLOCKS * FLASH_BLOCKSIZE_PARAM)
 
 /* kernel */
 #define NUM_KERNEL_BLOCKS      7
-#define KERNEL_START           (BLOB_START + BLOB_LEN)
-#define KERNEL_LEN                     (NUM_KERNEL_BLOCKS * FLASH_BLOCKSIZE_MAIN)
+#define PART_KERNEL_START      (PART_BLOB_START + PART_BLOB_LEN)
+#define PART_KERNEL_LEN                (NUM_KERNEL_BLOCKS * FLASH_BLOCKSIZE_MAIN)
 
 /* initial ramdisk */
 #define NUM_INITRD_BLOCKS      24
-#define INITRD_START           (KERNEL_START + KERNEL_LEN)
-#define INITRD_LEN                     (NUM_INITRD_BLOCKS * FLASH_BLOCKSIZE_MAIN)
+#define PART_INITRD_START      (PART_KERNEL_START + PART_KERNEL_LEN)
+#define PART_INITRD_LEN                (NUM_INITRD_BLOCKS * FLASH_BLOCKSIZE_MAIN)
 
 /*
  * See section 4.0 in "3 Volt Fast Boot Block Flash Memory" Intel Datasheet
@@ -587,20 +587,20 @@ static struct mtd_partition lart_partitions[] = {
        /* blob */
        {
                .name   = "blob",
-               .offset = BLOB_START,
-               .size   = BLOB_LEN,
+               .offset = PART_BLOB_START,
+               .size   = PART_BLOB_LEN,
        },
        /* kernel */
        {
                .name   = "kernel",
-               .offset = KERNEL_START,         /* MTDPART_OFS_APPEND */
-               .size   = KERNEL_LEN,
+               .offset = PART_KERNEL_START,    /* MTDPART_OFS_APPEND */
+               .size   = PART_KERNEL_LEN,
        },
        /* initial ramdisk / file system */
        {
                .name   = "file system",
-               .offset = INITRD_START,         /* MTDPART_OFS_APPEND */
-               .size   = INITRD_LEN,           /* MTDPART_SIZ_FULL */
+               .offset = PART_INITRD_START,    /* MTDPART_OFS_APPEND */
+               .size   = PART_INITRD_LEN,      /* MTDPART_SIZ_FULL */
        }
 };
 #define NUM_PARTITIONS ARRAY_SIZE(lart_partitions)
index d0d0d12b531fc683613455fc3ab6cb9fa3b9fea9..e536301acfdec9fd893d25061f4e4b64ca55c388 100644 (file)
@@ -293,36 +293,29 @@ static int xgene_enet_tx_completion(struct xgene_enet_desc_ring *cp_ring,
 static int xgene_enet_setup_mss(struct net_device *ndev, u32 mss)
 {
        struct xgene_enet_pdata *pdata = netdev_priv(ndev);
-       bool mss_index_found = false;
-       int mss_index;
+       int mss_index = -EBUSY;
        int i;
 
        spin_lock(&pdata->mss_lock);
 
        /* Reuse the slot if MSS matches */
-       for (i = 0; !mss_index_found && i < NUM_MSS_REG; i++) {
+       for (i = 0; mss_index < 0 && i < NUM_MSS_REG; i++) {
                if (pdata->mss[i] == mss) {
                        pdata->mss_refcnt[i]++;
                        mss_index = i;
-                       mss_index_found = true;
                }
        }
 
        /* Overwrite the slot with ref_count = 0 */
-       for (i = 0; !mss_index_found && i < NUM_MSS_REG; i++) {
+       for (i = 0; mss_index < 0 && i < NUM_MSS_REG; i++) {
                if (!pdata->mss_refcnt[i]) {
                        pdata->mss_refcnt[i]++;
                        pdata->mac_ops->set_mss(pdata, mss, i);
                        pdata->mss[i] = mss;
                        mss_index = i;
-                       mss_index_found = true;
                }
        }
 
-       /* No slots with ref_count = 0 available, return busy */
-       if (!mss_index_found)
-               mss_index = -EBUSY;
-
        spin_unlock(&pdata->mss_lock);
 
        return mss_index;
index e7b81a305469e64b97f68bc0e2bcb064b78f08fe..024788549c2569a13d3a07ebbde718cccf980a26 100644 (file)
@@ -89,10 +89,17 @@ void mlx4_en_remove_timestamp(struct mlx4_en_dev *mdev)
        }
 }
 
+#define MLX4_EN_WRAP_AROUND_SEC        10UL
+/* By scheduling the overflow check every 5 seconds, we have a reasonably
+ * good chance we wont miss a wrap around.
+ * TOTO: Use a timer instead of a work queue to increase the guarantee.
+ */
+#define MLX4_EN_OVERFLOW_PERIOD (MLX4_EN_WRAP_AROUND_SEC * HZ / 2)
+
 void mlx4_en_ptp_overflow_check(struct mlx4_en_dev *mdev)
 {
        bool timeout = time_is_before_jiffies(mdev->last_overflow_check +
-                                             mdev->overflow_period);
+                                             MLX4_EN_OVERFLOW_PERIOD);
        unsigned long flags;
 
        if (timeout) {
@@ -237,7 +244,6 @@ static const struct ptp_clock_info mlx4_en_ptp_clock_info = {
        .enable         = mlx4_en_phc_enable,
 };
 
-#define MLX4_EN_WRAP_AROUND_SEC        10ULL
 
 /* This function calculates the max shift that enables the user range
  * of MLX4_EN_WRAP_AROUND_SEC values in the cycles register.
@@ -258,7 +264,6 @@ void mlx4_en_init_timestamp(struct mlx4_en_dev *mdev)
 {
        struct mlx4_dev *dev = mdev->dev;
        unsigned long flags;
-       u64 ns, zero = 0;
 
        /* mlx4_en_init_timestamp is called for each netdev.
         * mdev->ptp_clock is common for all ports, skip initialization if
@@ -282,13 +287,6 @@ void mlx4_en_init_timestamp(struct mlx4_en_dev *mdev)
                         ktime_to_ns(ktime_get_real()));
        write_sequnlock_irqrestore(&mdev->clock_lock, flags);
 
-       /* Calculate period in seconds to call the overflow watchdog - to make
-        * sure counter is checked at least once every wrap around.
-        */
-       ns = cyclecounter_cyc2ns(&mdev->cycles, mdev->cycles.mask, zero, &zero);
-       do_div(ns, NSEC_PER_SEC / 2 / HZ);
-       mdev->overflow_period = ns;
-
        /* Configure the PHC */
        mdev->ptp_clock_info = mlx4_en_ptp_clock_info;
        snprintf(mdev->ptp_clock_info.name, 16, "mlx4 ptp");
index 4941b692e9479bc7800e92f9bfb13baf3ff7d0d4..3629ce11a68b9dec5c1659539bdc6f2c11114e35 100644 (file)
@@ -430,7 +430,6 @@ struct mlx4_en_dev {
        seqlock_t               clock_lock;
        struct timecounter      clock;
        unsigned long           last_overflow_check;
-       unsigned long           overflow_period;
        struct ptp_clock        *ptp_clock;
        struct ptp_clock_info   ptp_clock_info;
        struct notifier_block   nb;
index c5c1d0e0c16fbd57e7f848c5d738fff1df054dcb..118723ea681a45ff5f117c2b9cef017d815a6c0f 100644 (file)
@@ -5397,7 +5397,7 @@ static void s2io_ethtool_gdrvinfo(struct net_device *dev,
  *  s2io_nic structure.
  *  @regs : pointer to the structure with parameters given by ethtool for
  *  dumping the registers.
- *  @reg_space: The input argumnet into which all the registers are dumped.
+ *  @reg_space: The input argument into which all the registers are dumped.
  *  Description:
  *  Dumps the entire register space of xFrame NIC into the user given
  *  buffer area.
index db55e6d89cf45b7305c44fab69ac4d21cd90181f..0452848d1316b0b9e7c8b864eb9e200a6e5e2532 100644 (file)
@@ -119,7 +119,7 @@ static void vxge_ethtool_gdrvinfo(struct net_device *dev,
  * @dev: device pointer.
  * @regs: pointer to the structure with parameters given by ethtool for
  * dumping the registers.
- * @reg_space: The input argumnet into which all the registers are dumped.
+ * @reg_space: The input argument into which all the registers are dumped.
  *
  * Dumps the vpath register space of Titan NIC into the user given
  * buffer area.
index 61a9cd5be49734ce02f7a437763d267949707175..00c17fa6545bd5752a427e3660b062dc26ba57db 100644 (file)
@@ -688,7 +688,9 @@ static inline u8 qed_concrete_to_sw_fid(struct qed_dev *cdev,
 #define OOO_LB_TC 9
 
 int qed_configure_vport_wfq(struct qed_dev *cdev, u16 vp_id, u32 rate);
-void qed_configure_vp_wfq_on_link_change(struct qed_dev *cdev, u32 min_pf_rate);
+void qed_configure_vp_wfq_on_link_change(struct qed_dev *cdev,
+                                        struct qed_ptt *p_ptt,
+                                        u32 min_pf_rate);
 
 void qed_clean_wfq_db(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt);
 #define QED_LEADING_HWFN(dev)   (&dev->hwfns[0])
index d6c5a8165b5f42a9ec82a7a9f12f4aacf3f56bbd..e2a081ceaf520c429b90e1fcc1e2b6cb7d3b10aa 100644 (file)
@@ -3198,7 +3198,8 @@ int qed_configure_vport_wfq(struct qed_dev *cdev, u16 vp_id, u32 rate)
 }
 
 /* API to configure WFQ from mcp link change */
-void qed_configure_vp_wfq_on_link_change(struct qed_dev *cdev, u32 min_pf_rate)
+void qed_configure_vp_wfq_on_link_change(struct qed_dev *cdev,
+                                        struct qed_ptt *p_ptt, u32 min_pf_rate)
 {
        int i;
 
@@ -3212,8 +3213,7 @@ void qed_configure_vp_wfq_on_link_change(struct qed_dev *cdev, u32 min_pf_rate)
        for_each_hwfn(cdev, i) {
                struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
 
-               __qed_configure_vp_wfq_on_link_change(p_hwfn,
-                                                     p_hwfn->p_dpc_ptt,
+               __qed_configure_vp_wfq_on_link_change(p_hwfn, p_ptt,
                                                      min_pf_rate);
        }
 }
index 314022df34694758d524554caccb3e2d2f9ec3ba..87fde205149fdbf3181befd79ca62508b2daa388 100644 (file)
@@ -679,7 +679,8 @@ static void qed_mcp_handle_link_change(struct qed_hwfn *p_hwfn,
 
        /* Min bandwidth configuration */
        __qed_configure_pf_min_bandwidth(p_hwfn, p_ptt, p_link, min_bw);
-       qed_configure_vp_wfq_on_link_change(p_hwfn->cdev, p_link->min_pf_rate);
+       qed_configure_vp_wfq_on_link_change(p_hwfn->cdev, p_ptt,
+                                           p_link->min_pf_rate);
 
        p_link->an = !!(status & LINK_STATUS_AUTO_NEGOTIATE_ENABLED);
        p_link->an_complete = !!(status &
index 29ed785f1dc22699962ea15904118c3fee314207..253c2bbe1e4e1a705e52054b4d3faa199fd2ca93 100644 (file)
@@ -3014,8 +3014,7 @@ cleanup:
                ack_vfs[vfid / 32] |= BIT((vfid % 32));
                p_hwfn->pf_iov_info->pending_flr[rel_vf_id / 64] &=
                    ~(1ULL << (rel_vf_id % 64));
-               p_hwfn->pf_iov_info->pending_events[rel_vf_id / 64] &=
-                   ~(1ULL << (rel_vf_id % 64));
+               p_vf->vf_mbx.b_pending_msg = false;
        }
 
        return rc;
@@ -3128,11 +3127,20 @@ static void qed_iov_process_mbx_req(struct qed_hwfn *p_hwfn,
        mbx = &p_vf->vf_mbx;
 
        /* qed_iov_process_mbx_request */
-       DP_VERBOSE(p_hwfn, QED_MSG_IOV,
-                  "VF[%02x]: Processing mailbox message\n", p_vf->abs_vf_id);
+       if (!mbx->b_pending_msg) {
+               DP_NOTICE(p_hwfn,
+                         "VF[%02x]: Trying to process mailbox message when none is pending\n",
+                         p_vf->abs_vf_id);
+               return;
+       }
+       mbx->b_pending_msg = false;
 
        mbx->first_tlv = mbx->req_virt->first_tlv;
 
+       DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+                  "VF[%02x]: Processing mailbox message [type %04x]\n",
+                  p_vf->abs_vf_id, mbx->first_tlv.tl.type);
+
        /* check if tlv type is known */
        if (qed_iov_tlv_supported(mbx->first_tlv.tl.type) &&
            !p_vf->b_malicious) {
@@ -3219,20 +3227,19 @@ static void qed_iov_process_mbx_req(struct qed_hwfn *p_hwfn,
        }
 }
 
-static void qed_iov_pf_add_pending_events(struct qed_hwfn *p_hwfn, u8 vfid)
+void qed_iov_pf_get_pending_events(struct qed_hwfn *p_hwfn, u64 *events)
 {
-       u64 add_bit = 1ULL << (vfid % 64);
+       int i;
 
-       p_hwfn->pf_iov_info->pending_events[vfid / 64] |= add_bit;
-}
+       memset(events, 0, sizeof(u64) * QED_VF_ARRAY_LENGTH);
 
-static void qed_iov_pf_get_and_clear_pending_events(struct qed_hwfn *p_hwfn,
-                                                   u64 *events)
-{
-       u64 *p_pending_events = p_hwfn->pf_iov_info->pending_events;
+       qed_for_each_vf(p_hwfn, i) {
+               struct qed_vf_info *p_vf;
 
-       memcpy(events, p_pending_events, sizeof(u64) * QED_VF_ARRAY_LENGTH);
-       memset(p_pending_events, 0, sizeof(u64) * QED_VF_ARRAY_LENGTH);
+               p_vf = &p_hwfn->pf_iov_info->vfs_array[i];
+               if (p_vf->vf_mbx.b_pending_msg)
+                       events[i / 64] |= 1ULL << (i % 64);
+       }
 }
 
 static struct qed_vf_info *qed_sriov_get_vf_from_absid(struct qed_hwfn *p_hwfn,
@@ -3266,7 +3273,7 @@ static int qed_sriov_vfpf_msg(struct qed_hwfn *p_hwfn,
        p_vf->vf_mbx.pending_req = (((u64)vf_msg->hi) << 32) | vf_msg->lo;
 
        /* Mark the event and schedule the workqueue */
-       qed_iov_pf_add_pending_events(p_hwfn, p_vf->relative_vf_id);
+       p_vf->vf_mbx.b_pending_msg = true;
        qed_schedule_iov(p_hwfn, QED_IOV_WQ_MSG_FLAG);
 
        return 0;
@@ -4030,7 +4037,7 @@ static void qed_handle_vf_msg(struct qed_hwfn *hwfn)
                return;
        }
 
-       qed_iov_pf_get_and_clear_pending_events(hwfn, events);
+       qed_iov_pf_get_pending_events(hwfn, events);
 
        DP_VERBOSE(hwfn, QED_MSG_IOV,
                   "Event mask of VF events: 0x%llx 0x%llx 0x%llx\n",
index fc08cc2da6a7886d9e27a1f8e1743e4dcf73a16f..a89605821522d528411f711bbb0755c0ae003e5a 100644 (file)
@@ -140,6 +140,9 @@ struct qed_iov_vf_mbx {
        /* Address in VF where a pending message is located */
        dma_addr_t pending_req;
 
+       /* Message from VF awaits handling */
+       bool b_pending_msg;
+
        u8 *offset;
 
        /* saved VF request header */
@@ -232,7 +235,6 @@ struct qed_vf_info {
  */
 struct qed_pf_iov {
        struct qed_vf_info vfs_array[MAX_NUM_VFS];
-       u64 pending_events[QED_VF_ARRAY_LENGTH];
        u64 pending_flr[QED_VF_ARRAY_LENGTH];
 
        /* Allocate message address continuosuly and split to each VF */
index 144fe84e8a531e63ae14c3f3a9c5348a7407285f..04d9245b7149ce663b0827a158901a10b58ea701 100644 (file)
@@ -416,7 +416,7 @@ struct stmmac_dma_ops {
        /* Configure the AXI Bus Mode Register */
        void (*axi)(void __iomem *ioaddr, struct stmmac_axi *axi);
        /* Dump DMA registers */
-       void (*dump_regs) (void __iomem *ioaddr);
+       void (*dump_regs)(void __iomem *ioaddr, u32 *reg_space);
        /* Set tx/rx threshold in the csr6 register
         * An invalid value enables the store-and-forward mode */
        void (*dma_mode)(void __iomem *ioaddr, int txmode, int rxmode,
@@ -456,7 +456,7 @@ struct stmmac_ops {
        /* Enable RX Queues */
        void (*rx_queue_enable)(struct mac_device_info *hw, u32 queue);
        /* Dump MAC registers */
-       void (*dump_regs)(struct mac_device_info *hw);
+       void (*dump_regs)(struct mac_device_info *hw, u32 *reg_space);
        /* Handle extra events on specific interrupts hw dependent */
        int (*host_irq_status)(struct mac_device_info *hw,
                               struct stmmac_extra_stats *x);
index 91c8926b7479ab180fa70f66c1bf71309258d64e..19b9b308709953cc9327961d3eb3bea527848bb7 100644 (file)
@@ -92,17 +92,13 @@ static int dwmac1000_rx_ipc_enable(struct mac_device_info *hw)
        return !!(value & GMAC_CONTROL_IPC);
 }
 
-static void dwmac1000_dump_regs(struct mac_device_info *hw)
+static void dwmac1000_dump_regs(struct mac_device_info *hw, u32 *reg_space)
 {
        void __iomem *ioaddr = hw->pcsr;
        int i;
-       pr_info("\tDWMAC1000 regs (base addr = 0x%p)\n", ioaddr);
 
-       for (i = 0; i < 55; i++) {
-               int offset = i * 4;
-               pr_info("\tReg No. %d (offset 0x%x): 0x%08x\n", i,
-                       offset, readl(ioaddr + offset));
-       }
+       for (i = 0; i < 55; i++)
+               reg_space[i] = readl(ioaddr + i * 4);
 }
 
 static void dwmac1000_set_umac_addr(struct mac_device_info *hw,
index fbaec0ffd9ef6638ef5406631561f3ee6964f018..d3654a4470461e1f44282fac163dbcd9b6827df6 100644 (file)
@@ -201,18 +201,14 @@ static void dwmac1000_dma_operation_mode(void __iomem *ioaddr, int txmode,
        writel(csr6, ioaddr + DMA_CONTROL);
 }
 
-static void dwmac1000_dump_dma_regs(void __iomem *ioaddr)
+static void dwmac1000_dump_dma_regs(void __iomem *ioaddr, u32 *reg_space)
 {
        int i;
-       pr_info(" DMA registers\n");
-       for (i = 0; i < 22; i++) {
-               if ((i < 9) || (i > 17)) {
-                       int offset = i * 4;
-                       pr_err("\t Reg No. %d (offset 0x%x): 0x%08x\n", i,
-                              (DMA_BUS_MODE + offset),
-                              readl(ioaddr + DMA_BUS_MODE + offset));
-               }
-       }
+
+       for (i = 0; i < 22; i++)
+               if ((i < 9) || (i > 17))
+                       reg_space[DMA_BUS_MODE / 4 + i] =
+                               readl(ioaddr + DMA_BUS_MODE + i * 4);
 }
 
 static void dwmac1000_get_hw_feature(void __iomem *ioaddr,
index 8ab518997b1b509a5cf72343ac42c9ef1993e02c..e370ccec6176671d1717d24d88917b88f69b1bd2 100644 (file)
@@ -40,28 +40,18 @@ static void dwmac100_core_init(struct mac_device_info *hw, int mtu)
 #endif
 }
 
-static void dwmac100_dump_mac_regs(struct mac_device_info *hw)
+static void dwmac100_dump_mac_regs(struct mac_device_info *hw, u32 *reg_space)
 {
        void __iomem *ioaddr = hw->pcsr;
-       pr_info("\t----------------------------------------------\n"
-               "\t  DWMAC 100 CSR (base addr = 0x%p)\n"
-               "\t----------------------------------------------\n", ioaddr);
-       pr_info("\tcontrol reg (offset 0x%x): 0x%08x\n", MAC_CONTROL,
-               readl(ioaddr + MAC_CONTROL));
-       pr_info("\taddr HI (offset 0x%x): 0x%08x\n ", MAC_ADDR_HIGH,
-               readl(ioaddr + MAC_ADDR_HIGH));
-       pr_info("\taddr LO (offset 0x%x): 0x%08x\n", MAC_ADDR_LOW,
-               readl(ioaddr + MAC_ADDR_LOW));
-       pr_info("\tmulticast hash HI (offset 0x%x): 0x%08x\n",
-               MAC_HASH_HIGH, readl(ioaddr + MAC_HASH_HIGH));
-       pr_info("\tmulticast hash LO (offset 0x%x): 0x%08x\n",
-               MAC_HASH_LOW, readl(ioaddr + MAC_HASH_LOW));
-       pr_info("\tflow control (offset 0x%x): 0x%08x\n",
-               MAC_FLOW_CTRL, readl(ioaddr + MAC_FLOW_CTRL));
-       pr_info("\tVLAN1 tag (offset 0x%x): 0x%08x\n", MAC_VLAN1,
-               readl(ioaddr + MAC_VLAN1));
-       pr_info("\tVLAN2 tag (offset 0x%x): 0x%08x\n", MAC_VLAN2,
-               readl(ioaddr + MAC_VLAN2));
+
+       reg_space[MAC_CONTROL / 4] = readl(ioaddr + MAC_CONTROL);
+       reg_space[MAC_ADDR_HIGH / 4] = readl(ioaddr + MAC_ADDR_HIGH);
+       reg_space[MAC_ADDR_LOW / 4] = readl(ioaddr + MAC_ADDR_LOW);
+       reg_space[MAC_HASH_HIGH / 4] = readl(ioaddr + MAC_HASH_HIGH);
+       reg_space[MAC_HASH_LOW / 4] = readl(ioaddr + MAC_HASH_LOW);
+       reg_space[MAC_FLOW_CTRL / 4] = readl(ioaddr + MAC_FLOW_CTRL);
+       reg_space[MAC_VLAN1 / 4] = readl(ioaddr + MAC_VLAN1);
+       reg_space[MAC_VLAN2 / 4] = readl(ioaddr + MAC_VLAN2);
 }
 
 static int dwmac100_rx_ipc_enable(struct mac_device_info *hw)
index d40e91e8fc7bde6352d9a5a141707cff69d8a428..eef2f222ce9a87f91a1d02fa6a6c7354486d4231 100644 (file)
@@ -66,19 +66,18 @@ static void dwmac100_dma_operation_mode(void __iomem *ioaddr, int txmode,
        writel(csr6, ioaddr + DMA_CONTROL);
 }
 
-static void dwmac100_dump_dma_regs(void __iomem *ioaddr)
+static void dwmac100_dump_dma_regs(void __iomem *ioaddr, u32 *reg_space)
 {
        int i;
 
-       pr_debug("DWMAC 100 DMA CSR\n");
        for (i = 0; i < 9; i++)
-               pr_debug("\t CSR%d (offset 0x%x): 0x%08x\n", i,
-                        (DMA_BUS_MODE + i * 4),
-                        readl(ioaddr + DMA_BUS_MODE + i * 4));
+               reg_space[DMA_BUS_MODE / 4 + i] =
+                       readl(ioaddr + DMA_BUS_MODE + i * 4);
 
-       pr_debug("\tCSR20 (0x%x): 0x%08x, CSR21 (0x%x): 0x%08x\n",
-                DMA_CUR_TX_BUF_ADDR, readl(ioaddr + DMA_CUR_TX_BUF_ADDR),
-                DMA_CUR_RX_BUF_ADDR, readl(ioaddr + DMA_CUR_RX_BUF_ADDR));
+       reg_space[DMA_CUR_TX_BUF_ADDR / 4] =
+               readl(ioaddr + DMA_CUR_TX_BUF_ADDR);
+       reg_space[DMA_CUR_RX_BUF_ADDR / 4] =
+               readl(ioaddr + DMA_CUR_RX_BUF_ADDR);
 }
 
 /* DMA controller has two counters to track the number of the missed frames. */
index 202216cd6789176e1cf9164cfb3faf6b58c0ba89..1e79e6529c4a79a805663e2d65f2cec558f362e3 100644 (file)
@@ -70,19 +70,13 @@ static void dwmac4_rx_queue_enable(struct mac_device_info *hw, u32 queue)
        writel(value, ioaddr + GMAC_RXQ_CTRL0);
 }
 
-static void dwmac4_dump_regs(struct mac_device_info *hw)
+static void dwmac4_dump_regs(struct mac_device_info *hw, u32 *reg_space)
 {
        void __iomem *ioaddr = hw->pcsr;
        int i;
 
-       pr_debug("\tDWMAC4 regs (base addr = 0x%p)\n", ioaddr);
-
-       for (i = 0; i < GMAC_REG_NUM; i++) {
-               int offset = i * 4;
-
-               pr_debug("\tReg No. %d (offset 0x%x): 0x%08x\n", i,
-                        offset, readl(ioaddr + offset));
-       }
+       for (i = 0; i < GMAC_REG_NUM; i++)
+               reg_space[i] = readl(ioaddr + i * 4);
 }
 
 static int dwmac4_rx_ipc_enable(struct mac_device_info *hw)
index 377d1b44d4f2802650819b37d7e27fe78115bf41..f97b0d5d998742efcad71972bd74ce40cc02afad 100644 (file)
@@ -127,53 +127,51 @@ static void dwmac4_dma_init(void __iomem *ioaddr,
                dwmac4_dma_init_channel(ioaddr, dma_cfg, dma_tx, dma_rx, i);
 }
 
-static void _dwmac4_dump_dma_regs(void __iomem *ioaddr, u32 channel)
+static void _dwmac4_dump_dma_regs(void __iomem *ioaddr, u32 channel,
+                                 u32 *reg_space)
 {
-       pr_debug(" Channel %d\n", channel);
-       pr_debug("\tDMA_CHAN_CONTROL, offset: 0x%x, val: 0x%x\n", 0,
-                readl(ioaddr + DMA_CHAN_CONTROL(channel)));
-       pr_debug("\tDMA_CHAN_TX_CONTROL, offset: 0x%x, val: 0x%x\n", 0x4,
-                readl(ioaddr + DMA_CHAN_TX_CONTROL(channel)));
-       pr_debug("\tDMA_CHAN_RX_CONTROL, offset: 0x%x, val: 0x%x\n", 0x8,
-                readl(ioaddr + DMA_CHAN_RX_CONTROL(channel)));
-       pr_debug("\tDMA_CHAN_TX_BASE_ADDR, offset: 0x%x, val: 0x%x\n", 0x14,
-                readl(ioaddr + DMA_CHAN_TX_BASE_ADDR(channel)));
-       pr_debug("\tDMA_CHAN_RX_BASE_ADDR, offset: 0x%x, val: 0x%x\n", 0x1c,
-                readl(ioaddr + DMA_CHAN_RX_BASE_ADDR(channel)));
-       pr_debug("\tDMA_CHAN_TX_END_ADDR, offset: 0x%x, val: 0x%x\n", 0x20,
-                readl(ioaddr + DMA_CHAN_TX_END_ADDR(channel)));
-       pr_debug("\tDMA_CHAN_RX_END_ADDR, offset: 0x%x, val: 0x%x\n", 0x28,
-                readl(ioaddr + DMA_CHAN_RX_END_ADDR(channel)));
-       pr_debug("\tDMA_CHAN_TX_RING_LEN, offset: 0x%x, val: 0x%x\n", 0x2c,
-                readl(ioaddr + DMA_CHAN_TX_RING_LEN(channel)));
-       pr_debug("\tDMA_CHAN_RX_RING_LEN, offset: 0x%x, val: 0x%x\n", 0x30,
-                readl(ioaddr + DMA_CHAN_RX_RING_LEN(channel)));
-       pr_debug("\tDMA_CHAN_INTR_ENA, offset: 0x%x, val: 0x%x\n", 0x34,
-                readl(ioaddr + DMA_CHAN_INTR_ENA(channel)));
-       pr_debug("\tDMA_CHAN_RX_WATCHDOG, offset: 0x%x, val: 0x%x\n", 0x38,
-                readl(ioaddr + DMA_CHAN_RX_WATCHDOG(channel)));
-       pr_debug("\tDMA_CHAN_SLOT_CTRL_STATUS, offset: 0x%x, val: 0x%x\n", 0x3c,
-                readl(ioaddr + DMA_CHAN_SLOT_CTRL_STATUS(channel)));
-       pr_debug("\tDMA_CHAN_CUR_TX_DESC, offset: 0x%x, val: 0x%x\n", 0x44,
-                readl(ioaddr + DMA_CHAN_CUR_TX_DESC(channel)));
-       pr_debug("\tDMA_CHAN_CUR_RX_DESC, offset: 0x%x, val: 0x%x\n", 0x4c,
-                readl(ioaddr + DMA_CHAN_CUR_RX_DESC(channel)));
-       pr_debug("\tDMA_CHAN_CUR_TX_BUF_ADDR, offset: 0x%x, val: 0x%x\n", 0x54,
-                readl(ioaddr + DMA_CHAN_CUR_TX_BUF_ADDR(channel)));
-       pr_debug("\tDMA_CHAN_CUR_RX_BUF_ADDR, offset: 0x%x, val: 0x%x\n", 0x5c,
-                readl(ioaddr + DMA_CHAN_CUR_RX_BUF_ADDR(channel)));
-       pr_debug("\tDMA_CHAN_STATUS, offset: 0x%x, val: 0x%x\n", 0x60,
-                readl(ioaddr + DMA_CHAN_STATUS(channel)));
+       reg_space[DMA_CHAN_CONTROL(channel) / 4] =
+               readl(ioaddr + DMA_CHAN_CONTROL(channel));
+       reg_space[DMA_CHAN_TX_CONTROL(channel) / 4] =
+               readl(ioaddr + DMA_CHAN_TX_CONTROL(channel));
+       reg_space[DMA_CHAN_RX_CONTROL(channel) / 4] =
+               readl(ioaddr + DMA_CHAN_RX_CONTROL(channel));
+       reg_space[DMA_CHAN_TX_BASE_ADDR(channel) / 4] =
+               readl(ioaddr + DMA_CHAN_TX_BASE_ADDR(channel));
+       reg_space[DMA_CHAN_RX_BASE_ADDR(channel) / 4] =
+               readl(ioaddr + DMA_CHAN_RX_BASE_ADDR(channel));
+       reg_space[DMA_CHAN_TX_END_ADDR(channel) / 4] =
+               readl(ioaddr + DMA_CHAN_TX_END_ADDR(channel));
+       reg_space[DMA_CHAN_RX_END_ADDR(channel) / 4] =
+               readl(ioaddr + DMA_CHAN_RX_END_ADDR(channel));
+       reg_space[DMA_CHAN_TX_RING_LEN(channel) / 4] =
+               readl(ioaddr + DMA_CHAN_TX_RING_LEN(channel));
+       reg_space[DMA_CHAN_RX_RING_LEN(channel) / 4] =
+               readl(ioaddr + DMA_CHAN_RX_RING_LEN(channel));
+       reg_space[DMA_CHAN_INTR_ENA(channel) / 4] =
+               readl(ioaddr + DMA_CHAN_INTR_ENA(channel));
+       reg_space[DMA_CHAN_RX_WATCHDOG(channel) / 4] =
+               readl(ioaddr + DMA_CHAN_RX_WATCHDOG(channel));
+       reg_space[DMA_CHAN_SLOT_CTRL_STATUS(channel) / 4] =
+               readl(ioaddr + DMA_CHAN_SLOT_CTRL_STATUS(channel));
+       reg_space[DMA_CHAN_CUR_TX_DESC(channel) / 4] =
+               readl(ioaddr + DMA_CHAN_CUR_TX_DESC(channel));
+       reg_space[DMA_CHAN_CUR_RX_DESC(channel) / 4] =
+               readl(ioaddr + DMA_CHAN_CUR_RX_DESC(channel));
+       reg_space[DMA_CHAN_CUR_TX_BUF_ADDR(channel) / 4] =
+               readl(ioaddr + DMA_CHAN_CUR_TX_BUF_ADDR(channel));
+       reg_space[DMA_CHAN_CUR_RX_BUF_ADDR(channel) / 4] =
+               readl(ioaddr + DMA_CHAN_CUR_RX_BUF_ADDR(channel));
+       reg_space[DMA_CHAN_STATUS(channel) / 4] =
+               readl(ioaddr + DMA_CHAN_STATUS(channel));
 }
 
-static void dwmac4_dump_dma_regs(void __iomem *ioaddr)
+static void dwmac4_dump_dma_regs(void __iomem *ioaddr, u32 *reg_space)
 {
        int i;
 
-       pr_debug(" GMAC4 DMA registers\n");
-
        for (i = 0; i < DMA_CHANNEL_NB_MAX; i++)
-               _dwmac4_dump_dma_regs(ioaddr, i);
+               _dwmac4_dump_dma_regs(ioaddr, i, reg_space);
 }
 
 static void dwmac4_rx_watchdog(void __iomem *ioaddr, u32 riwt)
index 5ff6bc4eb8f1cc7541433a6c70b25cfa71e458a9..85d64114e159e6d76a03fe5cca839fb246a6e0e6 100644 (file)
@@ -435,32 +435,14 @@ static int stmmac_ethtool_get_regs_len(struct net_device *dev)
 static void stmmac_ethtool_gregs(struct net_device *dev,
                          struct ethtool_regs *regs, void *space)
 {
-       int i;
        u32 *reg_space = (u32 *) space;
 
        struct stmmac_priv *priv = netdev_priv(dev);
 
        memset(reg_space, 0x0, REG_SPACE_SIZE);
 
-       if (priv->plat->has_gmac || priv->plat->has_gmac4) {
-               /* MAC registers */
-               for (i = 0; i < 55; i++)
-                       reg_space[i] = readl(priv->ioaddr + (i * 4));
-               /* DMA registers */
-               for (i = 0; i < 22; i++)
-                       reg_space[i + 55] =
-                           readl(priv->ioaddr + (DMA_BUS_MODE + (i * 4)));
-       } else {
-               /* MAC registers */
-               for (i = 0; i < 12; i++)
-                       reg_space[i] = readl(priv->ioaddr + (i * 4));
-               /* DMA registers */
-               for (i = 0; i < 9; i++)
-                       reg_space[i + 12] =
-                           readl(priv->ioaddr + (DMA_BUS_MODE + (i * 4)));
-               reg_space[22] = readl(priv->ioaddr + DMA_CUR_TX_BUF_ADDR);
-               reg_space[23] = readl(priv->ioaddr + DMA_CUR_RX_BUF_ADDR);
-       }
+       priv->hw->mac->dump_regs(priv->hw, reg_space);
+       priv->hw->dma->dump_regs(priv->ioaddr, reg_space);
 }
 
 static void
index 3cbe09682afe7a719be0a3b83a33215ed50adce9..4498a3861aa3ad09460e922bd7f38e3506889dcb 100644 (file)
@@ -1729,11 +1729,6 @@ static int stmmac_hw_setup(struct net_device *dev, bool init_ptp)
        priv->hw->dma->start_tx(priv->ioaddr);
        priv->hw->dma->start_rx(priv->ioaddr);
 
-       /* Dump DMA/MAC registers */
-       if (netif_msg_hw(priv)) {
-               priv->hw->mac->dump_regs(priv->hw);
-               priv->hw->dma->dump_regs(priv->ioaddr);
-       }
        priv->tx_lpi_timer = STMMAC_DEFAULT_TWT_LS;
 
        if ((priv->use_riwt) && (priv->hw->dma->rx_watchdog)) {
index d6f7838455dd7874b61ddccb31ce70748145c8be..1be69d8bc90948e82f92736b8f7ee9d274b9bd2b 100644 (file)
@@ -146,7 +146,7 @@ static int phy_config_interrupt(struct phy_device *phydev, u32 interrupts)
  */
 int phy_aneg_done(struct phy_device *phydev)
 {
-       if (phydev->drv->aneg_done)
+       if (phydev->drv && phydev->drv->aneg_done)
                return phydev->drv->aneg_done(phydev);
 
        return genphy_aneg_done(phydev);
index 556953f5343720580a9ac44688adb07a70339a44..b7911994112aebecc691117e15ab32a7c238fdf9 100644 (file)
@@ -2035,7 +2035,6 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
        const struct iphdr *old_iph = ip_hdr(skb);
        union vxlan_addr *dst;
        union vxlan_addr remote_ip, local_ip;
-       union vxlan_addr *src;
        struct vxlan_metadata _md;
        struct vxlan_metadata *md = &_md;
        __be16 src_port = 0, dst_port;
@@ -2062,7 +2061,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 
                dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port;
                vni = (rdst->remote_vni) ? : default_vni;
-               src = &vxlan->cfg.saddr;
+               local_ip = vxlan->cfg.saddr;
                dst_cache = &rdst->dst_cache;
                md->gbp = skb->mark;
                ttl = vxlan->cfg.ttl;
@@ -2095,7 +2094,6 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
                dst = &remote_ip;
                dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port;
                vni = tunnel_id_to_key32(info->key.tun_id);
-               src = &local_ip;
                dst_cache = &info->dst_cache;
                if (info->options_len)
                        md = ip_tunnel_info_opts(info);
@@ -2115,7 +2113,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
                rt = vxlan_get_route(vxlan, dev, sock4, skb,
                                     rdst ? rdst->remote_ifindex : 0, tos,
                                     dst->sin.sin_addr.s_addr,
-                                    &src->sin.sin_addr.s_addr,
+                                    &local_ip.sin.sin_addr.s_addr,
                                     dst_port, src_port,
                                     dst_cache, info);
                if (IS_ERR(rt)) {
@@ -2142,7 +2140,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
                if (err < 0)
                        goto tx_error;
 
-               udp_tunnel_xmit_skb(rt, sock4->sock->sk, skb, src->sin.sin_addr.s_addr,
+               udp_tunnel_xmit_skb(rt, sock4->sock->sk, skb, local_ip.sin.sin_addr.s_addr,
                                    dst->sin.sin_addr.s_addr, tos, ttl, df,
                                    src_port, dst_port, xnet, !udp_sum);
 #if IS_ENABLED(CONFIG_IPV6)
@@ -2152,7 +2150,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
                ndst = vxlan6_get_route(vxlan, dev, sock6, skb,
                                        rdst ? rdst->remote_ifindex : 0, tos,
                                        label, &dst->sin6.sin6_addr,
-                                       &src->sin6.sin6_addr,
+                                       &local_ip.sin6.sin6_addr,
                                        dst_port, src_port,
                                        dst_cache, info);
                if (IS_ERR(ndst)) {
@@ -2180,7 +2178,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
                        goto tx_error;
 
                udp_tunnel6_xmit_skb(ndst, sock6->sock->sk, skb, dev,
-                                    &src->sin6.sin6_addr,
+                                    &local_ip.sin6.sin6_addr,
                                     &dst->sin6.sin6_addr, tos, ttl,
                                     label, src_port, dst_port, !udp_sum);
 #endif
@@ -2675,7 +2673,7 @@ static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
 
        if (data[IFLA_VXLAN_ID]) {
                __u32 id = nla_get_u32(data[IFLA_VXLAN_ID]);
-               if (id >= VXLAN_VID_MASK)
+               if (id >= VXLAN_N_VID)
                        return -ERANGE;
        }
 
index 7ce35aec8c765566bd4eb87fd473fdceddf3ec92..f297a9e1864293d4eedfac5dea5957e65af1b25d 100644 (file)
@@ -391,6 +391,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
                        nr_pages = i;
                        if (nr_pages > 0) {
                                len = nr_pages << PAGE_SHIFT;
+                               osd_req_op_extent_update(req, 0, len);
                                break;
                        }
                        goto out_pages;
@@ -771,7 +772,7 @@ static int ceph_writepages_start(struct address_space *mapping,
             wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
             (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
 
-       if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+       if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
                if (ci->i_wrbuffer_ref > 0) {
                        pr_warn_ratelimited(
                                "writepage_start %p %lld forced umount\n",
@@ -1017,8 +1018,7 @@ new_request:
                                        &ci->i_layout, vino,
                                        offset, &len, 0, num_ops,
                                        CEPH_OSD_OP_WRITE,
-                                       CEPH_OSD_FLAG_WRITE |
-                                       CEPH_OSD_FLAG_ONDISK,
+                                       CEPH_OSD_FLAG_WRITE,
                                        snapc, truncate_seq,
                                        truncate_size, false);
                if (IS_ERR(req)) {
@@ -1028,8 +1028,7 @@ new_request:
                                                min(num_ops,
                                                    CEPH_OSD_SLAB_OPS),
                                                CEPH_OSD_OP_WRITE,
-                                               CEPH_OSD_FLAG_WRITE |
-                                               CEPH_OSD_FLAG_ONDISK,
+                                               CEPH_OSD_FLAG_WRITE,
                                                snapc, truncate_seq,
                                                truncate_size, true);
                        BUG_ON(IS_ERR(req));
@@ -1194,7 +1193,7 @@ static int ceph_update_writeable_page(struct file *file,
        int r;
        struct ceph_snap_context *snapc, *oldest;
 
-       if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+       if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
                dout(" page %p forced umount\n", page);
                unlock_page(page);
                return -EIO;
@@ -1681,8 +1680,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
 
        req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
                                    ceph_vino(inode), 0, &len, 0, 1,
-                                   CEPH_OSD_OP_CREATE,
-                                   CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
+                                   CEPH_OSD_OP_CREATE, CEPH_OSD_FLAG_WRITE,
                                    NULL, 0, 0, false);
        if (IS_ERR(req)) {
                err = PTR_ERR(req);
@@ -1699,8 +1697,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
 
        req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
                                    ceph_vino(inode), 0, &len, 1, 3,
-                                   CEPH_OSD_OP_WRITE,
-                                   CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
+                                   CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
                                    NULL, ci->i_truncate_seq,
                                    ci->i_truncate_size, false);
        if (IS_ERR(req)) {
@@ -1873,7 +1870,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
                goto out_unlock;
        }
 
-       wr_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK;
+       wr_req->r_flags = CEPH_OSD_FLAG_WRITE;
        osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL);
        ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc);
        ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid);
index 5bc5d37b121712a2f288ede38b46420d13a2f0e5..4e7421caf3804c49ef052c02a736965e29437876 100644 (file)
@@ -234,7 +234,7 @@ void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp)
                fscache_enable_cookie(ci->fscache, ceph_fscache_can_enable,
                                inode);
                if (fscache_cookie_enabled(ci->fscache)) {
-                       dout("fscache_file_set_cookie %p %p enabing cache\n",
+                       dout("fscache_file_set_cookie %p %p enabling cache\n",
                             inode, filp);
                }
        }
index 94fd76d04683d88103b42ff71a02490201a9783f..cd966f276a8d70ee9a3daa50c46eee5b1284f37a 100644 (file)
@@ -867,7 +867,7 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
 /*
  * Return caps we have registered with the MDS(s) as 'wanted'.
  */
-int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
+int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check)
 {
        struct ceph_cap *cap;
        struct rb_node *p;
@@ -875,7 +875,7 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
 
        for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
                cap = rb_entry(p, struct ceph_cap, ci_node);
-               if (!__cap_is_valid(cap))
+               if (check && !__cap_is_valid(cap))
                        continue;
                if (cap == ci->i_auth_cap)
                        mds_wanted |= cap->mds_wanted;
@@ -1184,6 +1184,13 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
                delayed = 1;
        }
        ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
+       if (want & ~cap->mds_wanted) {
+               /* user space may open/close single file frequently.
+                * This avoids dropping mds_wanted immediately after
+                * requesting new mds_wanted.
+                */
+               __cap_set_timeouts(mdsc, ci);
+       }
 
        cap->issued &= retain;  /* drop bits we don't want */
        if (cap->implemented & ~cap->issued) {
@@ -2084,8 +2091,6 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 
        dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
 
-       ceph_sync_write_wait(inode);
-
        ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
        if (ret < 0)
                goto out;
@@ -2477,23 +2482,22 @@ again:
 
                if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) {
                        int mds_wanted;
-                       if (ACCESS_ONCE(mdsc->fsc->mount_state) ==
+                       if (READ_ONCE(mdsc->fsc->mount_state) ==
                            CEPH_MOUNT_SHUTDOWN) {
                                dout("get_cap_refs %p forced umount\n", inode);
                                *err = -EIO;
                                ret = 1;
                                goto out_unlock;
                        }
-                       mds_wanted = __ceph_caps_mds_wanted(ci);
-                       if ((mds_wanted & need) != need) {
+                       mds_wanted = __ceph_caps_mds_wanted(ci, false);
+                       if (need & ~(mds_wanted & need)) {
                                dout("get_cap_refs %p caps were dropped"
                                     " (session killed?)\n", inode);
                                *err = -ESTALE;
                                ret = 1;
                                goto out_unlock;
                        }
-                       if ((mds_wanted & file_wanted) ==
-                           (file_wanted & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
+                       if (!(file_wanted & ~mds_wanted))
                                ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED;
                }
 
@@ -3404,6 +3408,7 @@ retry:
                        tcap->implemented |= issued;
                        if (cap == ci->i_auth_cap)
                                ci->i_auth_cap = tcap;
+
                        if (!list_empty(&ci->i_cap_flush_list) &&
                            ci->i_auth_cap == tcap) {
                                spin_lock(&mdsc->cap_dirty_lock);
@@ -3417,9 +3422,18 @@ retry:
        } else if (tsession) {
                /* add placeholder for the export tagert */
                int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
+               tcap = new_cap;
                ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0,
                             t_seq - 1, t_mseq, (u64)-1, flag, &new_cap);
 
+               if (!list_empty(&ci->i_cap_flush_list) &&
+                   ci->i_auth_cap == tcap) {
+                       spin_lock(&mdsc->cap_dirty_lock);
+                       list_move_tail(&ci->i_flushing_item,
+                                      &tcap->session->s_cap_flushing);
+                       spin_unlock(&mdsc->cap_dirty_lock);
+               }
+
                __ceph_remove_cap(cap, false);
                goto out_unlock;
        }
@@ -3924,9 +3938,10 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
 }
 
 int ceph_encode_dentry_release(void **p, struct dentry *dentry,
+                              struct inode *dir,
                               int mds, int drop, int unless)
 {
-       struct inode *dir = d_inode(dentry->d_parent);
+       struct dentry *parent = NULL;
        struct ceph_mds_request_release *rel = *p;
        struct ceph_dentry_info *di = ceph_dentry(dentry);
        int force = 0;
@@ -3941,9 +3956,14 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
        spin_lock(&dentry->d_lock);
        if (di->lease_session && di->lease_session->s_mds == mds)
                force = 1;
+       if (!dir) {
+               parent = dget(dentry->d_parent);
+               dir = d_inode(parent);
+       }
        spin_unlock(&dentry->d_lock);
 
        ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
+       dput(parent);
 
        spin_lock(&dentry->d_lock);
        if (ret && di->lease_session && di->lease_session->s_mds == mds) {
index 39ff678e567fcb5c31d9729081119adaa4578def..f2ae393e2c31a2b3dbca7a5f81eeb5b81afa5e1b 100644 (file)
@@ -70,7 +70,7 @@ static int mdsc_show(struct seq_file *s, void *p)
 
                seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
 
-               if (req->r_got_unsafe)
+               if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
                        seq_puts(s, "\t(unsafe)");
                else
                        seq_puts(s, "\t");
index 8ab1fdf0bd49b74f380a578aea92ce738393403d..3e9ad501addfe92f171a40dffb93c65209819cbe 100644 (file)
@@ -371,7 +371,7 @@ more:
                /* hints to request -> mds selection code */
                req->r_direct_mode = USE_AUTH_MDS;
                req->r_direct_hash = ceph_frag_value(frag);
-               req->r_direct_is_hash = true;
+               __set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
                if (fi->last_name) {
                        req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL);
                        if (!req->r_path2) {
@@ -417,7 +417,7 @@ more:
                fi->frag = frag;
                fi->last_readdir = req;
 
-               if (req->r_did_prepopulate) {
+               if (test_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags)) {
                        fi->readdir_cache_idx = req->r_readdir_cache_idx;
                        if (fi->readdir_cache_idx < 0) {
                                /* preclude from marking dir ordered */
@@ -752,7 +752,8 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
                mask |= CEPH_CAP_XATTR_SHARED;
        req->r_args.getattr.mask = cpu_to_le32(mask);
 
-       req->r_locked_dir = dir;
+       req->r_parent = dir;
+       set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
        err = ceph_mdsc_do_request(mdsc, NULL, req);
        err = ceph_handle_snapdir(req, dentry, err);
        dentry = ceph_finish_lookup(req, dentry, err);
@@ -813,7 +814,8 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry,
        }
        req->r_dentry = dget(dentry);
        req->r_num_caps = 2;
-       req->r_locked_dir = dir;
+       req->r_parent = dir;
+       set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
        req->r_args.mknod.mode = cpu_to_le32(mode);
        req->r_args.mknod.rdev = cpu_to_le32(rdev);
        req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
@@ -864,7 +866,8 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
                ceph_mdsc_put_request(req);
                goto out;
        }
-       req->r_locked_dir = dir;
+       req->r_parent = dir;
+       set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
        req->r_dentry = dget(dentry);
        req->r_num_caps = 2;
        req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
@@ -913,7 +916,8 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 
        req->r_dentry = dget(dentry);
        req->r_num_caps = 2;
-       req->r_locked_dir = dir;
+       req->r_parent = dir;
+       set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
        req->r_args.mkdir.mode = cpu_to_le32(mode);
        req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
        req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
@@ -957,7 +961,8 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
        req->r_dentry = dget(dentry);
        req->r_num_caps = 2;
        req->r_old_dentry = dget(old_dentry);
-       req->r_locked_dir = dir;
+       req->r_parent = dir;
+       set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
        req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
        req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
        /* release LINK_SHARED on source inode (mds will lock it) */
@@ -1023,7 +1028,8 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
        }
        req->r_dentry = dget(dentry);
        req->r_num_caps = 2;
-       req->r_locked_dir = dir;
+       req->r_parent = dir;
+       set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
        req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
        req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
        req->r_inode_drop = drop_caps_for_unlink(inode);
@@ -1066,7 +1072,8 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
        req->r_num_caps = 2;
        req->r_old_dentry = dget(old_dentry);
        req->r_old_dentry_dir = old_dir;
-       req->r_locked_dir = new_dir;
+       req->r_parent = new_dir;
+       set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
        req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
        req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
        req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
@@ -1194,7 +1201,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
        struct inode *dir;
 
        if (flags & LOOKUP_RCU) {
-               parent = ACCESS_ONCE(dentry->d_parent);
+               parent = READ_ONCE(dentry->d_parent);
                dir = d_inode_rcu(parent);
                if (!dir)
                        return -ECHILD;
@@ -1237,11 +1244,12 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
                        return -ECHILD;
 
                op = ceph_snap(dir) == CEPH_SNAPDIR ?
-                       CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_GETATTR;
+                       CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
                req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
                if (!IS_ERR(req)) {
                        req->r_dentry = dget(dentry);
-                       req->r_num_caps = op == CEPH_MDS_OP_GETATTR ? 1 : 2;
+                       req->r_num_caps = 2;
+                       req->r_parent = dir;
 
                        mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
                        if (ceph_security_xattr_wanted(dir))
index 180bbef760f2c8c12fd94d458c246014634233dd..e8f11fa565c53ac58fddf402f6ade6320d47d490 100644 (file)
@@ -207,7 +207,8 @@ static int ceph_get_name(struct dentry *parent, char *name,
        req->r_inode = d_inode(child);
        ihold(d_inode(child));
        req->r_ino2 = ceph_vino(d_inode(parent));
-       req->r_locked_dir = d_inode(parent);
+       req->r_parent = d_inode(parent);
+       set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
        req->r_num_caps = 2;
        err = ceph_mdsc_do_request(mdsc, NULL, req);
 
index 045d30d2662485a4207945757659383fb314fa10..26cc95421cca6e62ef10bfd32b18cf1e526b7c51 100644 (file)
@@ -283,7 +283,7 @@ int ceph_open(struct inode *inode, struct file *file)
        spin_lock(&ci->i_ceph_lock);
        if (__ceph_is_any_real_caps(ci) &&
            (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) {
-               int mds_wanted = __ceph_caps_mds_wanted(ci);
+               int mds_wanted = __ceph_caps_mds_wanted(ci, true);
                int issued = __ceph_caps_issued(ci, NULL);
 
                dout("open %p fmode %d want %s issued %s using existing\n",
@@ -379,7 +379,8 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
                mask |= CEPH_CAP_XATTR_SHARED;
        req->r_args.open.mask = cpu_to_le32(mask);
 
-       req->r_locked_dir = dir;           /* caller holds dir->i_mutex */
+       req->r_parent = dir;
+       set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
        err = ceph_mdsc_do_request(mdsc,
                                   (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
                                   req);
@@ -758,9 +759,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
                goto out;
        }
 
-       req->r_flags =  CEPH_OSD_FLAG_ORDERSNAP |
-                       CEPH_OSD_FLAG_ONDISK |
-                       CEPH_OSD_FLAG_WRITE;
+       req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE;
        ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc);
        ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid);
 
@@ -794,89 +793,6 @@ out:
        kfree(aio_work);
 }
 
-/*
- * Write commit request unsafe callback, called to tell us when a
- * request is unsafe (that is, in flight--has been handed to the
- * messenger to send to its target osd).  It is called again when
- * we've received a response message indicating the request is
- * "safe" (its CEPH_OSD_FLAG_ONDISK flag is set), or when a request
- * is completed early (and unsuccessfully) due to a timeout or
- * interrupt.
- *
- * This is used if we requested both an ACK and ONDISK commit reply
- * from the OSD.
- */
-static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
-{
-       struct ceph_inode_info *ci = ceph_inode(req->r_inode);
-
-       dout("%s %p tid %llu %ssafe\n", __func__, req, req->r_tid,
-               unsafe ? "un" : "");
-       if (unsafe) {
-               ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
-               spin_lock(&ci->i_unsafe_lock);
-               list_add_tail(&req->r_unsafe_item,
-                             &ci->i_unsafe_writes);
-               spin_unlock(&ci->i_unsafe_lock);
-
-               complete_all(&req->r_completion);
-       } else {
-               spin_lock(&ci->i_unsafe_lock);
-               list_del_init(&req->r_unsafe_item);
-               spin_unlock(&ci->i_unsafe_lock);
-               ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
-       }
-}
-
-/*
- * Wait on any unsafe replies for the given inode.  First wait on the
- * newest request, and make that the upper bound.  Then, if there are
- * more requests, keep waiting on the oldest as long as it is still older
- * than the original request.
- */
-void ceph_sync_write_wait(struct inode *inode)
-{
-       struct ceph_inode_info *ci = ceph_inode(inode);
-       struct list_head *head = &ci->i_unsafe_writes;
-       struct ceph_osd_request *req;
-       u64 last_tid;
-
-       if (!S_ISREG(inode->i_mode))
-               return;
-
-       spin_lock(&ci->i_unsafe_lock);
-       if (list_empty(head))
-               goto out;
-
-       /* set upper bound as _last_ entry in chain */
-
-       req = list_last_entry(head, struct ceph_osd_request,
-                             r_unsafe_item);
-       last_tid = req->r_tid;
-
-       do {
-               ceph_osdc_get_request(req);
-               spin_unlock(&ci->i_unsafe_lock);
-
-               dout("sync_write_wait on tid %llu (until %llu)\n",
-                    req->r_tid, last_tid);
-               wait_for_completion(&req->r_done_completion);
-               ceph_osdc_put_request(req);
-
-               spin_lock(&ci->i_unsafe_lock);
-               /*
-                * from here on look at first entry in chain, since we
-                * only want to wait for anything older than last_tid
-                */
-               if (list_empty(head))
-                       break;
-               req = list_first_entry(head, struct ceph_osd_request,
-                                      r_unsafe_item);
-       } while (req->r_tid < last_tid);
-out:
-       spin_unlock(&ci->i_unsafe_lock);
-}
-
 static ssize_t
 ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
                       struct ceph_snap_context *snapc,
@@ -915,9 +831,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
                if (ret2 < 0)
                        dout("invalidate_inode_pages2_range returned %d\n", ret2);
 
-               flags = CEPH_OSD_FLAG_ORDERSNAP |
-                       CEPH_OSD_FLAG_ONDISK |
-                       CEPH_OSD_FLAG_WRITE;
+               flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE;
        } else {
                flags = CEPH_OSD_FLAG_READ;
        }
@@ -1116,10 +1030,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
        if (ret < 0)
                dout("invalidate_inode_pages2_range returned %d\n", ret);
 
-       flags = CEPH_OSD_FLAG_ORDERSNAP |
-               CEPH_OSD_FLAG_ONDISK |
-               CEPH_OSD_FLAG_WRITE |
-               CEPH_OSD_FLAG_ACK;
+       flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE;
 
        while ((len = iov_iter_count(from)) > 0) {
                size_t left;
@@ -1165,8 +1076,6 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
                        goto out;
                }
 
-               /* get a second commit callback */
-               req->r_unsafe_callback = ceph_sync_write_unsafe;
                req->r_inode = inode;
 
                osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
@@ -1616,8 +1525,7 @@ static int ceph_zero_partial_object(struct inode *inode,
                                        ceph_vino(inode),
                                        offset, length,
                                        0, 1, op,
-                                       CEPH_OSD_FLAG_WRITE |
-                                       CEPH_OSD_FLAG_ONDISK,
+                                       CEPH_OSD_FLAG_WRITE,
                                        NULL, 0, 0, false);
        if (IS_ERR(req)) {
                ret = PTR_ERR(req);
index 5e659d054b40ae6faac23af26c5321c5af6ff69b..fd8f771f99b7d7c0943170df1003a1c78e423af0 100644 (file)
@@ -499,7 +499,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
        ci->i_rdcache_gen = 0;
        ci->i_rdcache_revoking = 0;
 
-       INIT_LIST_HEAD(&ci->i_unsafe_writes);
        INIT_LIST_HEAD(&ci->i_unsafe_dirops);
        INIT_LIST_HEAD(&ci->i_unsafe_iops);
        spin_lock_init(&ci->i_unsafe_lock);
@@ -583,14 +582,6 @@ int ceph_drop_inode(struct inode *inode)
        return 1;
 }
 
-void ceph_evict_inode(struct inode *inode)
-{
-       /* wait unsafe sync writes */
-       ceph_sync_write_wait(inode);
-       truncate_inode_pages_final(&inode->i_data);
-       clear_inode(inode);
-}
-
 static inline blkcnt_t calc_inode_blocks(u64 size)
 {
        return (size + (1<<9) - 1) >> 9;
@@ -1016,7 +1007,9 @@ out:
 static void update_dentry_lease(struct dentry *dentry,
                                struct ceph_mds_reply_lease *lease,
                                struct ceph_mds_session *session,
-                               unsigned long from_time)
+                               unsigned long from_time,
+                               struct ceph_vino *tgt_vino,
+                               struct ceph_vino *dir_vino)
 {
        struct ceph_dentry_info *di = ceph_dentry(dentry);
        long unsigned duration = le32_to_cpu(lease->duration_ms);
@@ -1024,13 +1017,27 @@ static void update_dentry_lease(struct dentry *dentry,
        long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
        struct inode *dir;
 
+       /*
+        * Make sure dentry's inode matches tgt_vino. NULL tgt_vino means that
+        * we expect a negative dentry.
+        */
+       if (!tgt_vino && d_really_is_positive(dentry))
+               return;
+
+       if (tgt_vino && (d_really_is_negative(dentry) ||
+                       !ceph_ino_compare(d_inode(dentry), tgt_vino)))
+               return;
+
        spin_lock(&dentry->d_lock);
        dout("update_dentry_lease %p duration %lu ms ttl %lu\n",
             dentry, duration, ttl);
 
-       /* make lease_rdcache_gen match directory */
        dir = d_inode(dentry->d_parent);
 
+       /* make sure parent matches dir_vino */
+       if (!ceph_ino_compare(dir, dir_vino))
+               goto out_unlock;
+
        /* only track leases on regular dentries */
        if (ceph_snap(dir) != CEPH_NOSNAP)
                goto out_unlock;
@@ -1108,61 +1115,27 @@ out:
  *
  * Called with snap_rwsem (read).
  */
-int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
-                   struct ceph_mds_session *session)
+int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
 {
+       struct ceph_mds_session *session = req->r_session;
        struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
        struct inode *in = NULL;
-       struct ceph_vino vino;
+       struct ceph_vino tvino, dvino;
        struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
        int err = 0;
 
        dout("fill_trace %p is_dentry %d is_target %d\n", req,
             rinfo->head->is_dentry, rinfo->head->is_target);
 
-#if 0
-       /*
-        * Debugging hook:
-        *
-        * If we resend completed ops to a recovering mds, we get no
-        * trace.  Since that is very rare, pretend this is the case
-        * to ensure the 'no trace' handlers in the callers behave.
-        *
-        * Fill in inodes unconditionally to avoid breaking cap
-        * invariants.
-        */
-       if (rinfo->head->op & CEPH_MDS_OP_WRITE) {
-               pr_info("fill_trace faking empty trace on %lld %s\n",
-                       req->r_tid, ceph_mds_op_name(rinfo->head->op));
-               if (rinfo->head->is_dentry) {
-                       rinfo->head->is_dentry = 0;
-                       err = fill_inode(req->r_locked_dir,
-                                        &rinfo->diri, rinfo->dirfrag,
-                                        session, req->r_request_started, -1);
-               }
-               if (rinfo->head->is_target) {
-                       rinfo->head->is_target = 0;
-                       ininfo = rinfo->targeti.in;
-                       vino.ino = le64_to_cpu(ininfo->ino);
-                       vino.snap = le64_to_cpu(ininfo->snapid);
-                       in = ceph_get_inode(sb, vino);
-                       err = fill_inode(in, &rinfo->targeti, NULL,
-                                        session, req->r_request_started,
-                                        req->r_fmode);
-                       iput(in);
-               }
-       }
-#endif
-
        if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
                dout("fill_trace reply is empty!\n");
-               if (rinfo->head->result == 0 && req->r_locked_dir)
+               if (rinfo->head->result == 0 && req->r_parent)
                        ceph_invalidate_dir_request(req);
                return 0;
        }
 
        if (rinfo->head->is_dentry) {
-               struct inode *dir = req->r_locked_dir;
+               struct inode *dir = req->r_parent;
 
                if (dir) {
                        err = fill_inode(dir, NULL,
@@ -1188,8 +1161,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
                        dname.name = rinfo->dname;
                        dname.len = rinfo->dname_len;
                        dname.hash = full_name_hash(parent, dname.name, dname.len);
-                       vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
-                       vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+                       tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+                       tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
 retry_lookup:
                        dn = d_lookup(parent, &dname);
                        dout("d_lookup on parent=%p name=%.*s got %p\n",
@@ -1206,8 +1179,8 @@ retry_lookup:
                                }
                                err = 0;
                        } else if (d_really_is_positive(dn) &&
-                                  (ceph_ino(d_inode(dn)) != vino.ino ||
-                                   ceph_snap(d_inode(dn)) != vino.snap)) {
+                                  (ceph_ino(d_inode(dn)) != tvino.ino ||
+                                   ceph_snap(d_inode(dn)) != tvino.snap)) {
                                dout(" dn %p points to wrong inode %p\n",
                                     dn, d_inode(dn));
                                d_delete(dn);
@@ -1221,10 +1194,10 @@ retry_lookup:
        }
 
        if (rinfo->head->is_target) {
-               vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
-               vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+               tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+               tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
 
-               in = ceph_get_inode(sb, vino);
+               in = ceph_get_inode(sb, tvino);
                if (IS_ERR(in)) {
                        err = PTR_ERR(in);
                        goto done;
@@ -1233,8 +1206,8 @@ retry_lookup:
 
                err = fill_inode(in, req->r_locked_page, &rinfo->targeti, NULL,
                                session, req->r_request_started,
-                               (!req->r_aborted && rinfo->head->result == 0) ?
-                               req->r_fmode : -1,
+                               (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
+                               rinfo->head->result == 0) ?  req->r_fmode : -1,
                                &req->r_caps_reservation);
                if (err < 0) {
                        pr_err("fill_inode badness %p %llx.%llx\n",
@@ -1247,8 +1220,9 @@ retry_lookup:
         * ignore null lease/binding on snapdir ENOENT, or else we
         * will have trouble splicing in the virtual snapdir later
         */
-       if (rinfo->head->is_dentry && !req->r_aborted &&
-           req->r_locked_dir &&
+       if (rinfo->head->is_dentry &&
+            !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
+           test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
            (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
                                               fsc->mount_options->snapdir_name,
                                               req->r_dentry->d_name.len))) {
@@ -1257,17 +1231,19 @@ retry_lookup:
                 * mknod symlink mkdir  : null -> new inode
                 * unlink               : linked -> null
                 */
-               struct inode *dir = req->r_locked_dir;
+               struct inode *dir = req->r_parent;
                struct dentry *dn = req->r_dentry;
                bool have_dir_cap, have_lease;
 
                BUG_ON(!dn);
                BUG_ON(!dir);
                BUG_ON(d_inode(dn->d_parent) != dir);
-               BUG_ON(ceph_ino(dir) !=
-                      le64_to_cpu(rinfo->diri.in->ino));
-               BUG_ON(ceph_snap(dir) !=
-                      le64_to_cpu(rinfo->diri.in->snapid));
+
+               dvino.ino = le64_to_cpu(rinfo->diri.in->ino);
+               dvino.snap = le64_to_cpu(rinfo->diri.in->snapid);
+
+               BUG_ON(ceph_ino(dir) != dvino.ino);
+               BUG_ON(ceph_snap(dir) != dvino.snap);
 
                /* do we have a lease on the whole dir? */
                have_dir_cap =
@@ -1319,12 +1295,13 @@ retry_lookup:
                                ceph_dir_clear_ordered(dir);
                                dout("d_delete %p\n", dn);
                                d_delete(dn);
-                       } else {
-                               if (have_lease && d_unhashed(dn))
+                       } else if (have_lease) {
+                               if (d_unhashed(dn))
                                        d_add(dn, NULL);
                                update_dentry_lease(dn, rinfo->dlease,
                                                    session,
-                                                   req->r_request_started);
+                                                   req->r_request_started,
+                                                   NULL, &dvino);
                        }
                        goto done;
                }
@@ -1347,15 +1324,19 @@ retry_lookup:
                        have_lease = false;
                }
 
-               if (have_lease)
+               if (have_lease) {
+                       tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+                       tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
                        update_dentry_lease(dn, rinfo->dlease, session,
-                                           req->r_request_started);
+                                           req->r_request_started,
+                                           &tvino, &dvino);
+               }
                dout(" final dn %p\n", dn);
-       } else if (!req->r_aborted &&
-                  (req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
-                   req->r_op == CEPH_MDS_OP_MKSNAP)) {
+       } else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
+                   req->r_op == CEPH_MDS_OP_MKSNAP) &&
+                  !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
                struct dentry *dn = req->r_dentry;
-               struct inode *dir = req->r_locked_dir;
+               struct inode *dir = req->r_parent;
 
                /* fill out a snapdir LOOKUPSNAP dentry */
                BUG_ON(!dn);
@@ -1370,6 +1351,26 @@ retry_lookup:
                        goto done;
                }
                req->r_dentry = dn;  /* may have spliced */
+       } else if (rinfo->head->is_dentry) {
+               struct ceph_vino *ptvino = NULL;
+
+               if ((le32_to_cpu(rinfo->diri.in->cap.caps) & CEPH_CAP_FILE_SHARED) ||
+                   le32_to_cpu(rinfo->dlease->duration_ms)) {
+                       dvino.ino = le64_to_cpu(rinfo->diri.in->ino);
+                       dvino.snap = le64_to_cpu(rinfo->diri.in->snapid);
+
+                       if (rinfo->head->is_target) {
+                               tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+                               tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+                               ptvino = &tvino;
+                       }
+
+                       update_dentry_lease(req->r_dentry, rinfo->dlease,
+                               session, req->r_request_started, ptvino,
+                               &dvino);
+               } else {
+                       dout("%s: no dentry lease or dir cap\n", __func__);
+               }
        }
 done:
        dout("fill_trace done err=%d\n", err);
@@ -1478,7 +1479,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
        u32 fpos_offset;
        struct ceph_readdir_cache_control cache_ctl = {};
 
-       if (req->r_aborted)
+       if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))
                return readdir_prepopulate_inodes_only(req, session);
 
        if (rinfo->hash_order && req->r_path2) {
@@ -1523,14 +1524,14 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
        /* FIXME: release caps/leases if error occurs */
        for (i = 0; i < rinfo->dir_nr; i++) {
                struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
-               struct ceph_vino vino;
+               struct ceph_vino tvino, dvino;
 
                dname.name = rde->name;
                dname.len = rde->name_len;
                dname.hash = full_name_hash(parent, dname.name, dname.len);
 
-               vino.ino = le64_to_cpu(rde->inode.in->ino);
-               vino.snap = le64_to_cpu(rde->inode.in->snapid);
+               tvino.ino = le64_to_cpu(rde->inode.in->ino);
+               tvino.snap = le64_to_cpu(rde->inode.in->snapid);
 
                if (rinfo->hash_order) {
                        u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
@@ -1559,8 +1560,8 @@ retry_lookup:
                                goto out;
                        }
                } else if (d_really_is_positive(dn) &&
-                          (ceph_ino(d_inode(dn)) != vino.ino ||
-                           ceph_snap(d_inode(dn)) != vino.snap)) {
+                          (ceph_ino(d_inode(dn)) != tvino.ino ||
+                           ceph_snap(d_inode(dn)) != tvino.snap)) {
                        dout(" dn %p points to wrong inode %p\n",
                             dn, d_inode(dn));
                        d_delete(dn);
@@ -1572,7 +1573,7 @@ retry_lookup:
                if (d_really_is_positive(dn)) {
                        in = d_inode(dn);
                } else {
-                       in = ceph_get_inode(parent->d_sb, vino);
+                       in = ceph_get_inode(parent->d_sb, tvino);
                        if (IS_ERR(in)) {
                                dout("new_inode badness\n");
                                d_drop(dn);
@@ -1617,8 +1618,9 @@ retry_lookup:
 
                ceph_dentry(dn)->offset = rde->offset;
 
+               dvino = ceph_vino(d_inode(parent));
                update_dentry_lease(dn, rde->lease, req->r_session,
-                                   req->r_request_started);
+                                   req->r_request_started, &tvino, &dvino);
 
                if (err == 0 && skipped == 0 && cache_ctl.index >= 0) {
                        ret = fill_readdir_cache(d_inode(parent), dn,
@@ -1632,7 +1634,7 @@ next_item:
        }
 out:
        if (err == 0 && skipped == 0) {
-               req->r_did_prepopulate = true;
+               set_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags);
                req->r_readdir_cache_idx = cache_ctl.index;
        }
        ceph_readdir_cache_release(&cache_ctl);
@@ -1720,7 +1722,7 @@ static void ceph_invalidate_work(struct work_struct *work)
 
        mutex_lock(&ci->i_truncate_mutex);
 
-       if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+       if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
                pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n",
                                    inode, ceph_ino(inode));
                mapping_set_error(inode->i_mapping, -EIO);
index 7d752d53353a24e742fff660ab33f2435727f67e..4c9c72f26eb90c6fd3693dc8f6ad7e9eda458ca3 100644 (file)
@@ -25,7 +25,7 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
                l.stripe_count = ci->i_layout.stripe_count;
                l.object_size = ci->i_layout.object_size;
                l.data_pool = ci->i_layout.pool_id;
-               l.preferred_osd = (s32)-1;
+               l.preferred_osd = -1;
                if (copy_to_user(arg, &l, sizeof(l)))
                        return -EFAULT;
        }
@@ -97,7 +97,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
                nl.data_pool = ci->i_layout.pool_id;
 
        /* this is obsolete, and always -1 */
-       nl.preferred_osd = le64_to_cpu(-1);
+       nl.preferred_osd = -1;
 
        err = __validate_layout(mdsc, &nl);
        if (err)
index c9d2e553a6c487f01bd11ed4c7a2c15ddfcd058d..c681762d76e66be1edf7004b1de8c13568ec6022 100644 (file)
@@ -547,8 +547,8 @@ void ceph_mdsc_release_request(struct kref *kref)
                ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
                iput(req->r_inode);
        }
-       if (req->r_locked_dir)
-               ceph_put_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
+       if (req->r_parent)
+               ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
        iput(req->r_target_inode);
        if (req->r_dentry)
                dput(req->r_dentry);
@@ -628,6 +628,9 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
 {
        dout("__unregister_request %p tid %lld\n", req, req->r_tid);
 
+       /* Never leave an unregistered request on an unsafe list! */
+       list_del_init(&req->r_unsafe_item);
+
        if (req->r_tid == mdsc->oldest_tid) {
                struct rb_node *p = rb_next(&req->r_node);
                mdsc->oldest_tid = 0;
@@ -644,13 +647,15 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
 
        erase_request(&mdsc->request_tree, req);
 
-       if (req->r_unsafe_dir && req->r_got_unsafe) {
+       if (req->r_unsafe_dir  &&
+           test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
                struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
                spin_lock(&ci->i_unsafe_lock);
                list_del_init(&req->r_unsafe_dir_item);
                spin_unlock(&ci->i_unsafe_lock);
        }
-       if (req->r_target_inode && req->r_got_unsafe) {
+       if (req->r_target_inode &&
+           test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
                struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
                spin_lock(&ci->i_unsafe_lock);
                list_del_init(&req->r_unsafe_target_item);
@@ -667,6 +672,28 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
        ceph_mdsc_put_request(req);
 }
 
+/*
+ * Walk back up the dentry tree until we hit a dentry representing a
+ * non-snapshot inode. We do this using the rcu_read_lock (which must be held
+ * when calling this) to ensure that the objects won't disappear while we're
+ * working with them. Once we hit a candidate dentry, we attempt to take a
+ * reference to it, and return that as the result.
+ */
+static struct inode *get_nonsnap_parent(struct dentry *dentry)
+{
+       struct inode *inode = NULL;
+
+       while (dentry && !IS_ROOT(dentry)) {
+               inode = d_inode_rcu(dentry);
+               if (!inode || ceph_snap(inode) == CEPH_NOSNAP)
+                       break;
+               dentry = dentry->d_parent;
+       }
+       if (inode)
+               inode = igrab(inode);
+       return inode;
+}
+
 /*
  * Choose mds to send request to next.  If there is a hint set in the
  * request (e.g., due to a prior forward hint from the mds), use that.
@@ -675,19 +702,6 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
  *
  * Called under mdsc->mutex.
  */
-static struct dentry *get_nonsnap_parent(struct dentry *dentry)
-{
-       /*
-        * we don't need to worry about protecting the d_parent access
-        * here because we never renaming inside the snapped namespace
-        * except to resplice to another snapdir, and either the old or new
-        * result is a valid result.
-        */
-       while (!IS_ROOT(dentry) && ceph_snap(d_inode(dentry)) != CEPH_NOSNAP)
-               dentry = dentry->d_parent;
-       return dentry;
-}
-
 static int __choose_mds(struct ceph_mds_client *mdsc,
                        struct ceph_mds_request *req)
 {
@@ -697,7 +711,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
        int mode = req->r_direct_mode;
        int mds = -1;
        u32 hash = req->r_direct_hash;
-       bool is_hash = req->r_direct_is_hash;
+       bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
 
        /*
         * is there a specific mds we should try?  ignore hint if we have
@@ -717,30 +731,39 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
        inode = NULL;
        if (req->r_inode) {
                inode = req->r_inode;
+               ihold(inode);
        } else if (req->r_dentry) {
                /* ignore race with rename; old or new d_parent is okay */
-               struct dentry *parent = req->r_dentry->d_parent;
-               struct inode *dir = d_inode(parent);
+               struct dentry *parent;
+               struct inode *dir;
+
+               rcu_read_lock();
+               parent = req->r_dentry->d_parent;
+               dir = req->r_parent ? : d_inode_rcu(parent);
 
-               if (dir->i_sb != mdsc->fsc->sb) {
-                       /* not this fs! */
+               if (!dir || dir->i_sb != mdsc->fsc->sb) {
+                       /*  not this fs or parent went negative */
                        inode = d_inode(req->r_dentry);
+                       if (inode)
+                               ihold(inode);
                } else if (ceph_snap(dir) != CEPH_NOSNAP) {
                        /* direct snapped/virtual snapdir requests
                         * based on parent dir inode */
-                       struct dentry *dn = get_nonsnap_parent(parent);
-                       inode = d_inode(dn);
+                       inode = get_nonsnap_parent(parent);
                        dout("__choose_mds using nonsnap parent %p\n", inode);
                } else {
                        /* dentry target */
                        inode = d_inode(req->r_dentry);
                        if (!inode || mode == USE_AUTH_MDS) {
                                /* dir + name */
-                               inode = dir;
+                               inode = igrab(dir);
                                hash = ceph_dentry_hash(dir, req->r_dentry);
                                is_hash = true;
+                       } else {
+                               ihold(inode);
                        }
                }
+               rcu_read_unlock();
        }
 
        dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
@@ -769,7 +792,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
                                     (int)r, frag.ndist);
                                if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
                                    CEPH_MDS_STATE_ACTIVE)
-                                       return mds;
+                                       goto out;
                        }
 
                        /* since this file/dir wasn't known to be
@@ -784,7 +807,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
                                     inode, ceph_vinop(inode), frag.frag, mds);
                                if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
                                    CEPH_MDS_STATE_ACTIVE)
-                                       return mds;
+                                       goto out;
                        }
                }
        }
@@ -797,6 +820,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
                cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
        if (!cap) {
                spin_unlock(&ci->i_ceph_lock);
+               iput(inode);
                goto random;
        }
        mds = cap->session->s_mds;
@@ -804,6 +828,8 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
             inode, ceph_vinop(inode), mds,
             cap == ci->i_auth_cap ? "auth " : "", cap);
        spin_unlock(&ci->i_ceph_lock);
+out:
+       iput(inode);
        return mds;
 
 random:
@@ -1036,7 +1062,6 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc,
        while (!list_empty(&session->s_unsafe)) {
                req = list_first_entry(&session->s_unsafe,
                                       struct ceph_mds_request, r_unsafe_item);
-               list_del_init(&req->r_unsafe_item);
                pr_warn_ratelimited(" dropping unsafe request %llu\n",
                                    req->r_tid);
                __unregister_request(mdsc, req);
@@ -1146,7 +1171,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
                ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
 
                if (ci->i_wrbuffer_ref > 0 &&
-                   ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
+                   READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
                        invalidate = true;
 
                while (!list_empty(&ci->i_cap_flush_list)) {
@@ -1775,18 +1800,23 @@ retry:
        return path;
 }
 
-static int build_dentry_path(struct dentry *dentry,
+static int build_dentry_path(struct dentry *dentry, struct inode *dir,
                             const char **ppath, int *ppathlen, u64 *pino,
                             int *pfreepath)
 {
        char *path;
 
-       if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_NOSNAP) {
-               *pino = ceph_ino(d_inode(dentry->d_parent));
+       rcu_read_lock();
+       if (!dir)
+               dir = d_inode_rcu(dentry->d_parent);
+       if (dir && ceph_snap(dir) == CEPH_NOSNAP) {
+               *pino = ceph_ino(dir);
+               rcu_read_unlock();
                *ppath = dentry->d_name.name;
                *ppathlen = dentry->d_name.len;
                return 0;
        }
+       rcu_read_unlock();
        path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
        if (IS_ERR(path))
                return PTR_ERR(path);
@@ -1822,8 +1852,8 @@ static int build_inode_path(struct inode *inode,
  * an explicit ino+path.
  */
 static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
-                                 const char *rpath, u64 rino,
-                                 const char **ppath, int *pathlen,
+                                 struct inode *rdiri, const char *rpath,
+                                 u64 rino, const char **ppath, int *pathlen,
                                  u64 *ino, int *freepath)
 {
        int r = 0;
@@ -1833,7 +1863,8 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
                dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
                     ceph_snap(rinode));
        } else if (rdentry) {
-               r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath);
+               r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino,
+                                       freepath);
                dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
                     *ppath);
        } else if (rpath || rino) {
@@ -1866,7 +1897,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
        int ret;
 
        ret = set_request_path_attr(req->r_inode, req->r_dentry,
-                             req->r_path1, req->r_ino1.ino,
+                             req->r_parent, req->r_path1, req->r_ino1.ino,
                              &path1, &pathlen1, &ino1, &freepath1);
        if (ret < 0) {
                msg = ERR_PTR(ret);
@@ -1874,6 +1905,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
        }
 
        ret = set_request_path_attr(NULL, req->r_old_dentry,
+                             req->r_old_dentry_dir,
                              req->r_path2, req->r_ino2.ino,
                              &path2, &pathlen2, &ino2, &freepath2);
        if (ret < 0) {
@@ -1927,10 +1959,13 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
                      mds, req->r_inode_drop, req->r_inode_unless, 0);
        if (req->r_dentry_drop)
                releases += ceph_encode_dentry_release(&p, req->r_dentry,
-                      mds, req->r_dentry_drop, req->r_dentry_unless);
+                               req->r_parent, mds, req->r_dentry_drop,
+                               req->r_dentry_unless);
        if (req->r_old_dentry_drop)
                releases += ceph_encode_dentry_release(&p, req->r_old_dentry,
-                      mds, req->r_old_dentry_drop, req->r_old_dentry_unless);
+                               req->r_old_dentry_dir, mds,
+                               req->r_old_dentry_drop,
+                               req->r_old_dentry_unless);
        if (req->r_old_inode_drop)
                releases += ceph_encode_inode_release(&p,
                      d_inode(req->r_old_dentry),
@@ -2012,7 +2047,7 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
        dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
             req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
 
-       if (req->r_got_unsafe) {
+       if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
                void *p;
                /*
                 * Replay.  Do not regenerate message (and rebuild
@@ -2061,16 +2096,16 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
 
        rhead = msg->front.iov_base;
        rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
-       if (req->r_got_unsafe)
+       if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
                flags |= CEPH_MDS_FLAG_REPLAY;
-       if (req->r_locked_dir)
+       if (req->r_parent)
                flags |= CEPH_MDS_FLAG_WANT_DENTRY;
        rhead->flags = cpu_to_le32(flags);
        rhead->num_fwd = req->r_num_fwd;
        rhead->num_retry = req->r_attempts - 1;
        rhead->ino = 0;
 
-       dout(" r_locked_dir = %p\n", req->r_locked_dir);
+       dout(" r_parent = %p\n", req->r_parent);
        return 0;
 }
 
@@ -2084,8 +2119,8 @@ static int __do_request(struct ceph_mds_client *mdsc,
        int mds = -1;
        int err = 0;
 
-       if (req->r_err || req->r_got_result) {
-               if (req->r_aborted)
+       if (req->r_err || test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
+               if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))
                        __unregister_request(mdsc, req);
                goto out;
        }
@@ -2096,12 +2131,12 @@ static int __do_request(struct ceph_mds_client *mdsc,
                err = -EIO;
                goto finish;
        }
-       if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+       if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
                dout("do_request forced umount\n");
                err = -EIO;
                goto finish;
        }
-       if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_MOUNTING) {
+       if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_MOUNTING) {
                if (mdsc->mdsmap_err) {
                        err = mdsc->mdsmap_err;
                        dout("do_request mdsmap err %d\n", err);
@@ -2215,7 +2250,7 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
        while (p) {
                req = rb_entry(p, struct ceph_mds_request, r_node);
                p = rb_next(p);
-               if (req->r_got_unsafe)
+               if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
                        continue;
                if (req->r_attempts > 0)
                        continue; /* only new requests */
@@ -2250,11 +2285,11 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
 
        dout("do_request on %p\n", req);
 
-       /* take CAP_PIN refs for r_inode, r_locked_dir, r_old_dentry */
+       /* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */
        if (req->r_inode)
                ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
-       if (req->r_locked_dir)
-               ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
+       if (req->r_parent)
+               ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
        if (req->r_old_dentry_dir)
                ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
                                  CEPH_CAP_PIN);
@@ -2289,7 +2324,7 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
        mutex_lock(&mdsc->mutex);
 
        /* only abort if we didn't race with a real reply */
-       if (req->r_got_result) {
+       if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
                err = le32_to_cpu(req->r_reply_info.head->result);
        } else if (err < 0) {
                dout("aborted request %lld with %d\n", req->r_tid, err);
@@ -2301,10 +2336,10 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
                 */
                mutex_lock(&req->r_fill_mutex);
                req->r_err = err;
-               req->r_aborted = true;
+               set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
                mutex_unlock(&req->r_fill_mutex);
 
-               if (req->r_locked_dir &&
+               if (req->r_parent &&
                    (req->r_op & CEPH_MDS_OP_WRITE))
                        ceph_invalidate_dir_request(req);
        } else {
@@ -2323,7 +2358,7 @@ out:
  */
 void ceph_invalidate_dir_request(struct ceph_mds_request *req)
 {
-       struct inode *inode = req->r_locked_dir;
+       struct inode *inode = req->r_parent;
 
        dout("invalidate_dir_request %p (complete, lease(s))\n", inode);
 
@@ -2379,14 +2414,14 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
        }
 
        /* dup? */
-       if ((req->r_got_unsafe && !head->safe) ||
-           (req->r_got_safe && head->safe)) {
+       if ((test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags) && !head->safe) ||
+           (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags) && head->safe)) {
                pr_warn("got a dup %s reply on %llu from mds%d\n",
                           head->safe ? "safe" : "unsafe", tid, mds);
                mutex_unlock(&mdsc->mutex);
                goto out;
        }
-       if (req->r_got_safe) {
+       if (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags)) {
                pr_warn("got unsafe after safe on %llu from mds%d\n",
                           tid, mds);
                mutex_unlock(&mdsc->mutex);
@@ -2425,10 +2460,10 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 
 
        if (head->safe) {
-               req->r_got_safe = true;
+               set_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags);
                __unregister_request(mdsc, req);
 
-               if (req->r_got_unsafe) {
+               if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
                        /*
                         * We already handled the unsafe response, now do the
                         * cleanup.  No need to examine the response; the MDS
@@ -2437,7 +2472,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
                         * useful we could do with a revised return value.
                         */
                        dout("got safe reply %llu, mds%d\n", tid, mds);
-                       list_del_init(&req->r_unsafe_item);
 
                        /* last unsafe request during umount? */
                        if (mdsc->stopping && !__get_oldest_req(mdsc))
@@ -2446,7 +2480,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
                        goto out;
                }
        } else {
-               req->r_got_unsafe = true;
+               set_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags);
                list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
                if (req->r_unsafe_dir) {
                        struct ceph_inode_info *ci =
@@ -2486,7 +2520,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
        /* insert trace into our cache */
        mutex_lock(&req->r_fill_mutex);
        current->journal_info = req;
-       err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
+       err = ceph_fill_trace(mdsc->fsc->sb, req);
        if (err == 0) {
                if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
                                    req->r_op == CEPH_MDS_OP_LSSNAP))
@@ -2500,7 +2534,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
        if (realm)
                ceph_put_snap_realm(mdsc, realm);
 
-       if (err == 0 && req->r_got_unsafe && req->r_target_inode) {
+       if (err == 0 && req->r_target_inode &&
+           test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
                struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
                spin_lock(&ci->i_unsafe_lock);
                list_add_tail(&req->r_unsafe_target_item, &ci->i_unsafe_iops);
@@ -2508,12 +2543,12 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
        }
 out_err:
        mutex_lock(&mdsc->mutex);
-       if (!req->r_aborted) {
+       if (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
                if (err) {
                        req->r_err = err;
                } else {
                        req->r_reply =  ceph_msg_get(msg);
-                       req->r_got_result = true;
+                       set_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags);
                }
        } else {
                dout("reply arrived after request %lld was aborted\n", tid);
@@ -2557,7 +2592,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
                goto out;  /* dup reply? */
        }
 
-       if (req->r_aborted) {
+       if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
                dout("forward tid %llu aborted, unregistering\n", tid);
                __unregister_request(mdsc, req);
        } else if (fwd_seq <= req->r_num_fwd) {
@@ -2567,7 +2602,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
                /* resend. forward race not possible; mds would drop */
                dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
                BUG_ON(req->r_err);
-               BUG_ON(req->r_got_result);
+               BUG_ON(test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags));
                req->r_attempts = 0;
                req->r_num_fwd = fwd_seq;
                req->r_resend_mds = next_mds;
@@ -2732,7 +2767,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
        while (p) {
                req = rb_entry(p, struct ceph_mds_request, r_node);
                p = rb_next(p);
-               if (req->r_got_unsafe)
+               if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
                        continue;
                if (req->r_attempts == 0)
                        continue; /* only old requests */
@@ -3556,7 +3591,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
 {
        u64 want_tid, want_flush;
 
-       if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
+       if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
                return;
 
        dout("sync\n");
@@ -3587,7 +3622,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
  */
 static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped)
 {
-       if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
+       if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
                return true;
        return atomic_read(&mdsc->num_sessions) <= skipped;
 }
index 3c6f77b7bb02107f9edc579fd5dc12ce57c8b7f5..ac0475a2daa749d3d689956cc45a2913c955ca8f 100644 (file)
@@ -202,9 +202,18 @@ struct ceph_mds_request {
        char *r_path1, *r_path2;
        struct ceph_vino r_ino1, r_ino2;
 
-       struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
+       struct inode *r_parent;             /* parent dir inode */
        struct inode *r_target_inode;       /* resulting inode */
 
+#define CEPH_MDS_R_DIRECT_IS_HASH      (1) /* r_direct_hash is valid */
+#define CEPH_MDS_R_ABORTED             (2) /* call was aborted */
+#define CEPH_MDS_R_GOT_UNSAFE          (3) /* got an unsafe reply */
+#define CEPH_MDS_R_GOT_SAFE            (4) /* got a safe reply */
+#define CEPH_MDS_R_GOT_RESULT          (5) /* got a result */
+#define CEPH_MDS_R_DID_PREPOPULATE     (6) /* prepopulated readdir */
+#define CEPH_MDS_R_PARENT_LOCKED       (7) /* is r_parent->i_rwsem wlocked? */
+       unsigned long   r_req_flags;
+
        struct mutex r_fill_mutex;
 
        union ceph_mds_request_args r_args;
@@ -216,7 +225,6 @@ struct ceph_mds_request {
        /* for choosing which mds to send this request to */
        int r_direct_mode;
        u32 r_direct_hash;      /* choose dir frag based on this dentry hash */
-       bool r_direct_is_hash;  /* true if r_direct_hash is valid */
 
        /* data payload is used for xattr ops */
        struct ceph_pagelist *r_pagelist;
@@ -234,7 +242,6 @@ struct ceph_mds_request {
        struct ceph_mds_reply_info_parsed r_reply_info;
        struct page *r_locked_page;
        int r_err;
-       bool r_aborted;
 
        unsigned long r_timeout;  /* optional.  jiffies, 0 is "wait forever" */
        unsigned long r_started;  /* start time to measure timeout against */
@@ -262,9 +269,7 @@ struct ceph_mds_request {
        ceph_mds_request_callback_t r_callback;
        ceph_mds_request_wait_callback_t r_wait_for_completion;
        struct list_head  r_unsafe_item;  /* per-session unsafe list item */
-       bool              r_got_unsafe, r_got_safe, r_got_result;
 
-       bool              r_did_prepopulate;
        long long         r_dir_release_cnt;
        long long         r_dir_ordered_cnt;
        int               r_readdir_cache_idx;
index 6bd20d707bfd885aff2f89a4b7266cc1c05fd5c8..0ec8d0114e57ba80fdc46b1acdc9b7de7373e276 100644 (file)
@@ -757,7 +757,6 @@ static const struct super_operations ceph_super_ops = {
        .destroy_inode  = ceph_destroy_inode,
        .write_inode    = ceph_write_inode,
        .drop_inode     = ceph_drop_inode,
-       .evict_inode    = ceph_evict_inode,
        .sync_fs        = ceph_sync_fs,
        .put_super      = ceph_put_super,
        .show_options   = ceph_show_options,
@@ -952,6 +951,14 @@ static int ceph_register_bdi(struct super_block *sb,
                fsc->backing_dev_info.ra_pages =
                        VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
 
+       if (fsc->mount_options->rsize > fsc->mount_options->rasize &&
+           fsc->mount_options->rsize >= PAGE_SIZE)
+               fsc->backing_dev_info.io_pages =
+                       (fsc->mount_options->rsize + PAGE_SIZE - 1)
+                       >> PAGE_SHIFT;
+       else if (fsc->mount_options->rsize == 0)
+               fsc->backing_dev_info.io_pages = ULONG_MAX;
+
        err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld",
                           atomic_long_inc_return(&bdi_seq));
        if (!err)
index 3373b61faefd0fac7d240438e5bb2dca7e3433db..e9410bcf41135b72d6a782c9d5dbf1df29bcd911 100644 (file)
@@ -45,8 +45,8 @@
 #define ceph_test_mount_opt(fsc, opt) \
        (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
 
-#define CEPH_RSIZE_DEFAULT             0           /* max read size */
-#define CEPH_RASIZE_DEFAULT            (8192*1024) /* readahead */
+#define CEPH_RSIZE_DEFAULT              (64*1024*1024) /* max read size */
+#define CEPH_RASIZE_DEFAULT             (8192*1024)    /* max readahead */
 #define CEPH_MAX_READDIR_DEFAULT        1024
 #define CEPH_MAX_READDIR_BYTES_DEFAULT  (512*1024)
 #define CEPH_SNAPDIRNAME_DEFAULT        ".snap"
@@ -343,7 +343,6 @@ struct ceph_inode_info {
        u32 i_rdcache_gen;      /* incremented each time we get FILE_CACHE. */
        u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
 
-       struct list_head i_unsafe_writes; /* uncommitted sync writes */
        struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */
        struct list_head i_unsafe_iops;   /* uncommitted mds inode ops */
        spinlock_t i_unsafe_lock;
@@ -602,7 +601,7 @@ static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
 }
 
 /* what the mds thinks we want */
-extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
+extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check);
 
 extern void ceph_caps_init(struct ceph_mds_client *mdsc);
 extern void ceph_caps_finalize(struct ceph_mds_client *mdsc);
@@ -753,7 +752,6 @@ extern const struct inode_operations ceph_file_iops;
 extern struct inode *ceph_alloc_inode(struct super_block *sb);
 extern void ceph_destroy_inode(struct inode *inode);
 extern int ceph_drop_inode(struct inode *inode);
-extern void ceph_evict_inode(struct inode *inode);
 
 extern struct inode *ceph_get_inode(struct super_block *sb,
                                    struct ceph_vino vino);
@@ -764,8 +762,7 @@ extern void ceph_fill_file_time(struct inode *inode, int issued,
                                u64 time_warp_seq, struct timespec *ctime,
                                struct timespec *mtime, struct timespec *atime);
 extern int ceph_fill_trace(struct super_block *sb,
-                          struct ceph_mds_request *req,
-                          struct ceph_mds_session *session);
+                          struct ceph_mds_request *req);
 extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
                                    struct ceph_mds_session *session);
 
@@ -904,6 +901,7 @@ extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
 extern int ceph_encode_inode_release(void **p, struct inode *inode,
                                     int mds, int drop, int unless, int force);
 extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
+                                     struct inode *dir,
                                      int mds, int drop, int unless);
 
 extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
@@ -933,7 +931,7 @@ extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
 extern int ceph_release(struct inode *inode, struct file *filp);
 extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
                                  char *data, size_t len);
-extern void ceph_sync_write_wait(struct inode *inode);
+
 /* dir.c */
 extern const struct file_operations ceph_dir_fops;
 extern const struct file_operations ceph_snapdir_fops;
index 1c13dd80744ff99cc0691476c3a2920eca9757cc..7e4ea3b9f4724f2b62f2aa7fe5d89844d07812cd 100644 (file)
@@ -322,6 +322,8 @@ static int lockd_inet6addr_event(struct notifier_block *this,
                dprintk("lockd_inet6addr_event: removed %pI6\n", &ifa->addr);
                sin6.sin6_family = AF_INET6;
                sin6.sin6_addr = ifa->addr;
+               if (ipv6_addr_type(&sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL)
+                       sin6.sin6_scope_id = ifa->idev->dev->ifindex;
                svc_age_temp_xprts_now(nlmsvc_rqst->rq_server,
                        (struct sockaddr *)&sin6);
        }
index eb094c6011d85bb7ce7bd544a3d203defd50b03a..fd0284c1dc328b92520aa0c39b4ca2a4b9899915 100644 (file)
@@ -1083,7 +1083,8 @@ struct svc_version nfs4_callback_version1 = {
        .vs_proc = nfs4_callback_procedures1,
        .vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
        .vs_dispatch = NULL,
-       .vs_hidden = 1,
+       .vs_hidden = true,
+       .vs_need_cong_ctrl = true,
 };
 
 struct svc_version nfs4_callback_version4 = {
@@ -1092,5 +1093,6 @@ struct svc_version nfs4_callback_version4 = {
        .vs_proc = nfs4_callback_procedures1,
        .vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
        .vs_dispatch = NULL,
-       .vs_hidden = 1,
+       .vs_hidden = true,
+       .vs_need_cong_ctrl = true,
 };
index 43e109cc0ccc39e8293a7c8926bcb1c105951714..e71f11b1a180c4c0ff0d3ea30d21b568e3c11511 100644 (file)
@@ -1102,6 +1102,7 @@ static struct flags {
        { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}},
        { NFSEXP_V4ROOT, {"v4root", ""}},
        { NFSEXP_PNFS, {"pnfs", ""}},
+       { NFSEXP_SECURITY_LABEL, {"security_label", ""}},
        { 0, {"", ""}}
 };
 
index d08cd88155c75278c4607f49c078622bf87ab5ee..838f90f3f890a00f0f0989e5c3abb79e20b273d0 100644 (file)
@@ -376,5 +376,4 @@ struct svc_version  nfsd_acl_version2 = {
                .vs_proc        = nfsd_acl_procedures2,
                .vs_dispatch    = nfsd_dispatch,
                .vs_xdrsize     = NFS3_SVC_XDRSIZE,
-               .vs_hidden      = 0,
 };
index 0c890347cde3d9559b0b0103c2c2c11825d51fae..dcb5f79076c0cb3cb12400575cb7e3d3cfa9e26d 100644 (file)
@@ -266,6 +266,5 @@ struct svc_version  nfsd_acl_version3 = {
                .vs_proc        = nfsd_acl_procedures3,
                .vs_dispatch    = nfsd_dispatch,
                .vs_xdrsize     = NFS3_SVC_XDRSIZE,
-               .vs_hidden      = 0,
 };
 
index d818e4ffd79f9acd01c5f08384cd99a1bfad7243..045c9081eabeb0242a0f60d49ec9177dc9c0c6f4 100644 (file)
@@ -193,11 +193,9 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
 
        fh_copy(&resp->fh, &argp->fh);
        resp->committed = argp->stable;
-       nfserr = nfsd_write(rqstp, &resp->fh, NULL,
-                                  argp->offset,
-                                  rqstp->rq_vec, argp->vlen,
-                                  &cnt,
-                                  &resp->committed);
+       nfserr = nfsd_write(rqstp, &resp->fh, argp->offset,
+                               rqstp->rq_vec, argp->vlen,
+                               &cnt, resp->committed);
        resp->count = cnt;
        RETURN_STATUS(nfserr);
 }
index eb78109d666c1a4d8cc62fea22919f824ae93024..0274db6e65d0d6775d0b6c9c9e72e2f0c6a5fa57 100644 (file)
@@ -303,6 +303,7 @@ static int decode_cb_compound4res(struct xdr_stream *xdr,
        p = xdr_inline_decode(xdr, length + 4);
        if (unlikely(p == NULL))
                goto out_overflow;
+       p += XDR_QUADLEN(length);
        hdr->nops = be32_to_cpup(p);
        return 0;
 out_overflow:
@@ -396,13 +397,10 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
                                    struct nfsd4_callback *cb)
 {
        struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
-       struct nfs4_sessionid id;
-       int status;
+       int status = -ESERVERFAULT;
        __be32 *p;
        u32 dummy;
 
-       status = -ESERVERFAULT;
-
        /*
         * If the server returns different values for sessionID, slotID or
         * sequence number, the server is looney tunes.
@@ -410,9 +408,8 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
        p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4 + 4 + 4);
        if (unlikely(p == NULL))
                goto out_overflow;
-       memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN);
-       if (memcmp(id.data, session->se_sessionid.data,
-                                       NFS4_MAX_SESSIONID_LEN) != 0) {
+
+       if (memcmp(p, session->se_sessionid.data, NFS4_MAX_SESSIONID_LEN)) {
                dprintk("NFS: %s Invalid session id\n", __func__);
                goto out;
        }
@@ -753,6 +750,14 @@ int set_callback_cred(void)
        return 0;
 }
 
+void cleanup_callback_cred(void)
+{
+       if (callback_cred) {
+               put_rpccred(callback_cred);
+               callback_cred = NULL;
+       }
+}
+
 static struct rpc_cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc_clnt *client, struct nfsd4_session *ses)
 {
        if (clp->cl_minorversion == 0) {
index 5b20577dcdd233162d8030003758274d7619d038..6b9b6cca469f427fed55ec5d892141e38be23eb4 100644 (file)
@@ -628,6 +628,10 @@ nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen,
 {
        __be32 status;
        u32 id = -1;
+
+       if (name == NULL || namelen == 0)
+               return nfserr_inval;
+
        status = do_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, &id);
        *uid = make_kuid(&init_user_ns, id);
        if (!uid_valid(*uid))
@@ -641,6 +645,10 @@ nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen,
 {
        __be32 status;
        u32 id = -1;
+
+       if (name == NULL || namelen == 0)
+               return nfserr_inval;
+
        status = do_name_to_id(rqstp, IDMAP_TYPE_GROUP, name, namelen, &id);
        *gid = make_kgid(&init_user_ns, id);
        if (!gid_valid(*gid))
index 74a6e573e061afa73fba49d8c65a09b7d470229d..cbeeda1e94a2fbbba61e2adeeb4f9ba89287eaf9 100644 (file)
@@ -95,11 +95,15 @@ check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                   u32 *bmval, u32 *writable)
 {
        struct dentry *dentry = cstate->current_fh.fh_dentry;
+       struct svc_export *exp = cstate->current_fh.fh_export;
 
        if (!nfsd_attrs_supported(cstate->minorversion, bmval))
                return nfserr_attrnotsupp;
        if ((bmval[0] & FATTR4_WORD0_ACL) && !IS_POSIXACL(d_inode(dentry)))
                return nfserr_attrnotsupp;
+       if ((bmval[2] & FATTR4_WORD2_SECURITY_LABEL) &&
+                       !(exp->ex_flags & NFSEXP_SECURITY_LABEL))
+               return nfserr_attrnotsupp;
        if (writable && !bmval_is_subset(bmval, writable))
                return nfserr_inval;
        if (writable && (bmval[2] & FATTR4_WORD2_MODE_UMASK) &&
@@ -983,7 +987,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
        status = nfsd_vfs_write(rqstp, &cstate->current_fh, filp,
                                write->wr_offset, rqstp->rq_vec, nvecs, &cnt,
-                               &write->wr_how_written);
+                               write->wr_how_written);
        fput(filp);
 
        write->wr_bytes_written = cnt;
@@ -1838,6 +1842,12 @@ static inline u32 nfsd4_status_stateid_rsize(struct svc_rqst *rqstp, struct nfsd
        return (op_encode_hdr_size + op_encode_stateid_maxsz)* sizeof(__be32);
 }
 
+static inline u32 nfsd4_access_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+       /* ac_supported, ac_resp_access */
+       return (op_encode_hdr_size + 2)* sizeof(__be32);
+}
+
 static inline u32 nfsd4_commit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
 {
        return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32);
@@ -1892,6 +1902,11 @@ static inline u32 nfsd4_getattr_rsize(struct svc_rqst *rqstp,
        return ret;
 }
 
+static inline u32 nfsd4_getfh_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+       return (op_encode_hdr_size + 1) * sizeof(__be32) + NFS4_FHSIZE;
+}
+
 static inline u32 nfsd4_link_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
 {
        return (op_encode_hdr_size + op_encode_change_info_maxsz)
@@ -1933,6 +1948,11 @@ static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *o
                XDR_QUADLEN(rlen)) * sizeof(__be32);
 }
 
+static inline u32 nfsd4_readlink_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+       return (op_encode_hdr_size + 1) * sizeof(__be32) + PAGE_SIZE;
+}
+
 static inline u32 nfsd4_remove_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
 {
        return (op_encode_hdr_size + op_encode_change_info_maxsz)
@@ -1952,11 +1972,23 @@ static inline u32 nfsd4_sequence_rsize(struct svc_rqst *rqstp,
                + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) * sizeof(__be32);
 }
 
+static inline u32 nfsd4_test_stateid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+       return (op_encode_hdr_size + 1 + op->u.test_stateid.ts_num_ids)
+               * sizeof(__be32);
+}
+
 static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
 {
        return (op_encode_hdr_size + nfs4_fattr_bitmap_maxsz) * sizeof(__be32);
 }
 
+static inline u32 nfsd4_secinfo_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+       return (op_encode_hdr_size + RPC_AUTH_MAXFLAVOR *
+               (4 + XDR_QUADLEN(GSS_OID_MAX_LEN))) * sizeof(__be32);
+}
+
 static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
 {
        return (op_encode_hdr_size + 2 + XDR_QUADLEN(NFS4_VERIFIER_SIZE)) *
@@ -2011,6 +2043,19 @@ static inline u32 nfsd4_copy_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
 }
 
 #ifdef CONFIG_NFSD_PNFS
+static inline u32 nfsd4_getdeviceinfo_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+       u32 maxcount = 0, rlen = 0;
+
+       maxcount = svc_max_payload(rqstp);
+       rlen = min(op->u.getdeviceinfo.gd_maxcount, maxcount);
+
+       return (op_encode_hdr_size +
+               1 /* gd_layout_type*/ +
+               XDR_QUADLEN(rlen) +
+               2 /* gd_notify_types */) * sizeof(__be32);
+}
+
 /*
  * At this stage we don't really know what layout driver will handle the request,
  * so we need to define an arbitrary upper bound here.
@@ -2040,10 +2085,17 @@ static inline u32 nfsd4_layoutreturn_rsize(struct svc_rqst *rqstp, struct nfsd4_
 }
 #endif /* CONFIG_NFSD_PNFS */
 
+
+static inline u32 nfsd4_seek_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+       return (op_encode_hdr_size + 3) * sizeof(__be32);
+}
+
 static struct nfsd4_operation nfsd4_ops[] = {
        [OP_ACCESS] = {
                .op_func = (nfsd4op_func)nfsd4_access,
                .op_name = "OP_ACCESS",
+               .op_rsize_bop = (nfsd4op_rsize)nfsd4_access_rsize,
        },
        [OP_CLOSE] = {
                .op_func = (nfsd4op_func)nfsd4_close,
@@ -2081,6 +2133,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
        [OP_GETFH] = {
                .op_func = (nfsd4op_func)nfsd4_getfh,
                .op_name = "OP_GETFH",
+               .op_rsize_bop = (nfsd4op_rsize)nfsd4_getfh_rsize,
        },
        [OP_LINK] = {
                .op_func = (nfsd4op_func)nfsd4_link,
@@ -2099,6 +2152,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
        [OP_LOCKT] = {
                .op_func = (nfsd4op_func)nfsd4_lockt,
                .op_name = "OP_LOCKT",
+               .op_rsize_bop = (nfsd4op_rsize)nfsd4_lock_rsize,
        },
        [OP_LOCKU] = {
                .op_func = (nfsd4op_func)nfsd4_locku,
@@ -2111,15 +2165,18 @@ static struct nfsd4_operation nfsd4_ops[] = {
                .op_func = (nfsd4op_func)nfsd4_lookup,
                .op_flags = OP_HANDLES_WRONGSEC | OP_CLEAR_STATEID,
                .op_name = "OP_LOOKUP",
+               .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
        },
        [OP_LOOKUPP] = {
                .op_func = (nfsd4op_func)nfsd4_lookupp,
                .op_flags = OP_HANDLES_WRONGSEC | OP_CLEAR_STATEID,
                .op_name = "OP_LOOKUPP",
+               .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
        },
        [OP_NVERIFY] = {
                .op_func = (nfsd4op_func)nfsd4_nverify,
                .op_name = "OP_NVERIFY",
+               .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
        },
        [OP_OPEN] = {
                .op_func = (nfsd4op_func)nfsd4_open,
@@ -2177,6 +2234,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
        [OP_READLINK] = {
                .op_func = (nfsd4op_func)nfsd4_readlink,
                .op_name = "OP_READLINK",
+               .op_rsize_bop = (nfsd4op_rsize)nfsd4_readlink_rsize,
        },
        [OP_REMOVE] = {
                .op_func = (nfsd4op_func)nfsd4_remove,
@@ -2215,6 +2273,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
                .op_func = (nfsd4op_func)nfsd4_secinfo,
                .op_flags = OP_HANDLES_WRONGSEC,
                .op_name = "OP_SECINFO",
+               .op_rsize_bop = (nfsd4op_rsize)nfsd4_secinfo_rsize,
        },
        [OP_SETATTR] = {
                .op_func = (nfsd4op_func)nfsd4_setattr,
@@ -2240,6 +2299,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
        [OP_VERIFY] = {
                .op_func = (nfsd4op_func)nfsd4_verify,
                .op_name = "OP_VERIFY",
+               .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
        },
        [OP_WRITE] = {
                .op_func = (nfsd4op_func)nfsd4_write,
@@ -2314,11 +2374,13 @@ static struct nfsd4_operation nfsd4_ops[] = {
                .op_func = (nfsd4op_func)nfsd4_secinfo_no_name,
                .op_flags = OP_HANDLES_WRONGSEC,
                .op_name = "OP_SECINFO_NO_NAME",
+               .op_rsize_bop = (nfsd4op_rsize)nfsd4_secinfo_rsize,
        },
        [OP_TEST_STATEID] = {
                .op_func = (nfsd4op_func)nfsd4_test_stateid,
                .op_flags = ALLOWED_WITHOUT_FH,
                .op_name = "OP_TEST_STATEID",
+               .op_rsize_bop = (nfsd4op_rsize)nfsd4_test_stateid_rsize,
        },
        [OP_FREE_STATEID] = {
                .op_func = (nfsd4op_func)nfsd4_free_stateid,
@@ -2332,6 +2394,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
                .op_func = (nfsd4op_func)nfsd4_getdeviceinfo,
                .op_flags = ALLOWED_WITHOUT_FH,
                .op_name = "OP_GETDEVICEINFO",
+               .op_rsize_bop = (nfsd4op_rsize)nfsd4_getdeviceinfo_rsize,
        },
        [OP_LAYOUTGET] = {
                .op_func = (nfsd4op_func)nfsd4_layoutget,
@@ -2381,6 +2444,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
        [OP_SEEK] = {
                .op_func = (nfsd4op_func)nfsd4_seek,
                .op_name = "OP_SEEK",
+               .op_rsize_bop = (nfsd4op_rsize)nfsd4_seek_rsize,
        },
 };
 
@@ -2425,14 +2489,11 @@ bool nfsd4_spo_must_allow(struct svc_rqst *rqstp)
 
 int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op)
 {
-       struct nfsd4_operation *opdesc;
-       nfsd4op_rsize estimator;
-
        if (op->opnum == OP_ILLEGAL)
                return op_encode_hdr_size * sizeof(__be32);
-       opdesc = OPDESC(op);
-       estimator = opdesc->op_rsize_bop;
-       return estimator ? estimator(rqstp, op) : PAGE_SIZE;
+
+       BUG_ON(OPDESC(op)->op_rsize_bop == NULL);
+       return OPDESC(op)->op_rsize_bop(rqstp, op);
 }
 
 void warn_on_nonidempotent_op(struct nfsd4_op *op)
@@ -2476,12 +2537,13 @@ static struct svc_procedure             nfsd_procedures4[2] = {
 };
 
 struct svc_version     nfsd_version4 = {
-               .vs_vers        = 4,
-               .vs_nproc       = 2,
-               .vs_proc        = nfsd_procedures4,
-               .vs_dispatch    = nfsd_dispatch,
-               .vs_xdrsize     = NFS4_SVC_XDRSIZE,
-               .vs_rpcb_optnl  = 1,
+       .vs_vers                = 4,
+       .vs_nproc               = 2,
+       .vs_proc                = nfsd_procedures4,
+       .vs_dispatch            = nfsd_dispatch,
+       .vs_xdrsize             = NFS4_SVC_XDRSIZE,
+       .vs_rpcb_optnl          = true,
+       .vs_need_cong_ctrl      = true,
 };
 
 /*
index a0dee8ae9f97f16a18e40ba19f8e84a45ad1a02b..e9ef50addddb4489534bc07f138cd8c321d9193d 100644 (file)
@@ -2281,7 +2281,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_r
 out_err:
        conn->cb_addr.ss_family = AF_UNSPEC;
        conn->cb_addrlen = 0;
-       dprintk(KERN_INFO "NFSD: this client (clientid %08x/%08x) "
+       dprintk("NFSD: this client (clientid %08x/%08x) "
                "will not receive delegations\n",
                clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id);
 
@@ -7012,23 +7012,24 @@ nfs4_state_start(void)
 
        ret = set_callback_cred();
        if (ret)
-               return -ENOMEM;
+               return ret;
+
        laundry_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, "nfsd4");
        if (laundry_wq == NULL) {
                ret = -ENOMEM;
-               goto out_recovery;
+               goto out_cleanup_cred;
        }
        ret = nfsd4_create_callback_queue();
        if (ret)
                goto out_free_laundry;
 
        set_max_delegations();
-
        return 0;
 
 out_free_laundry:
        destroy_workqueue(laundry_wq);
-out_recovery:
+out_cleanup_cred:
+       cleanup_callback_cred();
        return ret;
 }
 
@@ -7086,6 +7087,7 @@ nfs4_state_shutdown(void)
 {
        destroy_workqueue(laundry_wq);
        nfsd4_destroy_callback_queue();
+       cleanup_callback_cred();
 }
 
 static void
index 8fae53ce21d16c8406ff01425d924eb044edee34..382c1fd05b4c8dfe2973d466bae01d6963eb7c43 100644 (file)
@@ -58,7 +58,7 @@
 
 #define NFSDDBG_FACILITY               NFSDDBG_XDR
 
-u32 nfsd_suppattrs[3][3] = {
+const u32 nfsd_suppattrs[3][3] = {
        {NFSD4_SUPPORTED_ATTRS_WORD0,
         NFSD4_SUPPORTED_ATTRS_WORD1,
         NFSD4_SUPPORTED_ATTRS_WORD2},
@@ -1250,7 +1250,7 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
        READ_BUF(16);
        p = xdr_decode_hyper(p, &write->wr_offset);
        write->wr_stable_how = be32_to_cpup(p++);
-       if (write->wr_stable_how > 2)
+       if (write->wr_stable_how > NFS_FILE_SYNC)
                goto xdr_error;
        write->wr_buflen = be32_to_cpup(p++);
 
@@ -1941,12 +1941,12 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
                } else
                        max_reply += nfsd4_max_reply(argp->rqstp, op);
                /*
-                * OP_LOCK may return a conflicting lock.  (Special case
-                * because it will just skip encoding this if it runs
-                * out of xdr buffer space, and it is the only operation
-                * that behaves this way.)
+                * OP_LOCK and OP_LOCKT may return a conflicting lock.
+                * (Special case because it will just skip encoding this
+                * if it runs out of xdr buffer space, and it is the only
+                * operation that behaves this way.)
                 */
-               if (op->opnum == OP_LOCK)
+               if (op->opnum == OP_LOCK || op->opnum == OP_LOCKT)
                        max_reply += NFS4_OPAQUE_LIMIT;
 
                if (op->status) {
@@ -1966,9 +1966,13 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
        DECODE_TAIL;
 }
 
-static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode)
+static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode,
+                            struct svc_export *exp)
 {
-       if (IS_I_VERSION(inode)) {
+       if (exp->ex_flags & NFSEXP_V4ROOT) {
+               *p++ = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time));
+               *p++ = 0;
+       } else if (IS_I_VERSION(inode)) {
                p = xdr_encode_hyper(p, inode->i_version);
        } else {
                *p++ = cpu_to_be32(stat->ctime.tv_sec);
@@ -2417,8 +2421,11 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
 #ifdef CONFIG_NFSD_V4_SECURITY_LABEL
        if ((bmval2 & FATTR4_WORD2_SECURITY_LABEL) ||
             bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) {
-               err = security_inode_getsecctx(d_inode(dentry),
+               if (exp->ex_flags & NFSEXP_SECURITY_LABEL)
+                       err = security_inode_getsecctx(d_inode(dentry),
                                                &context, &contextlen);
+               else
+                       err = -EOPNOTSUPP;
                contextsupport = (err == 0);
                if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
                        if (err == -EOPNOTSUPP)
@@ -2490,7 +2497,7 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
                p = xdr_reserve_space(xdr, 8);
                if (!p)
                        goto out_resource;
-               p = encode_change(p, &stat, d_inode(dentry));
+               p = encode_change(p, &stat, d_inode(dentry), exp);
        }
        if (bmval0 & FATTR4_WORD0_SIZE) {
                p = xdr_reserve_space(xdr, 8);
index f3b2f34b10a3f19cd018c9fd8d176dfae44ea70b..73e75ac905258c17bdc107c0c071e8d14df739f0 100644 (file)
@@ -536,6 +536,19 @@ out_free:
        return rv;
 }
 
+static ssize_t
+nfsd_print_version_support(char *buf, int remaining, const char *sep,
+               unsigned vers, unsigned minor)
+{
+       const char *format = (minor == 0) ? "%s%c%u" : "%s%c%u.%u";
+       bool supported = !!nfsd_vers(vers, NFSD_TEST);
+
+       if (vers == 4 && !nfsd_minorversion(minor, NFSD_TEST))
+               supported = false;
+       return snprintf(buf, remaining, format, sep,
+                       supported ? '+' : '-', vers, minor);
+}
+
 static ssize_t __write_versions(struct file *file, char *buf, size_t size)
 {
        char *mesg = buf;
@@ -561,6 +574,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
                len = qword_get(&mesg, vers, size);
                if (len <= 0) return -EINVAL;
                do {
+                       enum vers_op cmd;
                        sign = *vers;
                        if (sign == '+' || sign == '-')
                                num = simple_strtol((vers+1), &minorp, 0);
@@ -569,24 +583,22 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
                        if (*minorp == '.') {
                                if (num != 4)
                                        return -EINVAL;
-                               minor = simple_strtoul(minorp+1, NULL, 0);
-                               if (minor == 0)
-                                       return -EINVAL;
-                               if (nfsd_minorversion(minor, sign == '-' ?
-                                                    NFSD_CLEAR : NFSD_SET) < 0)
+                               if (kstrtouint(minorp+1, 0, &minor) < 0)
                                        return -EINVAL;
-                               goto next;
-                       }
+                       } else
+                               minor = 0;
+                       cmd = sign == '-' ? NFSD_CLEAR : NFSD_SET;
                        switch(num) {
                        case 2:
                        case 3:
-                       case 4:
-                               nfsd_vers(num, sign == '-' ? NFSD_CLEAR : NFSD_SET);
+                               nfsd_vers(num, cmd);
                                break;
+                       case 4:
+                               if (nfsd_minorversion(minor, cmd) >= 0)
+                                       break;
                        default:
                                return -EINVAL;
                        }
-               next:
                        vers += len + 1;
                } while ((len = qword_get(&mesg, vers, size)) > 0);
                /* If all get turned off, turn them back on, as
@@ -599,35 +611,23 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
        len = 0;
        sep = "";
        remaining = SIMPLE_TRANSACTION_LIMIT;
-       for (num=2 ; num <= 4 ; num++)
-               if (nfsd_vers(num, NFSD_AVAIL)) {
-                       len = snprintf(buf, remaining, "%s%c%d", sep,
-                                      nfsd_vers(num, NFSD_TEST)?'+':'-',
-                                      num);
-                       sep = " ";
-
-                       if (len >= remaining)
-                               break;
-                       remaining -= len;
-                       buf += len;
-                       tlen += len;
-               }
-       if (nfsd_vers(4, NFSD_AVAIL))
-               for (minor = 1; minor <= NFSD_SUPPORTED_MINOR_VERSION;
-                    minor++) {
-                       len = snprintf(buf, remaining, " %c4.%u",
-                                       (nfsd_vers(4, NFSD_TEST) &&
-                                        nfsd_minorversion(minor, NFSD_TEST)) ?
-                                               '+' : '-',
-                                       minor);
-
+       for (num=2 ; num <= 4 ; num++) {
+               if (!nfsd_vers(num, NFSD_AVAIL))
+                       continue;
+               minor = 0;
+               do {
+                       len = nfsd_print_version_support(buf, remaining,
+                                       sep, num, minor);
                        if (len >= remaining)
-                               break;
+                               goto out;
                        remaining -= len;
                        buf += len;
                        tlen += len;
-               }
-
+                       minor++;
+                       sep = " ";
+               } while (num == 4 && minor <= NFSD_SUPPORTED_MINOR_VERSION);
+       }
+out:
        len = snprintf(buf, remaining, "\n");
        if (len >= remaining)
                return -EINVAL;
index d74c8c44dc3536ffdd6a93b0cd340121233e5a95..d96606801d47ae6ee9927a2991263780c00840d7 100644 (file)
@@ -362,16 +362,16 @@ void              nfsd_lockd_shutdown(void);
        FATTR4_WORD2_MODE_UMASK | \
        NFSD4_2_SECURITY_ATTRS)
 
-extern u32 nfsd_suppattrs[3][3];
+extern const u32 nfsd_suppattrs[3][3];
 
-static inline bool bmval_is_subset(u32 *bm1, u32 *bm2)
+static inline bool bmval_is_subset(const u32 *bm1, const u32 *bm2)
 {
        return !((bm1[0] & ~bm2[0]) ||
                 (bm1[1] & ~bm2[1]) ||
                 (bm1[2] & ~bm2[2]));
 }
 
-static inline bool nfsd_attrs_supported(u32 minorversion, u32 *bmval)
+static inline bool nfsd_attrs_supported(u32 minorversion, const u32 *bmval)
 {
        return bmval_is_subset(bmval, nfsd_suppattrs[minorversion]);
 }
index 010aff5c5a79f2e91eaefaa671f77cafdf4c1cb5..fa82b7707e8531f9b7e8065391c3f54387c2740d 100644 (file)
@@ -204,18 +204,14 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp,
                                        struct nfsd_attrstat  *resp)
 {
        __be32  nfserr;
-       int     stable = 1;
        unsigned long cnt = argp->len;
 
        dprintk("nfsd: WRITE    %s %d bytes at %d\n",
                SVCFH_fmt(&argp->fh),
                argp->len, argp->offset);
 
-       nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), NULL,
-                                  argp->offset,
-                                  rqstp->rq_vec, argp->vlen,
-                                  &cnt,
-                                  &stable);
+       nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), argp->offset,
+                               rqstp->rq_vec, argp->vlen, &cnt, NFS_DATA_SYNC);
        return nfsd_return_attrs(nfserr, resp);
 }
 
index e6bfd96734c006587bd1709d0df48818c2065789..efd66da992010ffe5aeb877e2e6f5ab0d850bced 100644 (file)
@@ -153,6 +153,18 @@ int nfsd_vers(int vers, enum vers_op change)
        return 0;
 }
 
+static void
+nfsd_adjust_nfsd_versions4(void)
+{
+       unsigned i;
+
+       for (i = 0; i <= NFSD_SUPPORTED_MINOR_VERSION; i++) {
+               if (nfsd_supported_minorversions[i])
+                       return;
+       }
+       nfsd_vers(4, NFSD_CLEAR);
+}
+
 int nfsd_minorversion(u32 minorversion, enum vers_op change)
 {
        if (minorversion > NFSD_SUPPORTED_MINOR_VERSION)
@@ -160,9 +172,11 @@ int nfsd_minorversion(u32 minorversion, enum vers_op change)
        switch(change) {
        case NFSD_SET:
                nfsd_supported_minorversions[minorversion] = true;
+               nfsd_vers(4, NFSD_SET);
                break;
        case NFSD_CLEAR:
                nfsd_supported_minorversions[minorversion] = false;
+               nfsd_adjust_nfsd_versions4();
                break;
        case NFSD_TEST:
                return nfsd_supported_minorversions[minorversion];
@@ -354,6 +368,8 @@ static int nfsd_inet6addr_event(struct notifier_block *this,
                dprintk("nfsd_inet6addr_event: removed %pI6\n", &ifa->addr);
                sin6.sin6_family = AF_INET6;
                sin6.sin6_addr = ifa->addr;
+               if (ipv6_addr_type(&sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL)
+                       sin6.sin6_scope_id = ifa->idev->dev->ifindex;
                svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin6);
        }
 
index 4516e8b7d776305d94fb89f86256ee3fc54dec27..005c911b34ac4553a2c02da05b4e5d975b660710 100644 (file)
@@ -615,6 +615,7 @@ extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir,
 extern __be32 nfs4_check_open_reclaim(clientid_t *clid,
                struct nfsd4_compound_state *cstate, struct nfsd_net *nn);
 extern int set_callback_cred(void);
+extern void cleanup_callback_cred(void);
 extern void nfsd4_probe_callback(struct nfs4_client *clp);
 extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
 extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
index 26c6fdb4bf67cf1e3e3a843e8e816d7a76eae265..19d50f600e8d48c6f493130076606a6213de258d 100644 (file)
@@ -377,7 +377,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
        __be32          err;
        int             host_err;
        bool            get_write_count;
-       int             size_change = 0;
+       bool            size_change = (iap->ia_valid & ATTR_SIZE);
 
        if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE))
                accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE;
@@ -390,11 +390,11 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
        /* Get inode */
        err = fh_verify(rqstp, fhp, ftype, accmode);
        if (err)
-               goto out;
+               return err;
        if (get_write_count) {
                host_err = fh_want_write(fhp);
                if (host_err)
-                       return nfserrno(host_err);
+                       goto out;
        }
 
        dentry = fhp->fh_dentry;
@@ -405,20 +405,28 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
                iap->ia_valid &= ~ATTR_MODE;
 
        if (!iap->ia_valid)
-               goto out;
+               return 0;
 
        nfsd_sanitize_attrs(inode, iap);
 
+       if (check_guard && guardtime != inode->i_ctime.tv_sec)
+               return nfserr_notsync;
+
        /*
         * The size case is special, it changes the file in addition to the
-        * attributes.
+        * attributes, and file systems don't expect it to be mixed with
+        * "random" attribute changes.  We thus split out the size change
+        * into a separate call to ->setattr, and do the rest as a separate
+        * setattr call.
         */
-       if (iap->ia_valid & ATTR_SIZE) {
+       if (size_change) {
                err = nfsd_get_write_access(rqstp, fhp, iap);
                if (err)
-                       goto out;
-               size_change = 1;
+                       return err;
+       }
 
+       fh_lock(fhp);
+       if (size_change) {
                /*
                 * RFC5661, Section 18.30.4:
                 *   Changing the size of a file with SETATTR indirectly
@@ -426,29 +434,36 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
                 *
                 * (and similar for the older RFCs)
                 */
-               if (iap->ia_size != i_size_read(inode))
-                       iap->ia_valid |= ATTR_MTIME;
-       }
+               struct iattr size_attr = {
+                       .ia_valid       = ATTR_SIZE | ATTR_CTIME | ATTR_MTIME,
+                       .ia_size        = iap->ia_size,
+               };
 
-       iap->ia_valid |= ATTR_CTIME;
+               host_err = notify_change(dentry, &size_attr, NULL);
+               if (host_err)
+                       goto out_unlock;
+               iap->ia_valid &= ~ATTR_SIZE;
 
-       if (check_guard && guardtime != inode->i_ctime.tv_sec) {
-               err = nfserr_notsync;
-               goto out_put_write_access;
+               /*
+                * Avoid the additional setattr call below if the only other
+                * attribute that the client sends is the mtime, as we update
+                * it as part of the size change above.
+                */
+               if ((iap->ia_valid & ~ATTR_MTIME) == 0)
+                       goto out_unlock;
        }
 
-       fh_lock(fhp);
+       iap->ia_valid |= ATTR_CTIME;
        host_err = notify_change(dentry, iap, NULL);
-       fh_unlock(fhp);
-       err = nfserrno(host_err);
 
-out_put_write_access:
+out_unlock:
+       fh_unlock(fhp);
        if (size_change)
                put_write_access(inode);
-       if (!err)
-               err = nfserrno(commit_metadata(fhp));
 out:
-       return err;
+       if (!host_err)
+               host_err = commit_metadata(fhp);
+       return nfserrno(host_err);
 }
 
 #if defined(CONFIG_NFSD_V4)
@@ -940,14 +955,12 @@ static int wait_for_concurrent_writes(struct file *file)
 __be32
 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
                                loff_t offset, struct kvec *vec, int vlen,
-                               unsigned long *cnt, int *stablep)
+                               unsigned long *cnt, int stable)
 {
        struct svc_export       *exp;
-       struct inode            *inode;
        mm_segment_t            oldfs;
        __be32                  err = 0;
        int                     host_err;
-       int                     stable = *stablep;
        int                     use_wgather;
        loff_t                  pos = offset;
        unsigned int            pflags = current->flags;
@@ -962,13 +975,11 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
                 */
                current->flags |= PF_LESS_THROTTLE;
 
-       inode = file_inode(file);
-       exp   = fhp->fh_export;
-
+       exp = fhp->fh_export;
        use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp);
 
        if (!EX_ISSYNC(exp))
-               stable = 0;
+               stable = NFS_UNSTABLE;
 
        if (stable && !use_wgather)
                flags |= RWF_SYNC;
@@ -1035,35 +1046,22 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
  * N.B. After this call fhp needs an fh_put
  */
 __be32
-nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
-               loff_t offset, struct kvec *vec, int vlen, unsigned long *cnt,
-               int *stablep)
+nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
+          struct kvec *vec, int vlen, unsigned long *cnt, int stable)
 {
-       __be32                  err = 0;
+       struct file *file = NULL;
+       __be32 err = 0;
 
        trace_write_start(rqstp, fhp, offset, vlen);
 
-       if (file) {
-               err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
-                               NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE);
-               if (err)
-                       goto out;
-               trace_write_opened(rqstp, fhp, offset, vlen);
-               err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt,
-                               stablep);
-               trace_write_io_done(rqstp, fhp, offset, vlen);
-       } else {
-               err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file);
-               if (err)
-                       goto out;
+       err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file);
+       if (err)
+               goto out;
 
-               trace_write_opened(rqstp, fhp, offset, vlen);
-               if (cnt)
-                       err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen,
-                                            cnt, stablep);
-               trace_write_io_done(rqstp, fhp, offset, vlen);
-               fput(file);
-       }
+       trace_write_opened(rqstp, fhp, offset, vlen);
+       err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, stable);
+       trace_write_io_done(rqstp, fhp, offset, vlen);
+       fput(file);
 out:
        trace_write_done(rqstp, fhp, offset, vlen);
        return err;
index 0bf9e7bf5800af3855e3d93aaec194dcbea93ba6..db98c48c735aaae5a914a6e2073391ceab436a2f 100644 (file)
@@ -83,12 +83,12 @@ __be32              nfsd_readv(struct file *, loff_t, struct kvec *, int,
                                unsigned long *);
 __be32                 nfsd_read(struct svc_rqst *, struct svc_fh *,
                                loff_t, struct kvec *, int, unsigned long *);
-__be32                 nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *,
-                               loff_t, struct kvec *,int, unsigned long *, int *);
+__be32                 nfsd_write(struct svc_rqst *, struct svc_fh *, loff_t,
+                               struct kvec *, int, unsigned long *, int);
 __be32         nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
                                struct file *file, loff_t offset,
                                struct kvec *vec, int vlen, unsigned long *cnt,
-                               int *stablep);
+                               int stable);
 __be32         nfsd_readlink(struct svc_rqst *, struct svc_fh *,
                                char *, int *);
 __be32         nfsd_symlink(struct svc_rqst *, struct svc_fh *,
index 03a6653d329a01b90803d373f7a955230d787675..2ea0c282f3dc9326f7b3c4b7a3883758831ed251 100644 (file)
@@ -22,7 +22,6 @@ struct ceph_osd_client;
  * completion callback for async writepages
  */
 typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *);
-typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool);
 
 #define CEPH_HOMELESS_OSD      -1
 
@@ -170,15 +169,12 @@ struct ceph_osd_request {
        unsigned int            r_num_ops;
 
        int               r_result;
-       bool              r_got_reply;
 
        struct ceph_osd_client *r_osdc;
        struct kref       r_kref;
        bool              r_mempool;
-       struct completion r_completion;
-       struct completion r_done_completion;  /* fsync waiter */
+       struct completion r_completion;       /* private to osd_client.c */
        ceph_osdc_callback_t r_callback;
-       ceph_osdc_unsafe_callback_t r_unsafe_callback;
        struct list_head  r_unsafe_item;
 
        struct inode *r_inode;                /* for use by callbacks */
index 9a9041784dcff383169a169c39203dd79f383438..938656f708078e8a2fd078bc7cace623a32b755c 100644 (file)
@@ -57,7 +57,7 @@ static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool)
        case CEPH_POOL_TYPE_EC:
                return false;
        default:
-               BUG_ON(1);
+               BUG();
        }
 }
 
@@ -81,13 +81,6 @@ void ceph_oloc_copy(struct ceph_object_locator *dest,
                    const struct ceph_object_locator *src);
 void ceph_oloc_destroy(struct ceph_object_locator *oloc);
 
-/*
- * Maximum supported by kernel client object name length
- *
- * (probably outdated: must be >= RBD_MAX_MD_NAME_LEN -- currently 100)
- */
-#define CEPH_MAX_OID_NAME_LEN 100
-
 /*
  * 51-char inline_name is long enough for all cephfs and all but one
  * rbd requests: <imgname> in "<imgname>.rbd"/"rbd_id.<imgname>" can be
@@ -173,8 +166,8 @@ struct ceph_osdmap {
         * the list of osds that store+replicate them. */
        struct crush_map *crush;
 
-       struct mutex crush_scratch_mutex;
-       int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3];
+       struct mutex crush_workspace_mutex;
+       void *crush_workspace;
 };
 
 static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd)
index 5c0da61cb763124c651e3287f4ecaf69f8cdfbc8..5d0018782d504ce61fa3dff2ed526636302b7ef4 100644 (file)
@@ -50,7 +50,7 @@ struct ceph_timespec {
 #define CEPH_PG_LAYOUT_LINEAR 2
 #define CEPH_PG_LAYOUT_HYBRID 3
 
-#define CEPH_PG_MAX_SIZE      16  /* max # osds in a single pg */
+#define CEPH_PG_MAX_SIZE      32  /* max # osds in a single pg */
 
 /*
  * placement group.
index 811f7a915658d6c7063d07eb7bfd046db3d6ad01..76e28c22980586a6aa1cc3114b79de7cb5001fcd 100644 (file)
 #endif
 #endif
 
+#ifdef CONFIG_STACK_VALIDATION
+#define annotate_unreachable() ({                                      \
+       asm("%c0:\t\n"                                                  \
+           ".pushsection __unreachable, \"a\"\t\n"                     \
+           ".long %c0b\t\n"                                            \
+           ".popsection\t\n" : : "i" (__LINE__));                      \
+})
+#else
+#define annotate_unreachable()
+#endif
+
 /*
  * Mark a position in code as unreachable.  This can be used to
  * suppress control flow warnings after asm blocks that transfer
  * this in the preprocessor, but we can live with this because they're
  * unreleased.  Really, we need to have autoconf for the kernel.
  */
-#define unreachable() __builtin_unreachable()
+#define unreachable() \
+       do { annotate_unreachable(); __builtin_unreachable(); } while (0)
 
 /* Mark a function definition as prohibited from being cloned. */
 #define __noclone      __attribute__((__noclone__, __optimize__("no-tracer")))
index be8f12b8f1950499380c10de27ab6928df25fd8e..fbecbd089d75f4a9eead25b1439e6629893c3639 100644 (file)
@@ -135,13 +135,6 @@ struct crush_bucket {
        __u32 size;      /* num items */
        __s32 *items;
 
-       /*
-        * cached random permutation: used for uniform bucket and for
-        * the linear search fallback for the other bucket types.
-        */
-       __u32 perm_x;  /* @x for which *perm is defined */
-       __u32 perm_n;  /* num elements of *perm that are permuted/defined */
-       __u32 *perm;
 };
 
 struct crush_bucket_uniform {
@@ -211,6 +204,21 @@ struct crush_map {
         * device fails. */
        __u8 chooseleaf_stable;
 
+       /*
+        * This value is calculated after decode or construction by
+        * the builder. It is exposed here (rather than having a
+        * 'build CRUSH working space' function) so that callers can
+        * reserve a static buffer, allocate space on the stack, or
+        * otherwise avoid calling into the heap allocator if they
+        * want to. The size of the working space depends on the map,
+        * while the size of the scratch vector passed to the mapper
+        * depends on the size of the desired result set.
+        *
+        * Nothing stops the caller from allocating both in one swell
+        * foop and passing in two points, though.
+        */
+       size_t working_size;
+
 #ifndef __KERNEL__
        /*
         * version 0 (original) of straw_calc has various flaws.  version 1
@@ -248,4 +256,23 @@ static inline int crush_calc_tree_node(int i)
        return ((i+1) << 1)-1;
 }
 
+/*
+ * These data structures are private to the CRUSH implementation. They
+ * are exposed in this header file because builder needs their
+ * definitions to calculate the total working size.
+ *
+ * Moving this out of the crush map allow us to treat the CRUSH map as
+ * immutable within the mapper and removes the requirement for a CRUSH
+ * map lock.
+ */
+struct crush_work_bucket {
+       __u32 perm_x; /* @x for which *perm is defined */
+       __u32 perm_n; /* num elements of *perm that are permuted/defined */
+       __u32 *perm;  /* Permutation of the bucket's items */
+};
+
+struct crush_work {
+       struct crush_work_bucket **work; /* Per-bucket working store */
+};
+
 #endif
index 5dfd5b1125d2b257a4a00d1e77661613ca2227ec..c95e19e1ff11c5f69e4b3d05d328ce071ef74856 100644 (file)
@@ -15,6 +15,20 @@ extern int crush_do_rule(const struct crush_map *map,
                         int ruleno,
                         int x, int *result, int result_max,
                         const __u32 *weights, int weight_max,
-                        int *scratch);
+                        void *cwin);
+
+/*
+ * Returns the exact amount of workspace that will need to be used
+ * for a given combination of crush_map and result_max. The caller can
+ * then allocate this much on its own, either on the stack, in a
+ * per-thread long-lived buffer, or however it likes.
+ */
+static inline size_t crush_work_size(const struct crush_map *map,
+                                    int result_max)
+{
+       return map->working_size + result_max * 3 * sizeof(__u32);
+}
+
+void crush_init_workspace(const struct crush_map *map, void *v);
 
 #endif
index 600aadf9cca445a437f97a5c71a21febf537845f..0023fee4bbbcb4f38c4d3f6f9181e57261c4188a 100644 (file)
@@ -1,54 +1,10 @@
 #ifndef _LINUX_REFCOUNT_H
 #define _LINUX_REFCOUNT_H
 
-/*
- * Variant of atomic_t specialized for reference counts.
- *
- * The interface matches the atomic_t interface (to aid in porting) but only
- * provides the few functions one should use for reference counting.
- *
- * It differs in that the counter saturates at UINT_MAX and will not move once
- * there. This avoids wrapping the counter and causing 'spurious'
- * use-after-free issues.
- *
- * Memory ordering rules are slightly relaxed wrt regular atomic_t functions
- * and provide only what is strictly required for refcounts.
- *
- * The increments are fully relaxed; these will not provide ordering. The
- * rationale is that whatever is used to obtain the object we're increasing the
- * reference count on will provide the ordering. For locked data structures,
- * its the lock acquire, for RCU/lockless data structures its the dependent
- * load.
- *
- * Do note that inc_not_zero() provides a control dependency which will order
- * future stores against the inc, this ensures we'll never modify the object
- * if we did not in fact acquire a reference.
- *
- * The decrements will provide release order, such that all the prior loads and
- * stores will be issued before, it also provides a control dependency, which
- * will order us against the subsequent free().
- *
- * The control dependency is against the load of the cmpxchg (ll/sc) that
- * succeeded. This means the stores aren't fully ordered, but this is fine
- * because the 1->0 transition indicates no concurrency.
- *
- * Note that the allocator is responsible for ordering things between free()
- * and alloc().
- *
- */
-
 #include <linux/atomic.h>
-#include <linux/bug.h>
 #include <linux/mutex.h>
 #include <linux/spinlock.h>
-
-#ifdef CONFIG_DEBUG_REFCOUNT
-#define REFCOUNT_WARN(cond, str) WARN_ON(cond)
-#define __refcount_check       __must_check
-#else
-#define REFCOUNT_WARN(cond, str) (void)(cond)
-#define __refcount_check
-#endif
+#include <linux/kernel.h>
 
 typedef struct refcount_struct {
        atomic_t refs;
@@ -66,229 +22,21 @@ static inline unsigned int refcount_read(const refcount_t *r)
        return atomic_read(&r->refs);
 }
 
-static inline __refcount_check
-bool refcount_add_not_zero(unsigned int i, refcount_t *r)
-{
-       unsigned int old, new, val = atomic_read(&r->refs);
-
-       for (;;) {
-               if (!val)
-                       return false;
-
-               if (unlikely(val == UINT_MAX))
-                       return true;
-
-               new = val + i;
-               if (new < val)
-                       new = UINT_MAX;
-               old = atomic_cmpxchg_relaxed(&r->refs, val, new);
-               if (old == val)
-                       break;
-
-               val = old;
-       }
-
-       REFCOUNT_WARN(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n");
-
-       return true;
-}
-
-static inline void refcount_add(unsigned int i, refcount_t *r)
-{
-       REFCOUNT_WARN(!refcount_add_not_zero(i, r), "refcount_t: addition on 0; use-after-free.\n");
-}
-
-/*
- * Similar to atomic_inc_not_zero(), will saturate at UINT_MAX and WARN.
- *
- * Provides no memory ordering, it is assumed the caller has guaranteed the
- * object memory to be stable (RCU, etc.). It does provide a control dependency
- * and thereby orders future stores. See the comment on top.
- */
-static inline __refcount_check
-bool refcount_inc_not_zero(refcount_t *r)
-{
-       unsigned int old, new, val = atomic_read(&r->refs);
-
-       for (;;) {
-               new = val + 1;
-
-               if (!val)
-                       return false;
-
-               if (unlikely(!new))
-                       return true;
-
-               old = atomic_cmpxchg_relaxed(&r->refs, val, new);
-               if (old == val)
-                       break;
-
-               val = old;
-       }
-
-       REFCOUNT_WARN(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n");
-
-       return true;
-}
-
-/*
- * Similar to atomic_inc(), will saturate at UINT_MAX and WARN.
- *
- * Provides no memory ordering, it is assumed the caller already has a
- * reference on the object, will WARN when this is not so.
- */
-static inline void refcount_inc(refcount_t *r)
-{
-       REFCOUNT_WARN(!refcount_inc_not_zero(r), "refcount_t: increment on 0; use-after-free.\n");
-}
-
-/*
- * Similar to atomic_dec_and_test(), it will WARN on underflow and fail to
- * decrement when saturated at UINT_MAX.
- *
- * Provides release memory ordering, such that prior loads and stores are done
- * before, and provides a control dependency such that free() must come after.
- * See the comment on top.
- */
-static inline __refcount_check
-bool refcount_sub_and_test(unsigned int i, refcount_t *r)
-{
-       unsigned int old, new, val = atomic_read(&r->refs);
-
-       for (;;) {
-               if (unlikely(val == UINT_MAX))
-                       return false;
-
-               new = val - i;
-               if (new > val) {
-                       REFCOUNT_WARN(new > val, "refcount_t: underflow; use-after-free.\n");
-                       return false;
-               }
-
-               old = atomic_cmpxchg_release(&r->refs, val, new);
-               if (old == val)
-                       break;
-
-               val = old;
-       }
-
-       return !new;
-}
-
-static inline __refcount_check
-bool refcount_dec_and_test(refcount_t *r)
-{
-       return refcount_sub_and_test(1, r);
-}
+extern __must_check bool refcount_add_not_zero(unsigned int i, refcount_t *r);
+extern void refcount_add(unsigned int i, refcount_t *r);
 
-/*
- * Similar to atomic_dec(), it will WARN on underflow and fail to decrement
- * when saturated at UINT_MAX.
- *
- * Provides release memory ordering, such that prior loads and stores are done
- * before.
- */
-static inline
-void refcount_dec(refcount_t *r)
-{
-       REFCOUNT_WARN(refcount_dec_and_test(r), "refcount_t: decrement hit 0; leaking memory.\n");
-}
-
-/*
- * No atomic_t counterpart, it attempts a 1 -> 0 transition and returns the
- * success thereof.
- *
- * Like all decrement operations, it provides release memory order and provides
- * a control dependency.
- *
- * It can be used like a try-delete operator; this explicit case is provided
- * and not cmpxchg in generic, because that would allow implementing unsafe
- * operations.
- */
-static inline __refcount_check
-bool refcount_dec_if_one(refcount_t *r)
-{
-       return atomic_cmpxchg_release(&r->refs, 1, 0) == 1;
-}
-
-/*
- * No atomic_t counterpart, it decrements unless the value is 1, in which case
- * it will return false.
- *
- * Was often done like: atomic_add_unless(&var, -1, 1)
- */
-static inline __refcount_check
-bool refcount_dec_not_one(refcount_t *r)
-{
-       unsigned int old, new, val = atomic_read(&r->refs);
+extern __must_check bool refcount_inc_not_zero(refcount_t *r);
+extern void refcount_inc(refcount_t *r);
 
-       for (;;) {
-               if (unlikely(val == UINT_MAX))
-                       return true;
+extern __must_check bool refcount_sub_and_test(unsigned int i, refcount_t *r);
+extern void refcount_sub(unsigned int i, refcount_t *r);
 
-               if (val == 1)
-                       return false;
+extern __must_check bool refcount_dec_and_test(refcount_t *r);
+extern void refcount_dec(refcount_t *r);
 
-               new = val - 1;
-               if (new > val) {
-                       REFCOUNT_WARN(new > val, "refcount_t: underflow; use-after-free.\n");
-                       return true;
-               }
-
-               old = atomic_cmpxchg_release(&r->refs, val, new);
-               if (old == val)
-                       break;
-
-               val = old;
-       }
-
-       return true;
-}
-
-/*
- * Similar to atomic_dec_and_mutex_lock(), it will WARN on underflow and fail
- * to decrement when saturated at UINT_MAX.
- *
- * Provides release memory ordering, such that prior loads and stores are done
- * before, and provides a control dependency such that free() must come after.
- * See the comment on top.
- */
-static inline __refcount_check
-bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock)
-{
-       if (refcount_dec_not_one(r))
-               return false;
-
-       mutex_lock(lock);
-       if (!refcount_dec_and_test(r)) {
-               mutex_unlock(lock);
-               return false;
-       }
-
-       return true;
-}
-
-/*
- * Similar to atomic_dec_and_lock(), it will WARN on underflow and fail to
- * decrement when saturated at UINT_MAX.
- *
- * Provides release memory ordering, such that prior loads and stores are done
- * before, and provides a control dependency such that free() must come after.
- * See the comment on top.
- */
-static inline __refcount_check
-bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock)
-{
-       if (refcount_dec_not_one(r))
-               return false;
-
-       spin_lock(lock);
-       if (!refcount_dec_and_test(r)) {
-               spin_unlock(lock);
-               return false;
-       }
-
-       return true;
-}
+extern __must_check bool refcount_dec_if_one(refcount_t *r);
+extern __must_check bool refcount_dec_not_one(refcount_t *r);
+extern __must_check bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock);
+extern __must_check bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock);
 
 #endif /* _LINUX_REFCOUNT_H */
index 8a511c0985aafe0a18722c9dd701cf8326dcad59..20d157a518a7dcb14763f5bfd7e317c0a6537387 100644 (file)
@@ -204,8 +204,11 @@ static inline void cache_put(struct cache_head *h, struct cache_detail *cd)
        kref_put(&h->ref, cd->cache_put);
 }
 
-static inline int cache_is_expired(struct cache_detail *detail, struct cache_head *h)
+static inline bool cache_is_expired(struct cache_detail *detail, struct cache_head *h)
 {
+       if (!test_bit(CACHE_VALID, &h->flags))
+               return false;
+
        return  (h->expiry_time < seconds_since_boot()) ||
                (detail->flush_time >= h->last_refresh);
 }
@@ -227,6 +230,7 @@ extern void sunrpc_destroy_cache_detail(struct cache_detail *cd);
 extern int sunrpc_cache_register_pipefs(struct dentry *parent, const char *,
                                        umode_t, struct cache_detail *);
 extern void sunrpc_cache_unregister_pipefs(struct cache_detail *);
+extern void sunrpc_cache_unhash(struct cache_detail *, struct cache_head *);
 
 /* Must store cache_detail in seq_file->private if using next three functions */
 extern void *cache_seq_start(struct seq_file *file, loff_t *pos);
index cfda6adcf33cfcf3c28e46066ec294c6d2902389..245fc59b73247d744682c128bfcae1270e146c26 100644 (file)
@@ -109,6 +109,15 @@ struct rpcrdma_msg {
        } rm_body;
 };
 
+/*
+ * XDR sizes, in quads
+ */
+enum {
+       rpcrdma_fixed_maxsz     = 4,
+       rpcrdma_segment_maxsz   = 4,
+       rpcrdma_readchunk_maxsz = 2 + rpcrdma_segment_maxsz,
+};
+
 /*
  * Smallest RPC/RDMA header: rm_xid through rm_type, then rm_nochunks
  */
index 7321ae933867566013a250623564d722d2800305..e770abeed32d7117c4f2d363f9d7370a60d2c55f 100644 (file)
@@ -400,10 +400,14 @@ struct svc_version {
        struct svc_procedure *  vs_proc;        /* per-procedure info */
        u32                     vs_xdrsize;     /* xdrsize needed for this version */
 
-       unsigned int            vs_hidden : 1,  /* Don't register with portmapper.
-                                                * Only used for nfsacl so far. */
-                               vs_rpcb_optnl:1;/* Don't care the result of register.
-                                                * Only used for nfsv4. */
+       /* Don't register with rpcbind */
+       bool                    vs_hidden;
+
+       /* Don't care if the rpcbind registration fails */
+       bool                    vs_rpcb_optnl;
+
+       /* Need xprt with congestion control */
+       bool                    vs_need_cong_ctrl;
 
        /* Override dispatch function (e.g. when caching replies).
         * A return value of 0 means drop the request. 
index 757fb963696c76b3ab24f754ff89424fb428fc79..b105f73e3ca26355b2ee8b32651b48526a945899 100644 (file)
@@ -70,7 +70,7 @@ extern atomic_t rdma_stat_sq_prod;
  * completes.
  */
 struct svc_rdma_op_ctxt {
-       struct list_head free;
+       struct list_head list;
        struct svc_rdma_op_ctxt *read_hdr;
        struct svc_rdma_fastreg_mr *frmr;
        int hdr_count;
@@ -78,7 +78,6 @@ struct svc_rdma_op_ctxt {
        struct ib_cqe cqe;
        struct ib_cqe reg_cqe;
        struct ib_cqe inv_cqe;
-       struct list_head dto_q;
        u32 byte_len;
        u32 position;
        struct svcxprt_rdma *xprt;
@@ -141,7 +140,8 @@ struct svcxprt_rdma {
        atomic_t             sc_sq_avail;       /* SQEs ready to be consumed */
        unsigned int         sc_sq_depth;       /* Depth of SQ */
        unsigned int         sc_rq_depth;       /* Depth of RQ */
-       u32                  sc_max_requests;   /* Forward credits */
+       __be32               sc_fc_credits;     /* Forward credits */
+       u32                  sc_max_requests;   /* Max requests */
        u32                  sc_max_bc_requests;/* Backward credits */
        int                  sc_max_req_size;   /* Size of each RQ WR buf */
 
@@ -171,7 +171,6 @@ struct svcxprt_rdma {
 
        wait_queue_head_t    sc_send_wait;      /* SQ exhaustion waitlist */
        unsigned long        sc_flags;
-       struct list_head     sc_dto_q;          /* DTO tasklet I/O pending Q */
        struct list_head     sc_read_complete_q;
        struct work_struct   sc_work;
 };
@@ -214,11 +213,7 @@ extern void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *, int);
 extern void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *, int);
 extern void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *, int,
                                            __be32, __be64, u32);
-extern void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *,
-                                            struct rpcrdma_msg *,
-                                            struct rpcrdma_msg *,
-                                            enum rpcrdma_proc);
-extern int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *);
+extern unsigned int svc_rdma_xdr_get_reply_hdr_len(__be32 *rdma_resp);
 
 /* svc_rdma_recvfrom.c */
 extern int svc_rdma_recvfrom(struct svc_rqst *);
index 7440290f64acd3694dfc5c17618c55f6253aae01..ddb7f94a9d06ecc48828b7b00230662b85768d64 100644 (file)
@@ -67,6 +67,7 @@ struct svc_xprt {
 #define XPT_CACHE_AUTH 11              /* cache auth info */
 #define XPT_LOCAL      12              /* connection from loopback interface */
 #define XPT_KILL_TEMP   13             /* call xpo_kill_temp_xprt before closing */
+#define XPT_CONG_CTRL  14              /* has congestion control */
 
        struct svc_serv         *xpt_server;    /* service for transport */
        atomic_t                xpt_reserved;   /* space on outq that is rsvd */
index 7550e9176a54ea839ec792d13f223ac0be82f1a1..c111a91adcc05ab5de87f561aef99e50fbe94666 100644 (file)
@@ -3,7 +3,6 @@
 
 #include <linux/types.h>
 #include <linux/compiler.h>
-#include <linux/sysctl.h>
 #include <linux/in.h>
 #include <linux/in6.h>
 
index 3efc0ca18345e397e6c9704f2777d61b192ed455..79da349f10605be2aa890e10b563008082cfd9ff 100644 (file)
@@ -2,6 +2,7 @@
 #define _UAPI_XT_HASHLIMIT_H
 
 #include <linux/types.h>
+#include <linux/limits.h>
 #include <linux/if.h>
 
 /* timings are in milliseconds. */
index 0df7bd5d2fb17cf4b9df3b300451cce13075d194..c3be256107c6421432e8a63b041306e224874c2e 100644 (file)
@@ -32,7 +32,8 @@
 #define NFSEXP_ASYNC           0x0010
 #define NFSEXP_GATHERED_WRITES 0x0020
 #define NFSEXP_NOREADDIRPLUS    0x0040
-/* 80 100 currently unused */
+#define NFSEXP_SECURITY_LABEL  0x0080
+/* 0x100 currently unused */
 #define NFSEXP_NOHIDE          0x0200
 #define NFSEXP_NOSUBTREECHECK  0x0400
 #define        NFSEXP_NOAUTHNLM        0x0800          /* Don't authenticate NLM requests - just trust */
@@ -53,7 +54,7 @@
 #define NFSEXP_PNFS            0x20000
 
 /* All flags that we claim to support.  (Note we don't support NOACL.) */
-#define NFSEXP_ALLFLAGS                0x3FE7F
+#define NFSEXP_ALLFLAGS                0x3FEFF
 
 /* The flags that may vary depending on security flavor: */
 #define NFSEXP_SECINFO_FLAGS   (NFSEXP_READONLY | NFSEXP_ROOTSQUASH \
index 5b4e0b98f4eb1d5f4abae3afeb14318494767953..1031bdf9f0125110088f0880699349c485d9bfcf 100644 (file)
@@ -455,7 +455,7 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
                                void __user *buffer, size_t *lenp,
                                loff_t *ppos)
 {
-       int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+       int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 
        if (ret || !write)
                return ret;
@@ -3522,6 +3522,8 @@ static void perf_event_enable_on_exec(int ctxn)
        if (enabled) {
                clone_ctx = unclone_ctx(ctx);
                ctx_resched(cpuctx, ctx, event_type);
+       } else {
+               ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
        }
        perf_ctx_unlock(cpuctx, ctx);
 
@@ -9955,6 +9957,7 @@ SYSCALL_DEFINE5(perf_event_open,
                 * of swizzling perf_event::ctx.
                 */
                perf_remove_from_context(group_leader, 0);
+               put_ctx(gctx);
 
                list_for_each_entry(sibling, &group_leader->sibling_list,
                                    group_entry) {
@@ -9993,13 +9996,6 @@ SYSCALL_DEFINE5(perf_event_open,
                perf_event__state_init(group_leader);
                perf_install_in_context(ctx, group_leader, group_leader->cpu);
                get_ctx(ctx);
-
-               /*
-                * Now that all events are installed in @ctx, nothing
-                * references @gctx anymore, so drop the last reference we have
-                * on it.
-                */
-               put_ctx(gctx);
        }
 
        /*
index 6ea1925ac5c05009bb6764b90e1751ae8bb747ae..bbfb917a9b4998f8254b4cae92f2dce129761bbb 100644 (file)
@@ -1090,6 +1090,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
        int ret = 0;
 
        rq = task_rq_lock(p, &rf);
+       update_rq_clock(rq);
 
        if (p->flags & PF_KTHREAD) {
                /*
@@ -5560,7 +5561,7 @@ static void migrate_tasks(struct rq *dead_rq)
 {
        struct rq *rq = dead_rq;
        struct task_struct *next, *stop = rq->stop;
-       struct rq_flags rf, old_rf;
+       struct rq_flags rf;
        int dest_cpu;
 
        /*
@@ -5579,7 +5580,9 @@ static void migrate_tasks(struct rq *dead_rq)
         * class method both need to have an up-to-date
         * value of rq->clock[_task]
         */
+       rq_pin_lock(rq, &rf);
        update_rq_clock(rq);
+       rq_unpin_lock(rq, &rf);
 
        for (;;) {
                /*
@@ -5592,7 +5595,7 @@ static void migrate_tasks(struct rq *dead_rq)
                /*
                 * pick_next_task() assumes pinned rq->lock:
                 */
-               rq_pin_lock(rq, &rf);
+               rq_repin_lock(rq, &rf);
                next = pick_next_task(rq, &fake_task, &rf);
                BUG_ON(!next);
                next->sched_class->put_prev_task(rq, next);
@@ -5621,13 +5624,6 @@ static void migrate_tasks(struct rq *dead_rq)
                        continue;
                }
 
-               /*
-                * __migrate_task() may return with a different
-                * rq->lock held and a new cookie in 'rf', but we need
-                * to preserve rf::clock_update_flags for 'dead_rq'.
-                */
-               old_rf = rf;
-
                /* Find suitable destination for @next, with force if needed. */
                dest_cpu = select_fallback_rq(dead_rq->cpu, next);
 
@@ -5636,7 +5632,6 @@ static void migrate_tasks(struct rq *dead_rq)
                        raw_spin_unlock(&rq->lock);
                        rq = dead_rq;
                        raw_spin_lock(&rq->lock);
-                       rf = old_rf;
                }
                raw_spin_unlock(&next->pi_lock);
        }
@@ -6819,11 +6814,20 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
        if (IS_ERR(tg))
                return ERR_PTR(-ENOMEM);
 
-       sched_online_group(tg, parent);
-
        return &tg->css;
 }
 
+/* Expose task group only after completing cgroup initialization */
+static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
+{
+       struct task_group *tg = css_tg(css);
+       struct task_group *parent = css_tg(css->parent);
+
+       if (parent)
+               sched_online_group(tg, parent);
+       return 0;
+}
+
 static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
 {
        struct task_group *tg = css_tg(css);
@@ -7229,6 +7233,7 @@ static struct cftype cpu_files[] = {
 
 struct cgroup_subsys cpu_cgrp_subsys = {
        .css_alloc      = cpu_cgroup_css_alloc,
+       .css_online     = cpu_cgroup_css_online,
        .css_released   = cpu_cgroup_css_released,
        .css_free       = cpu_cgroup_css_free,
        .fork           = cpu_cgroup_fork,
index 8f69579dfac3e78a57246b6b4447d29e68a2ba02..0c8b78a9ae2ef97a1e83753146959b2d7b39d16d 100644 (file)
@@ -559,7 +559,7 @@ config SBITMAP
        bool
 
 config PARMAN
-       tristate
+       tristate "parman" if COMPILE_TEST
 
 config PRIME_NUMBERS
        tristate
index 55735c9bdb75a23ef92b3a531a6cbffc57ff6b66..97d62c2da6c25dd5721f8c1c75264c83201f7247 100644 (file)
@@ -729,19 +729,6 @@ source "lib/Kconfig.kmemcheck"
 
 source "lib/Kconfig.kasan"
 
-config DEBUG_REFCOUNT
-       bool "Verbose refcount checks"
-       help
-         Say Y here if you want reference counters (refcount_t and kref) to
-         generate WARNs on dubious usage. Without this refcount_t will still
-         be a saturating counter and avoid Use-After-Free by turning it into
-         a resource leak Denial-Of-Service.
-
-         Use of this option will increase kernel text size but will alert the
-         admin of potential abuse.
-
-         If in doubt, say "N".
-
 endmenu # "Memory Debugging"
 
 config ARCH_HAS_KCOV
index c9023efbd4ca2b111a3b76e5c85b0865a64beeb4..469b2392893ab080b8529e2ef5a7d2993035ca90 100644 (file)
@@ -38,7 +38,7 @@ obj-y += bcd.o div64.o sort.o parser.o debug_locks.o random32.o \
         gcd.o lcm.o list_sort.o uuid.o flex_array.o iov_iter.o clz_ctz.o \
         bsearch.o find_bit.o llist.o memweight.o kfifo.o \
         percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o \
-        once.o
+        once.o refcount.o
 obj-y += string_helpers.o
 obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
 obj-y += hexdump.o
diff --git a/lib/refcount.c b/lib/refcount.c
new file mode 100644 (file)
index 0000000..1d33366
--- /dev/null
@@ -0,0 +1,267 @@
+/*
+ * Variant of atomic_t specialized for reference counts.
+ *
+ * The interface matches the atomic_t interface (to aid in porting) but only
+ * provides the few functions one should use for reference counting.
+ *
+ * It differs in that the counter saturates at UINT_MAX and will not move once
+ * there. This avoids wrapping the counter and causing 'spurious'
+ * use-after-free issues.
+ *
+ * Memory ordering rules are slightly relaxed wrt regular atomic_t functions
+ * and provide only what is strictly required for refcounts.
+ *
+ * The increments are fully relaxed; these will not provide ordering. The
+ * rationale is that whatever is used to obtain the object we're increasing the
+ * reference count on will provide the ordering. For locked data structures,
+ * its the lock acquire, for RCU/lockless data structures its the dependent
+ * load.
+ *
+ * Do note that inc_not_zero() provides a control dependency which will order
+ * future stores against the inc, this ensures we'll never modify the object
+ * if we did not in fact acquire a reference.
+ *
+ * The decrements will provide release order, such that all the prior loads and
+ * stores will be issued before, it also provides a control dependency, which
+ * will order us against the subsequent free().
+ *
+ * The control dependency is against the load of the cmpxchg (ll/sc) that
+ * succeeded. This means the stores aren't fully ordered, but this is fine
+ * because the 1->0 transition indicates no concurrency.
+ *
+ * Note that the allocator is responsible for ordering things between free()
+ * and alloc().
+ *
+ */
+
+#include <linux/refcount.h>
+#include <linux/bug.h>
+
+bool refcount_add_not_zero(unsigned int i, refcount_t *r)
+{
+       unsigned int old, new, val = atomic_read(&r->refs);
+
+       for (;;) {
+               if (!val)
+                       return false;
+
+               if (unlikely(val == UINT_MAX))
+                       return true;
+
+               new = val + i;
+               if (new < val)
+                       new = UINT_MAX;
+               old = atomic_cmpxchg_relaxed(&r->refs, val, new);
+               if (old == val)
+                       break;
+
+               val = old;
+       }
+
+       WARN(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n");
+
+       return true;
+}
+EXPORT_SYMBOL_GPL(refcount_add_not_zero);
+
+void refcount_add(unsigned int i, refcount_t *r)
+{
+       WARN(!refcount_add_not_zero(i, r), "refcount_t: addition on 0; use-after-free.\n");
+}
+EXPORT_SYMBOL_GPL(refcount_add);
+
+/*
+ * Similar to atomic_inc_not_zero(), will saturate at UINT_MAX and WARN.
+ *
+ * Provides no memory ordering, it is assumed the caller has guaranteed the
+ * object memory to be stable (RCU, etc.). It does provide a control dependency
+ * and thereby orders future stores. See the comment on top.
+ */
+bool refcount_inc_not_zero(refcount_t *r)
+{
+       unsigned int old, new, val = atomic_read(&r->refs);
+
+       for (;;) {
+               new = val + 1;
+
+               if (!val)
+                       return false;
+
+               if (unlikely(!new))
+                       return true;
+
+               old = atomic_cmpxchg_relaxed(&r->refs, val, new);
+               if (old == val)
+                       break;
+
+               val = old;
+       }
+
+       WARN(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n");
+
+       return true;
+}
+EXPORT_SYMBOL_GPL(refcount_inc_not_zero);
+
+/*
+ * Similar to atomic_inc(), will saturate at UINT_MAX and WARN.
+ *
+ * Provides no memory ordering, it is assumed the caller already has a
+ * reference on the object, will WARN when this is not so.
+ */
+void refcount_inc(refcount_t *r)
+{
+       WARN(!refcount_inc_not_zero(r), "refcount_t: increment on 0; use-after-free.\n");
+}
+EXPORT_SYMBOL_GPL(refcount_inc);
+
+bool refcount_sub_and_test(unsigned int i, refcount_t *r)
+{
+       unsigned int old, new, val = atomic_read(&r->refs);
+
+       for (;;) {
+               if (unlikely(val == UINT_MAX))
+                       return false;
+
+               new = val - i;
+               if (new > val) {
+                       WARN(new > val, "refcount_t: underflow; use-after-free.\n");
+                       return false;
+               }
+
+               old = atomic_cmpxchg_release(&r->refs, val, new);
+               if (old == val)
+                       break;
+
+               val = old;
+       }
+
+       return !new;
+}
+EXPORT_SYMBOL_GPL(refcount_sub_and_test);
+
+/*
+ * Similar to atomic_dec_and_test(), it will WARN on underflow and fail to
+ * decrement when saturated at UINT_MAX.
+ *
+ * Provides release memory ordering, such that prior loads and stores are done
+ * before, and provides a control dependency such that free() must come after.
+ * See the comment on top.
+ */
+bool refcount_dec_and_test(refcount_t *r)
+{
+       return refcount_sub_and_test(1, r);
+}
+EXPORT_SYMBOL_GPL(refcount_dec_and_test);
+
+/*
+ * Similar to atomic_dec(), it will WARN on underflow and fail to decrement
+ * when saturated at UINT_MAX.
+ *
+ * Provides release memory ordering, such that prior loads and stores are done
+ * before.
+ */
+
+void refcount_dec(refcount_t *r)
+{
+       WARN(refcount_dec_and_test(r), "refcount_t: decrement hit 0; leaking memory.\n");
+}
+EXPORT_SYMBOL_GPL(refcount_dec);
+
+/*
+ * No atomic_t counterpart, it attempts a 1 -> 0 transition and returns the
+ * success thereof.
+ *
+ * Like all decrement operations, it provides release memory order and provides
+ * a control dependency.
+ *
+ * It can be used like a try-delete operator; this explicit case is provided
+ * and not cmpxchg in generic, because that would allow implementing unsafe
+ * operations.
+ */
+bool refcount_dec_if_one(refcount_t *r)
+{
+       return atomic_cmpxchg_release(&r->refs, 1, 0) == 1;
+}
+EXPORT_SYMBOL_GPL(refcount_dec_if_one);
+
+/*
+ * No atomic_t counterpart, it decrements unless the value is 1, in which case
+ * it will return false.
+ *
+ * Was often done like: atomic_add_unless(&var, -1, 1)
+ */
+bool refcount_dec_not_one(refcount_t *r)
+{
+       unsigned int old, new, val = atomic_read(&r->refs);
+
+       for (;;) {
+               if (unlikely(val == UINT_MAX))
+                       return true;
+
+               if (val == 1)
+                       return false;
+
+               new = val - 1;
+               if (new > val) {
+                       WARN(new > val, "refcount_t: underflow; use-after-free.\n");
+                       return true;
+               }
+
+               old = atomic_cmpxchg_release(&r->refs, val, new);
+               if (old == val)
+                       break;
+
+               val = old;
+       }
+
+       return true;
+}
+EXPORT_SYMBOL_GPL(refcount_dec_not_one);
+
+/*
+ * Similar to atomic_dec_and_mutex_lock(), it will WARN on underflow and fail
+ * to decrement when saturated at UINT_MAX.
+ *
+ * Provides release memory ordering, such that prior loads and stores are done
+ * before, and provides a control dependency such that free() must come after.
+ * See the comment on top.
+ */
+bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock)
+{
+       if (refcount_dec_not_one(r))
+               return false;
+
+       mutex_lock(lock);
+       if (!refcount_dec_and_test(r)) {
+               mutex_unlock(lock);
+               return false;
+       }
+
+       return true;
+}
+EXPORT_SYMBOL_GPL(refcount_dec_and_mutex_lock);
+
+/*
+ * Similar to atomic_dec_and_lock(), it will WARN on underflow and fail to
+ * decrement when saturated at UINT_MAX.
+ *
+ * Provides release memory ordering, such that prior loads and stores are done
+ * before, and provides a control dependency such that free() must come after.
+ * See the comment on top.
+ */
+bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock)
+{
+       if (refcount_dec_not_one(r))
+               return false;
+
+       spin_lock(lock);
+       if (!refcount_dec_and_test(r)) {
+               spin_unlock(lock);
+               return false;
+       }
+
+       return true;
+}
+EXPORT_SYMBOL_GPL(refcount_dec_and_lock);
+
index 172454e6b979e935ef0d184953b6c35b06c32592..c5b9b9351cec8acdf5fcdafaf88fc58562896391 100644 (file)
@@ -146,9 +146,7 @@ static void bucket_table_free(const struct bucket_table *tbl)
        if (tbl->nest)
                nested_bucket_table_free(tbl);
 
-       if (tbl)
-               kvfree(tbl->locks);
-
+       kvfree(tbl->locks);
        kvfree(tbl);
 }
 
@@ -1123,12 +1121,13 @@ struct rhash_head __rcu **rht_bucket_nested(const struct bucket_table *tbl,
        union nested_table *ntbl;
 
        ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]);
-       ntbl = rht_dereference_bucket(ntbl[index].table, tbl, hash);
+       ntbl = rht_dereference_bucket_rcu(ntbl[index].table, tbl, hash);
        subhash >>= tbl->nest;
 
        while (ntbl && size > (1 << shift)) {
                index = subhash & ((1 << shift) - 1);
-               ntbl = rht_dereference_bucket(ntbl[index].table, tbl, hash);
+               ntbl = rht_dereference_bucket_rcu(ntbl[index].table,
+                                                 tbl, hash);
                size >>= shift;
                subhash >>= shift;
        }
index fe9f3a785804d9a8902cb8d347aa6672fc4d022a..35e32243693c985b8504b1e529a72535cefbb0fa 100644 (file)
@@ -334,7 +334,7 @@ static int test_parman_check_array(struct test_parman *test_parman,
                last_priority = item->prio->priority;
 
                if (item->parman_item.index != i) {
-                       pr_err("Item has different index in compare to where it actualy is (%lu != %d)\n",
+                       pr_err("Item has different index in compare to where it actually is (%lu != %d)\n",
                               item->parman_item.index, i);
                        return -EINVAL;
                }
index 50f040fdb2a97f4278fce4130a0cf4bbe3c87052..b9233b9903990bd38721213ce287a8045debd8c7 100644 (file)
@@ -69,8 +69,8 @@ int ceph_cls_lock(struct ceph_osd_client *osdc,
        dout("%s lock_name %s type %d cookie %s tag %s desc %s flags 0x%x\n",
             __func__, lock_name, type, cookie, tag, desc, flags);
        ret = ceph_osdc_call(osdc, oid, oloc, "lock", "lock",
-                            CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
-                            lock_op_page, lock_op_buf_size, NULL, NULL);
+                            CEPH_OSD_FLAG_WRITE, lock_op_page,
+                            lock_op_buf_size, NULL, NULL);
 
        dout("%s: status %d\n", __func__, ret);
        __free_page(lock_op_page);
@@ -117,8 +117,8 @@ int ceph_cls_unlock(struct ceph_osd_client *osdc,
 
        dout("%s lock_name %s cookie %s\n", __func__, lock_name, cookie);
        ret = ceph_osdc_call(osdc, oid, oloc, "lock", "unlock",
-                            CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
-                            unlock_op_page, unlock_op_buf_size, NULL, NULL);
+                            CEPH_OSD_FLAG_WRITE, unlock_op_page,
+                            unlock_op_buf_size, NULL, NULL);
 
        dout("%s: status %d\n", __func__, ret);
        __free_page(unlock_op_page);
@@ -170,8 +170,8 @@ int ceph_cls_break_lock(struct ceph_osd_client *osdc,
        dout("%s lock_name %s cookie %s locker %s%llu\n", __func__, lock_name,
             cookie, ENTITY_NAME(*locker));
        ret = ceph_osdc_call(osdc, oid, oloc, "lock", "break_lock",
-                            CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
-                            break_op_page, break_op_buf_size, NULL, NULL);
+                            CEPH_OSD_FLAG_WRITE, break_op_page,
+                            break_op_buf_size, NULL, NULL);
 
        dout("%s: status %d\n", __func__, ret);
        __free_page(break_op_page);
@@ -278,7 +278,7 @@ int ceph_cls_lock_info(struct ceph_osd_client *osdc,
        int get_info_op_buf_size;
        int name_len = strlen(lock_name);
        struct page *get_info_op_page, *reply_page;
-       size_t reply_len;
+       size_t reply_len = PAGE_SIZE;
        void *p, *end;
        int ret;
 
index 80d7c3a97cb84355e82e9d8f4c83fbf5b0d82893..5bf94c04f64547e2cfff79c0655bcc68944f4e12 100644 (file)
@@ -45,7 +45,6 @@ int crush_get_bucket_item_weight(const struct crush_bucket *b, int p)
 
 void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
 {
-       kfree(b->h.perm);
        kfree(b->h.items);
        kfree(b);
 }
@@ -54,14 +53,12 @@ void crush_destroy_bucket_list(struct crush_bucket_list *b)
 {
        kfree(b->item_weights);
        kfree(b->sum_weights);
-       kfree(b->h.perm);
        kfree(b->h.items);
        kfree(b);
 }
 
 void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
 {
-       kfree(b->h.perm);
        kfree(b->h.items);
        kfree(b->node_weights);
        kfree(b);
@@ -71,7 +68,6 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
 {
        kfree(b->straws);
        kfree(b->item_weights);
-       kfree(b->h.perm);
        kfree(b->h.items);
        kfree(b);
 }
@@ -79,7 +75,6 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
 void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b)
 {
        kfree(b->item_weights);
-       kfree(b->h.perm);
        kfree(b->h.items);
        kfree(b);
 }
index 130ab407c5ecf8ca5c0943759efff91c7bf258e8..b5cd8c21bfdfbf4d85bd93993f1f02807c5200be 100644 (file)
@@ -54,7 +54,6 @@ int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size
        return -1;
 }
 
-
 /*
  * bucket choose methods
  *
@@ -72,59 +71,60 @@ int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size
  * Since this is expensive, we optimize for the r=0 case, which
  * captures the vast majority of calls.
  */
-static int bucket_perm_choose(struct crush_bucket *bucket,
+static int bucket_perm_choose(const struct crush_bucket *bucket,
+                             struct crush_work_bucket *work,
                              int x, int r)
 {
        unsigned int pr = r % bucket->size;
        unsigned int i, s;
 
        /* start a new permutation if @x has changed */
-       if (bucket->perm_x != (__u32)x || bucket->perm_n == 0) {
+       if (work->perm_x != (__u32)x || work->perm_n == 0) {
                dprintk("bucket %d new x=%d\n", bucket->id, x);
-               bucket->perm_x = x;
+               work->perm_x = x;
 
                /* optimize common r=0 case */
                if (pr == 0) {
                        s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
                                bucket->size;
-                       bucket->perm[0] = s;
-                       bucket->perm_n = 0xffff;   /* magic value, see below */
+                       work->perm[0] = s;
+                       work->perm_n = 0xffff;   /* magic value, see below */
                        goto out;
                }
 
                for (i = 0; i < bucket->size; i++)
-                       bucket->perm[i] = i;
-               bucket->perm_n = 0;
-       } else if (bucket->perm_n == 0xffff) {
+                       work->perm[i] = i;
+               work->perm_n = 0;
+       } else if (work->perm_n == 0xffff) {
                /* clean up after the r=0 case above */
                for (i = 1; i < bucket->size; i++)
-                       bucket->perm[i] = i;
-               bucket->perm[bucket->perm[0]] = 0;
-               bucket->perm_n = 1;
+                       work->perm[i] = i;
+               work->perm[work->perm[0]] = 0;
+               work->perm_n = 1;
        }
 
        /* calculate permutation up to pr */
-       for (i = 0; i < bucket->perm_n; i++)
-               dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
-       while (bucket->perm_n <= pr) {
-               unsigned int p = bucket->perm_n;
+       for (i = 0; i < work->perm_n; i++)
+               dprintk(" perm_choose have %d: %d\n", i, work->perm[i]);
+       while (work->perm_n <= pr) {
+               unsigned int p = work->perm_n;
                /* no point in swapping the final entry */
                if (p < bucket->size - 1) {
                        i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
                                (bucket->size - p);
                        if (i) {
-                               unsigned int t = bucket->perm[p + i];
-                               bucket->perm[p + i] = bucket->perm[p];
-                               bucket->perm[p] = t;
+                               unsigned int t = work->perm[p + i];
+                               work->perm[p + i] = work->perm[p];
+                               work->perm[p] = t;
                        }
                        dprintk(" perm_choose swap %d with %d\n", p, p+i);
                }
-               bucket->perm_n++;
+               work->perm_n++;
        }
        for (i = 0; i < bucket->size; i++)
-               dprintk(" perm_choose  %d: %d\n", i, bucket->perm[i]);
+               dprintk(" perm_choose  %d: %d\n", i, work->perm[i]);
 
-       s = bucket->perm[pr];
+       s = work->perm[pr];
 out:
        dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
                bucket->size, x, r, pr, s);
@@ -132,14 +132,14 @@ out:
 }
 
 /* uniform */
-static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
-                                int x, int r)
+static int bucket_uniform_choose(const struct crush_bucket_uniform *bucket,
+                                struct crush_work_bucket *work, int x, int r)
 {
-       return bucket_perm_choose(&bucket->h, x, r);
+       return bucket_perm_choose(&bucket->h, work, x, r);
 }
 
 /* list */
-static int bucket_list_choose(struct crush_bucket_list *bucket,
+static int bucket_list_choose(const struct crush_bucket_list *bucket,
                              int x, int r)
 {
        int i;
@@ -155,8 +155,9 @@ static int bucket_list_choose(struct crush_bucket_list *bucket,
                w *= bucket->sum_weights[i];
                w = w >> 16;
                /*dprintk(" scaled %llx\n", w);*/
-               if (w < bucket->item_weights[i])
+               if (w < bucket->item_weights[i]) {
                        return bucket->h.items[i];
+               }
        }
 
        dprintk("bad list sums for bucket %d\n", bucket->h.id);
@@ -192,7 +193,7 @@ static int terminal(int x)
        return x & 1;
 }
 
-static int bucket_tree_choose(struct crush_bucket_tree *bucket,
+static int bucket_tree_choose(const struct crush_bucket_tree *bucket,
                              int x, int r)
 {
        int n;
@@ -224,7 +225,7 @@ static int bucket_tree_choose(struct crush_bucket_tree *bucket,
 
 /* straw */
 
-static int bucket_straw_choose(struct crush_bucket_straw *bucket,
+static int bucket_straw_choose(const struct crush_bucket_straw *bucket,
                               int x, int r)
 {
        __u32 i;
@@ -301,7 +302,7 @@ static __u64 crush_ln(unsigned int xin)
  *
  */
 
-static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket,
+static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket,
                                int x, int r)
 {
        unsigned int i, high = 0;
@@ -344,37 +345,42 @@ static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket,
                        high_draw = draw;
                }
        }
+
        return bucket->h.items[high];
 }
 
 
-static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
+static int crush_bucket_choose(const struct crush_bucket *in,
+                              struct crush_work_bucket *work,
+                              int x, int r)
 {
        dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
        BUG_ON(in->size == 0);
        switch (in->alg) {
        case CRUSH_BUCKET_UNIFORM:
-               return bucket_uniform_choose((struct crush_bucket_uniform *)in,
-                                         x, r);
+               return bucket_uniform_choose(
+                       (const struct crush_bucket_uniform *)in,
+                       work, x, r);
        case CRUSH_BUCKET_LIST:
-               return bucket_list_choose((struct crush_bucket_list *)in,
+               return bucket_list_choose((const struct crush_bucket_list *)in,
                                          x, r);
        case CRUSH_BUCKET_TREE:
-               return bucket_tree_choose((struct crush_bucket_tree *)in,
+               return bucket_tree_choose((const struct crush_bucket_tree *)in,
                                          x, r);
        case CRUSH_BUCKET_STRAW:
-               return bucket_straw_choose((struct crush_bucket_straw *)in,
-                                          x, r);
+               return bucket_straw_choose(
+                       (const struct crush_bucket_straw *)in,
+                       x, r);
        case CRUSH_BUCKET_STRAW2:
-               return bucket_straw2_choose((struct crush_bucket_straw2 *)in,
-                                           x, r);
+               return bucket_straw2_choose(
+                       (const struct crush_bucket_straw2 *)in,
+                       x, r);
        default:
                dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
                return in->items[0];
        }
 }
 
-
 /*
  * true if device is marked "out" (failed, fully offloaded)
  * of the cluster
@@ -416,7 +422,8 @@ static int is_out(const struct crush_map *map,
  * @parent_r: r value passed from the parent
  */
 static int crush_choose_firstn(const struct crush_map *map,
-                              struct crush_bucket *bucket,
+                              struct crush_work *work,
+                              const struct crush_bucket *bucket,
                               const __u32 *weight, int weight_max,
                               int x, int numrep, int type,
                               int *out, int outpos,
@@ -434,7 +441,7 @@ static int crush_choose_firstn(const struct crush_map *map,
        int rep;
        unsigned int ftotal, flocal;
        int retry_descent, retry_bucket, skip_rep;
-       struct crush_bucket *in = bucket;
+       const struct crush_bucket *in = bucket;
        int r;
        int i;
        int item = 0;
@@ -473,9 +480,13 @@ static int crush_choose_firstn(const struct crush_map *map,
                                if (local_fallback_retries > 0 &&
                                    flocal >= (in->size>>1) &&
                                    flocal > local_fallback_retries)
-                                       item = bucket_perm_choose(in, x, r);
+                                       item = bucket_perm_choose(
+                                               in, work->work[-1-in->id],
+                                               x, r);
                                else
-                                       item = crush_bucket_choose(in, x, r);
+                                       item = crush_bucket_choose(
+                                               in, work->work[-1-in->id],
+                                               x, r);
                                if (item >= map->max_devices) {
                                        dprintk("   bad item %d\n", item);
                                        skip_rep = 1;
@@ -518,19 +529,21 @@ static int crush_choose_firstn(const struct crush_map *map,
                                                        sub_r = r >> (vary_r-1);
                                                else
                                                        sub_r = 0;
-                                               if (crush_choose_firstn(map,
-                                                        map->buckets[-1-item],
-                                                        weight, weight_max,
-                                                        x, stable ? 1 : outpos+1, 0,
-                                                        out2, outpos, count,
-                                                        recurse_tries, 0,
-                                                        local_retries,
-                                                        local_fallback_retries,
-                                                        0,
-                                                        vary_r,
-                                                        stable,
-                                                        NULL,
-                                                        sub_r) <= outpos)
+                                               if (crush_choose_firstn(
+                                                           map,
+                                                           work,
+                                                           map->buckets[-1-item],
+                                                           weight, weight_max,
+                                                           x, stable ? 1 : outpos+1, 0,
+                                                           out2, outpos, count,
+                                                           recurse_tries, 0,
+                                                           local_retries,
+                                                           local_fallback_retries,
+                                                           0,
+                                                           vary_r,
+                                                           stable,
+                                                           NULL,
+                                                           sub_r) <= outpos)
                                                        /* didn't get leaf */
                                                        reject = 1;
                                        } else {
@@ -539,14 +552,12 @@ static int crush_choose_firstn(const struct crush_map *map,
                                        }
                                }
 
-                               if (!reject) {
+                               if (!reject && !collide) {
                                        /* out? */
                                        if (itemtype == 0)
                                                reject = is_out(map, weight,
                                                                weight_max,
                                                                item, x);
-                                       else
-                                               reject = 0;
                                }
 
 reject:
@@ -600,7 +611,8 @@ reject:
  *
  */
 static void crush_choose_indep(const struct crush_map *map,
-                              struct crush_bucket *bucket,
+                              struct crush_work *work,
+                              const struct crush_bucket *bucket,
                               const __u32 *weight, int weight_max,
                               int x, int left, int numrep, int type,
                               int *out, int outpos,
@@ -610,7 +622,7 @@ static void crush_choose_indep(const struct crush_map *map,
                               int *out2,
                               int parent_r)
 {
-       struct crush_bucket *in = bucket;
+       const struct crush_bucket *in = bucket;
        int endpos = outpos + left;
        int rep;
        unsigned int ftotal;
@@ -678,7 +690,9 @@ static void crush_choose_indep(const struct crush_map *map,
                                        break;
                                }
 
-                               item = crush_bucket_choose(in, x, r);
+                               item = crush_bucket_choose(
+                                       in, work->work[-1-in->id],
+                                       x, r);
                                if (item >= map->max_devices) {
                                        dprintk("   bad item %d\n", item);
                                        out[rep] = CRUSH_ITEM_NONE;
@@ -724,13 +738,15 @@ static void crush_choose_indep(const struct crush_map *map,
 
                                if (recurse_to_leaf) {
                                        if (item < 0) {
-                                               crush_choose_indep(map,
-                                                  map->buckets[-1-item],
-                                                  weight, weight_max,
-                                                  x, 1, numrep, 0,
-                                                  out2, rep,
-                                                  recurse_tries, 0,
-                                                  0, NULL, r);
+                                               crush_choose_indep(
+                                                       map,
+                                                       work,
+                                                       map->buckets[-1-item],
+                                                       weight, weight_max,
+                                                       x, 1, numrep, 0,
+                                                       out2, rep,
+                                                       recurse_tries, 0,
+                                                       0, NULL, r);
                                                if (out2[rep] == CRUSH_ITEM_NONE) {
                                                        /* placed nothing; no leaf */
                                                        break;
@@ -781,6 +797,53 @@ static void crush_choose_indep(const struct crush_map *map,
 #endif
 }
 
+
+/*
+ * This takes a chunk of memory and sets it up to be a shiny new
+ * working area for a CRUSH placement computation. It must be called
+ * on any newly allocated memory before passing it in to
+ * crush_do_rule. It may be used repeatedly after that, so long as the
+ * map has not changed. If the map /has/ changed, you must make sure
+ * the working size is no smaller than what was allocated and re-run
+ * crush_init_workspace.
+ *
+ * If you do retain the working space between calls to crush, make it
+ * thread-local.
+ */
+void crush_init_workspace(const struct crush_map *map, void *v)
+{
+       struct crush_work *w = v;
+       __s32 b;
+
+       /*
+        * We work by moving through the available space and setting
+        * values and pointers as we go.
+        *
+        * It's a bit like Forth's use of the 'allot' word since we
+        * set the pointer first and then reserve the space for it to
+        * point to by incrementing the point.
+        */
+       v += sizeof(struct crush_work *);
+       w->work = v;
+       v += map->max_buckets * sizeof(struct crush_work_bucket *);
+       for (b = 0; b < map->max_buckets; ++b) {
+               if (!map->buckets[b])
+                       continue;
+
+               w->work[b] = v;
+               switch (map->buckets[b]->alg) {
+               default:
+                       v += sizeof(struct crush_work_bucket);
+                       break;
+               }
+               w->work[b]->perm_x = 0;
+               w->work[b]->perm_n = 0;
+               w->work[b]->perm = v;
+               v += map->buckets[b]->size * sizeof(__u32);
+       }
+       BUG_ON(v - (void *)w != map->working_size);
+}
+
 /**
  * crush_do_rule - calculate a mapping with the given input and rule
  * @map: the crush_map
@@ -790,24 +853,25 @@ static void crush_choose_indep(const struct crush_map *map,
  * @result_max: maximum result size
  * @weight: weight vector (for map leaves)
  * @weight_max: size of weight vector
- * @scratch: scratch vector for private use; must be >= 3 * result_max
+ * @cwin: pointer to at least crush_work_size() bytes of memory
  */
 int crush_do_rule(const struct crush_map *map,
                  int ruleno, int x, int *result, int result_max,
                  const __u32 *weight, int weight_max,
-                 int *scratch)
+                 void *cwin)
 {
        int result_len;
-       int *a = scratch;
-       int *b = scratch + result_max;
-       int *c = scratch + result_max*2;
+       struct crush_work *cw = cwin;
+       int *a = cwin + map->working_size;
+       int *b = a + result_max;
+       int *c = b + result_max;
+       int *w = a;
+       int *o = b;
        int recurse_to_leaf;
-       int *w;
        int wsize = 0;
-       int *o;
        int osize;
        int *tmp;
-       struct crush_rule *rule;
+       const struct crush_rule *rule;
        __u32 step;
        int i, j;
        int numrep;
@@ -835,12 +899,10 @@ int crush_do_rule(const struct crush_map *map,
 
        rule = map->rules[ruleno];
        result_len = 0;
-       w = a;
-       o = b;
 
        for (step = 0; step < rule->len; step++) {
                int firstn = 0;
-               struct crush_rule_step *curstep = &rule->steps[step];
+               const struct crush_rule_step *curstep = &rule->steps[step];
 
                switch (curstep->op) {
                case CRUSH_RULE_TAKE:
@@ -936,6 +998,7 @@ int crush_do_rule(const struct crush_map *map,
                                                recurse_tries = choose_tries;
                                        osize += crush_choose_firstn(
                                                map,
+                                               cw,
                                                map->buckets[bno],
                                                weight, weight_max,
                                                x, numrep,
@@ -956,6 +1019,7 @@ int crush_do_rule(const struct crush_map *map,
                                                    numrep : (result_max-osize));
                                        crush_choose_indep(
                                                map,
+                                               cw,
                                                map->buckets[bno],
                                                weight, weight_max,
                                                x, out_size, numrep,
@@ -997,5 +1061,6 @@ int crush_do_rule(const struct crush_map *map,
                        break;
                }
        }
+
        return result_len;
 }
index 292e33bd916e650c0317ab630a0c60a400d21c7d..85747b7f91a91894d4902636d5145dc957184df3 100644 (file)
@@ -3,6 +3,7 @@
 
 #include <linux/err.h>
 #include <linux/scatterlist.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <crypto/aes.h>
 #include <crypto/skcipher.h>
index f3378ba1a82893024b9012c5421099bce87f1824..b65bbf9f45ebb22c8ac51af34c6b1c29ef7ed17c 100644 (file)
@@ -460,7 +460,6 @@ static void request_init(struct ceph_osd_request *req)
 
        kref_init(&req->r_kref);
        init_completion(&req->r_completion);
-       init_completion(&req->r_done_completion);
        RB_CLEAR_NODE(&req->r_node);
        RB_CLEAR_NODE(&req->r_mc_node);
        INIT_LIST_HEAD(&req->r_unsafe_item);
@@ -672,7 +671,8 @@ void osd_req_op_extent_update(struct ceph_osd_request *osd_req,
        BUG_ON(length > previous);
 
        op->extent.length = length;
-       op->indata_len -= previous - length;
+       if (op->op == CEPH_OSD_OP_WRITE || op->op == CEPH_OSD_OP_WRITEFULL)
+               op->indata_len -= previous - length;
 }
 EXPORT_SYMBOL(osd_req_op_extent_update);
 
@@ -1636,7 +1636,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
        bool need_send = false;
        bool promoted = false;
 
-       WARN_ON(req->r_tid || req->r_got_reply);
+       WARN_ON(req->r_tid);
        dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
 
 again:
@@ -1704,17 +1704,10 @@ promote:
 
 static void account_request(struct ceph_osd_request *req)
 {
-       unsigned int mask = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
+       WARN_ON(req->r_flags & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK));
+       WARN_ON(!(req->r_flags & (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE)));
 
-       if (req->r_flags & CEPH_OSD_FLAG_READ) {
-               WARN_ON(req->r_flags & mask);
-               req->r_flags |= CEPH_OSD_FLAG_ACK;
-       } else if (req->r_flags & CEPH_OSD_FLAG_WRITE)
-               WARN_ON(!(req->r_flags & mask));
-       else
-               WARN_ON(1);
-
-       WARN_ON(req->r_unsafe_callback && (req->r_flags & mask) != mask);
+       req->r_flags |= CEPH_OSD_FLAG_ONDISK;
        atomic_inc(&req->r_osdc->num_requests);
 }
 
@@ -1749,15 +1742,15 @@ static void finish_request(struct ceph_osd_request *req)
 
 static void __complete_request(struct ceph_osd_request *req)
 {
-       if (req->r_callback)
+       if (req->r_callback) {
+               dout("%s req %p tid %llu cb %pf result %d\n", __func__, req,
+                    req->r_tid, req->r_callback, req->r_result);
                req->r_callback(req);
-       else
-               complete_all(&req->r_completion);
+       }
 }
 
 /*
- * Note that this is open-coded in handle_reply(), which has to deal
- * with ack vs commit, dup acks, etc.
+ * This is open-coded in handle_reply().
  */
 static void complete_request(struct ceph_osd_request *req, int err)
 {
@@ -1766,7 +1759,7 @@ static void complete_request(struct ceph_osd_request *req, int err)
        req->r_result = err;
        finish_request(req);
        __complete_request(req);
-       complete_all(&req->r_done_completion);
+       complete_all(&req->r_completion);
        ceph_osdc_put_request(req);
 }
 
@@ -1792,7 +1785,7 @@ static void cancel_request(struct ceph_osd_request *req)
 
        cancel_map_check(req);
        finish_request(req);
-       complete_all(&req->r_done_completion);
+       complete_all(&req->r_completion);
        ceph_osdc_put_request(req);
 }
 
@@ -2169,7 +2162,6 @@ static void linger_commit_cb(struct ceph_osd_request *req)
        mutex_lock(&lreq->lock);
        dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq,
             lreq->linger_id, req->r_result);
-       WARN_ON(!__linger_registered(lreq));
        linger_reg_commit_complete(lreq, req->r_result);
        lreq->committed = true;
 
@@ -2785,31 +2777,8 @@ e_inval:
 }
 
 /*
- * We are done with @req if
- *   - @m is a safe reply, or
- *   - @m is an unsafe reply and we didn't want a safe one
- */
-static bool done_request(const struct ceph_osd_request *req,
-                        const struct MOSDOpReply *m)
-{
-       return (m->result < 0 ||
-               (m->flags & CEPH_OSD_FLAG_ONDISK) ||
-               !(req->r_flags & CEPH_OSD_FLAG_ONDISK));
-}
-
-/*
- * handle osd op reply.  either call the callback if it is specified,
- * or do the completion to wake up the waiting thread.
- *
- * ->r_unsafe_callback is set? yes                     no
- *
- * first reply is OK (needed   r_cb/r_completion,      r_cb/r_completion,
- * any or needed/got safe)     r_done_completion       r_done_completion
- *
- * first reply is unsafe       r_unsafe_cb(true)       (nothing)
- *
- * when we get the safe reply  r_unsafe_cb(false),     r_cb/r_completion,
- *                             r_done_completion       r_done_completion
+ * Handle MOSDOpReply.  Set ->r_result and call the callback if it is
+ * specified.
  */
 static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
 {
@@ -2818,7 +2787,6 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
        struct MOSDOpReply m;
        u64 tid = le64_to_cpu(msg->hdr.tid);
        u32 data_len = 0;
-       bool already_acked;
        int ret;
        int i;
 
@@ -2897,50 +2865,22 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
                       le32_to_cpu(msg->hdr.data_len), req->r_tid);
                goto fail_request;
        }
-       dout("%s req %p tid %llu acked %d result %d data_len %u\n", __func__,
-            req, req->r_tid, req->r_got_reply, m.result, data_len);
-
-       already_acked = req->r_got_reply;
-       if (!already_acked) {
-               req->r_result = m.result ?: data_len;
-               req->r_replay_version = m.replay_version; /* struct */
-               req->r_got_reply = true;
-       } else if (!(m.flags & CEPH_OSD_FLAG_ONDISK)) {
-               dout("req %p tid %llu dup ack\n", req, req->r_tid);
-               goto out_unlock_session;
-       }
-
-       if (done_request(req, &m)) {
-               finish_request(req);
-               if (req->r_linger) {
-                       WARN_ON(req->r_unsafe_callback);
-                       dout("req %p tid %llu cb (locked)\n", req, req->r_tid);
-                       __complete_request(req);
-               }
-       }
+       dout("%s req %p tid %llu result %d data_len %u\n", __func__,
+            req, req->r_tid, m.result, data_len);
 
+       /*
+        * Since we only ever request ONDISK, we should only ever get
+        * one (type of) reply back.
+        */
+       WARN_ON(!(m.flags & CEPH_OSD_FLAG_ONDISK));
+       req->r_result = m.result ?: data_len;
+       finish_request(req);
        mutex_unlock(&osd->lock);
        up_read(&osdc->lock);
 
-       if (done_request(req, &m)) {
-               if (already_acked && req->r_unsafe_callback) {
-                       dout("req %p tid %llu safe-cb\n", req, req->r_tid);
-                       req->r_unsafe_callback(req, false);
-               } else if (!req->r_linger) {
-                       dout("req %p tid %llu cb\n", req, req->r_tid);
-                       __complete_request(req);
-               }
-               complete_all(&req->r_done_completion);
-               ceph_osdc_put_request(req);
-       } else {
-               if (req->r_unsafe_callback) {
-                       dout("req %p tid %llu unsafe-cb\n", req, req->r_tid);
-                       req->r_unsafe_callback(req, true);
-               } else {
-                       WARN_ON(1);
-               }
-       }
-
+       __complete_request(req);
+       complete_all(&req->r_completion);
+       ceph_osdc_put_request(req);
        return;
 
 fail_request:
@@ -3540,7 +3480,7 @@ again:
                        up_read(&osdc->lock);
                        dout("%s waiting on req %p tid %llu last_tid %llu\n",
                             __func__, req, req->r_tid, last_tid);
-                       wait_for_completion(&req->r_done_completion);
+                       wait_for_completion(&req->r_completion);
                        ceph_osdc_put_request(req);
                        goto again;
                }
@@ -3599,7 +3539,7 @@ ceph_osdc_watch(struct ceph_osd_client *osdc,
 
        ceph_oid_copy(&lreq->t.base_oid, oid);
        ceph_oloc_copy(&lreq->t.base_oloc, oloc);
-       lreq->t.flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
+       lreq->t.flags = CEPH_OSD_FLAG_WRITE;
        lreq->mtime = CURRENT_TIME;
 
        lreq->reg_req = alloc_linger_request(lreq);
@@ -3657,7 +3597,7 @@ int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
 
        ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
        ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
-       req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
+       req->r_flags = CEPH_OSD_FLAG_WRITE;
        req->r_mtime = CURRENT_TIME;
        osd_req_op_watch_init(req, 0, lreq->linger_id,
                              CEPH_OSD_WATCH_OP_UNWATCH);
@@ -4022,7 +3962,7 @@ EXPORT_SYMBOL(ceph_osdc_maybe_request_map);
  * Execute an OSD class method on an object.
  *
  * @flags: CEPH_OSD_FLAG_*
- * @resp_len: out param for reply length
+ * @resp_len: in/out param for reply length
  */
 int ceph_osdc_call(struct ceph_osd_client *osdc,
                   struct ceph_object_id *oid,
@@ -4035,6 +3975,9 @@ int ceph_osdc_call(struct ceph_osd_client *osdc,
        struct ceph_osd_request *req;
        int ret;
 
+       if (req_len > PAGE_SIZE || (resp_page && *resp_len > PAGE_SIZE))
+               return -E2BIG;
+
        req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
        if (!req)
                return -ENOMEM;
@@ -4053,7 +3996,7 @@ int ceph_osdc_call(struct ceph_osd_client *osdc,
                                                  0, false, false);
        if (resp_page)
                osd_req_op_cls_response_data_pages(req, 0, &resp_page,
-                                                  PAGE_SIZE, 0, false, false);
+                                                  *resp_len, 0, false, false);
 
        ceph_osdc_start_request(osdc, req, false);
        ret = ceph_osdc_wait_request(osdc, req);
@@ -4220,8 +4163,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
        int page_align = off & ~PAGE_MASK;
 
        req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
-                                   CEPH_OSD_OP_WRITE,
-                                   CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
+                                   CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
                                    snapc, truncate_seq, truncate_size,
                                    true);
        if (IS_ERR(req))
index d2436880b3056da8342845103c23aa59c66f8066..6824c0ec8373e721ac9ca2d837f488ff22233e1f 100644 (file)
@@ -153,6 +153,32 @@ bad:
         return -EINVAL;
 }
 
+static void crush_finalize(struct crush_map *c)
+{
+       __s32 b;
+
+       /* Space for the array of pointers to per-bucket workspace */
+       c->working_size = sizeof(struct crush_work) +
+           c->max_buckets * sizeof(struct crush_work_bucket *);
+
+       for (b = 0; b < c->max_buckets; b++) {
+               if (!c->buckets[b])
+                       continue;
+
+               switch (c->buckets[b]->alg) {
+               default:
+                       /*
+                        * The base case, permutation variables and
+                        * the pointer to the permutation array.
+                        */
+                       c->working_size += sizeof(struct crush_work_bucket);
+                       break;
+               }
+               /* Every bucket has a permutation array. */
+               c->working_size += c->buckets[b]->size * sizeof(__u32);
+       }
+}
+
 static struct crush_map *crush_decode(void *pbyval, void *end)
 {
        struct crush_map *c;
@@ -246,10 +272,6 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
                b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
                if (b->items == NULL)
                        goto badmem;
-               b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
-               if (b->perm == NULL)
-                       goto badmem;
-               b->perm_n = 0;
 
                ceph_decode_need(p, end, b->size*sizeof(u32), bad);
                for (j = 0; j < b->size; j++)
@@ -368,6 +390,8 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
        dout("crush decode tunable chooseleaf_stable = %d\n",
             c->chooseleaf_stable);
 
+       crush_finalize(c);
+
 done:
        dout("crush_decode success\n");
        return c;
@@ -719,7 +743,7 @@ struct ceph_osdmap *ceph_osdmap_alloc(void)
        map->pool_max = -1;
        map->pg_temp = RB_ROOT;
        map->primary_temp = RB_ROOT;
-       mutex_init(&map->crush_scratch_mutex);
+       mutex_init(&map->crush_workspace_mutex);
 
        return map;
 }
@@ -753,6 +777,7 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
        kfree(map->osd_weight);
        kfree(map->osd_addr);
        kfree(map->osd_primary_affinity);
+       kfree(map->crush_workspace);
        kfree(map);
 }
 
@@ -808,6 +833,31 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
        return 0;
 }
 
+static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush)
+{
+       void *workspace;
+       size_t work_size;
+
+       if (IS_ERR(crush))
+               return PTR_ERR(crush);
+
+       work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE);
+       dout("%s work_size %zu bytes\n", __func__, work_size);
+       workspace = kmalloc(work_size, GFP_NOIO);
+       if (!workspace) {
+               crush_destroy(crush);
+               return -ENOMEM;
+       }
+       crush_init_workspace(crush, workspace);
+
+       if (map->crush)
+               crush_destroy(map->crush);
+       kfree(map->crush_workspace);
+       map->crush = crush;
+       map->crush_workspace = workspace;
+       return 0;
+}
+
 #define OSDMAP_WRAPPER_COMPAT_VER      7
 #define OSDMAP_CLIENT_DATA_COMPAT_VER  1
 
@@ -1214,13 +1264,9 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
 
        /* crush */
        ceph_decode_32_safe(p, end, len, e_inval);
-       map->crush = crush_decode(*p, min(*p + len, end));
-       if (IS_ERR(map->crush)) {
-               err = PTR_ERR(map->crush);
-               map->crush = NULL;
+       err = osdmap_set_crush(map, crush_decode(*p, min(*p + len, end)));
+       if (err)
                goto bad;
-       }
-       *p += len;
 
        /* ignore the rest */
        *p = end;
@@ -1375,7 +1421,6 @@ e_inval:
 struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
                                             struct ceph_osdmap *map)
 {
-       struct crush_map *newcrush = NULL;
        struct ceph_fsid fsid;
        u32 epoch = 0;
        struct ceph_timespec modified;
@@ -1414,12 +1459,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
        /* new crush? */
        ceph_decode_32_safe(p, end, len, e_inval);
        if (len > 0) {
-               newcrush = crush_decode(*p, min(*p+len, end));
-               if (IS_ERR(newcrush)) {
-                       err = PTR_ERR(newcrush);
-                       newcrush = NULL;
+               err = osdmap_set_crush(map,
+                                      crush_decode(*p, min(*p + len, end)));
+               if (err)
                        goto bad;
-               }
                *p += len;
        }
 
@@ -1439,12 +1482,6 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 
        map->epoch++;
        map->modified = modified;
-       if (newcrush) {
-               if (map->crush)
-                       crush_destroy(map->crush);
-               map->crush = newcrush;
-               newcrush = NULL;
-       }
 
        /* new_pools */
        err = decode_new_pools(p, end, map);
@@ -1505,8 +1542,6 @@ bad:
        print_hex_dump(KERN_DEBUG, "osdmap: ",
                       DUMP_PREFIX_OFFSET, 16, 1,
                       start, end - start, true);
-       if (newcrush)
-               crush_destroy(newcrush);
        return ERR_PTR(err);
 }
 
@@ -1942,10 +1977,10 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
 
        BUG_ON(result_max > CEPH_PG_MAX_SIZE);
 
-       mutex_lock(&map->crush_scratch_mutex);
+       mutex_lock(&map->crush_workspace_mutex);
        r = crush_do_rule(map->crush, ruleno, x, result, result_max,
-                         weight, weight_max, map->crush_scratch_ary);
-       mutex_unlock(&map->crush_scratch_mutex);
+                         weight, weight_max, map->crush_workspace);
+       mutex_unlock(&map->crush_workspace_mutex);
 
        return r;
 }
@@ -1978,8 +2013,14 @@ static void pg_to_raw_osds(struct ceph_osdmap *osdmap,
                return;
        }
 
-       len = do_crush(osdmap, ruleno, pps, raw->osds,
-                      min_t(int, pi->size, ARRAY_SIZE(raw->osds)),
+       if (pi->size > ARRAY_SIZE(raw->osds)) {
+               pr_err_ratelimited("pool %lld ruleset %d type %d too wide: size %d > %zu\n",
+                      pi->id, pi->crush_ruleset, pi->type, pi->size,
+                      ARRAY_SIZE(raw->osds));
+               return;
+       }
+
+       len = do_crush(osdmap, ruleno, pps, raw->osds, pi->size,
                       osdmap->osd_weight, osdmap->max_osd);
        if (len < 0) {
                pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
index 154683f5f14cdc6e99fa01a4e79e22e69924eef4..705414e78ae0b05d2d1b8d5d8f8e8fbb6007bfb4 100644 (file)
@@ -18,8 +18,6 @@
  * 02110-1301, USA.
  */
 
-#include <stddef.h>
-
 #include <linux/types.h>
 #include <linux/export.h>
 #include <linux/ceph/libceph.h>
index 7db2ad2e82d3193ff1748bf393f536ba3a5a3eb9..b39a791f6756fc831857774b984febc71e37fae3 100644 (file)
@@ -319,7 +319,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
        int ret, no_addr;
        struct fib_result res;
        struct flowi4 fl4;
-       struct net *net;
+       struct net *net = dev_net(dev);
        bool dev_match;
 
        fl4.flowi4_oif = 0;
@@ -332,6 +332,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
        fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
        fl4.flowi4_tun_key.tun_id = 0;
        fl4.flowi4_flags = 0;
+       fl4.flowi4_uid = sock_net_uid(net, NULL);
 
        no_addr = idev->ifa_list == NULL;
 
@@ -339,13 +340,12 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 
        trace_fib_validate_source(dev, &fl4);
 
-       net = dev_net(dev);
        if (fib_lookup(net, &fl4, &res, 0))
                goto last_resort;
        if (res.type != RTN_UNICAST &&
            (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev)))
                goto e_inval;
-       if (!rpf && !fib_num_tclassid_users(dev_net(dev)) &&
+       if (!rpf && !fib_num_tclassid_users(net) &&
            (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev)))
                goto last_resort;
        fib_combine_itag(itag, &res);
index cb494a5050f7a9d7cbfc7d961ee5ae0d66196be7..8471dd116771462d149e1da2807e446b69b74bcc 100644 (file)
@@ -1876,6 +1876,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
        fl4.flowi4_flags = 0;
        fl4.daddr = daddr;
        fl4.saddr = saddr;
+       fl4.flowi4_uid = sock_net_uid(net, NULL);
        err = fib_lookup(net, &fl4, &res, 0);
        if (err != 0) {
                if (!IN_DEV_FORWARD(in_dev))
@@ -2008,6 +2009,7 @@ int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 {
        int res;
 
+       tos &= IPTOS_RT_MASK;
        rcu_read_lock();
 
        /* Multicast recognition logic is moved from route cache to here.
index c795fee372c4992cf6c391330a85d7d017a7a3a5..644ba59fbd9d5ed8d6ba4a8082dd327589c9bb68 100644 (file)
@@ -693,6 +693,10 @@ vti6_parm_to_user(struct ip6_tnl_parm2 *u, const struct __ip6_tnl_parm *p)
        u->link = p->link;
        u->i_key = p->i_key;
        u->o_key = p->o_key;
+       if (u->i_key)
+               u->i_flags |= GRE_KEY;
+       if (u->o_key)
+               u->o_flags |= GRE_KEY;
        u->proto = p->proto;
 
        memcpy(u->name, p->name, sizeof(u->name));
index babaf3ec2742bd9ba4a16ad0ec71211a5ed700e1..6ba6c900ebcf430cf313a2bef55ff69c114af218 100644 (file)
@@ -1666,6 +1666,10 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
        struct net *net = sock_net(sk);
        struct mr6_table *mrt;
 
+       if (sk->sk_type != SOCK_RAW ||
+           inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
+               return -EOPNOTSUPP;
+
        mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
        if (!mrt)
                return -ENOENT;
@@ -1677,9 +1681,6 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
 
        switch (optname) {
        case MRT6_INIT:
-               if (sk->sk_type != SOCK_RAW ||
-                   inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
-                       return -EOPNOTSUPP;
                if (optlen < sizeof(int))
                        return -EINVAL;
 
@@ -1815,6 +1816,10 @@ int ip6_mroute_getsockopt(struct sock *sk, int optname, char __user *optval,
        struct net *net = sock_net(sk);
        struct mr6_table *mrt;
 
+       if (sk->sk_type != SOCK_RAW ||
+           inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
+               return -EOPNOTSUPP;
+
        mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
        if (!mrt)
                return -ENOENT;
index c59712057dc838170f47f917cc3676f9f06f958b..d25038cfd64e1ae5d5819fe1e7049529f4b5a2e4 100644 (file)
@@ -388,7 +388,7 @@ static int l2tp_ip_backlog_recv(struct sock *sk, struct sk_buff *skb)
 drop:
        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_INDISCARDS);
        kfree_skb(skb);
-       return -1;
+       return 0;
 }
 
 /* Userspace will call sendmsg() on the tunnel socket to send L2TP
index e19a69787d994a506ed7e237598aa2cd6c4014ef..4b2e1fb28bb438d695715fc492f52bf7809ade5d 100644 (file)
@@ -410,7 +410,7 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
        struct net *net = nf_ct_exp_net(expect);
        struct hlist_node *next;
        unsigned int h;
-       int ret = 1;
+       int ret = 0;
 
        if (!master_help) {
                ret = -ESHUTDOWN;
@@ -460,14 +460,14 @@ int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,
 
        spin_lock_bh(&nf_conntrack_expect_lock);
        ret = __nf_ct_expect_check(expect);
-       if (ret <= 0)
+       if (ret < 0)
                goto out;
 
        nf_ct_expect_insert(expect);
 
        spin_unlock_bh(&nf_conntrack_expect_lock);
        nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report);
-       return ret;
+       return 0;
 out:
        spin_unlock_bh(&nf_conntrack_expect_lock);
        return ret;
index c6b8022c0e47d43e11a7f2f351f3f0650c9ba604..bf548a7a71ec9b49cf308af041811d2eb5f33c8c 100644 (file)
@@ -528,6 +528,7 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
                if (!nft_ct_tmpl_alloc_pcpu())
                        return -ENOMEM;
                nft_ct_pcpu_template_refcnt++;
+               len = sizeof(u16);
                break;
 #endif
        default:
index 97f9649bcc7e8b26969e29dab5b00ad28d5db6c3..152d226552c174929fd8973f023eaac888e4b0a9 100644 (file)
@@ -258,7 +258,7 @@ static int nft_bitmap_init(const struct nft_set *set,
 {
        struct nft_bitmap *priv = nft_set_priv(set);
 
-       priv->bitmap_size = nft_bitmap_total_size(set->klen);
+       priv->bitmap_size = nft_bitmap_size(set->klen);
 
        return 0;
 }
index 8d70884d7bb60294c1402892bff3ebe4c81d3663..91fe46f1e4ccf018a554c149a5ce3e804dc9991a 100644 (file)
@@ -111,8 +111,7 @@ static void rds_ib_dev_free(struct work_struct *work)
                kfree(i_ipaddr);
        }
 
-       if (rds_ibdev->vector_load)
-               kfree(rds_ibdev->vector_load);
+       kfree(rds_ibdev->vector_load);
 
        kfree(rds_ibdev);
 }
index 57bb52361e0fda2d3ca4ace76b64a02dc32d1454..5438f6725092b7962613f3cf0203e89f16735e34 100644 (file)
@@ -641,12 +641,12 @@ static int rds_tcp_init(void)
        ret = register_netdevice_notifier(&rds_tcp_dev_notifier);
        if (ret) {
                pr_warn("could not register rds_tcp_dev_notifier\n");
-               goto out;
+               goto out_slab;
        }
 
        ret = register_pernet_subsys(&rds_tcp_net_ops);
        if (ret)
-               goto out_slab;
+               goto out_notifier;
 
        ret = rds_tcp_recv_init();
        if (ret)
@@ -664,9 +664,10 @@ out_recv:
        rds_tcp_recv_exit();
 out_pernet:
        unregister_pernet_subsys(&rds_tcp_net_ops);
-out_slab:
+out_notifier:
        if (unregister_netdevice_notifier(&rds_tcp_dev_notifier))
                pr_warn("could not unregister rds_tcp_dev_notifier\n");
+out_slab:
        kmem_cache_destroy(rds_tcp_conn_slab);
 out:
        return ret;
index 18c737a61d8060a4a8608403defca2a72058dfdc..0a4e28477ad94012b4e7aeb526711076e6259995 100644 (file)
@@ -1065,7 +1065,7 @@ static long rxrpc_read(const struct key *key,
 
                switch (token->security_index) {
                case RXRPC_SECURITY_RXKAD:
-                       toksize += 8 * 4;       /* viceid, kvno, key*2, begin,
+                       toksize += 9 * 4;       /* viceid, kvno, key*2 + len, begin,
                                                 * end, primary, tktlen */
                        toksize += RND(token->kad->ticket_len);
                        break;
index c29362d50a92b7de9771e94d8af375944ee168d0..f3a688e108430a9e9d32e822e54df0700940aacf 100644 (file)
@@ -320,8 +320,10 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
 
        /* Barriers against rxrpc_input_data(). */
        hard_ack = call->rx_hard_ack;
-       top = smp_load_acquire(&call->rx_top);
-       for (seq = hard_ack + 1; before_eq(seq, top); seq++) {
+       seq = hard_ack + 1;
+       while (top = smp_load_acquire(&call->rx_top),
+              before_eq(seq, top)
+              ) {
                ix = seq & RXRPC_RXTX_BUFF_MASK;
                skb = call->rxtx_buffer[ix];
                if (!skb) {
@@ -394,6 +396,8 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
                        ret = 1;
                        goto out;
                }
+
+               seq++;
        }
 
 out:
index f219ff325ed4b816866e46a66c97c6f7724ec81f..b70aa57319ea3233395dc7ae349b8b7aab5dfd03 100644 (file)
@@ -613,8 +613,8 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla,
                        goto err_mod;
                }
 
-               err = nla_memdup_cookie(a, tb);
-               if (err < 0) {
+               if (nla_memdup_cookie(a, tb) < 0) {
+                       err = -ENOMEM;
                        tcf_hash_release(a, bind);
                        goto err_mod;
                }
@@ -859,10 +859,8 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
                goto out_module_put;
 
        err = ops->walk(net, skb, &dcb, RTM_DELACTION, ops);
-       if (err < 0)
+       if (err <= 0)
                goto out_module_put;
-       if (err == 0)
-               goto noflush_out;
 
        nla_nest_end(skb, nest);
 
@@ -879,7 +877,6 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
 out_module_put:
        module_put(ops->owner);
 err_out:
-noflush_out:
        kfree_skb(skb);
        return err;
 }
index 8227bbbd077a4ad04df53886558d34c4d6a3daea..1b6d4574d2b02a2877caba604bb549352a0f0470 100644 (file)
@@ -199,6 +199,7 @@ int sctp_copy_local_addr_list(struct net *net, struct sctp_bind_addr *bp,
                              sctp_scope_t scope, gfp_t gfp, int copy_flags)
 {
        struct sctp_sockaddr_entry *addr;
+       union sctp_addr laddr;
        int error = 0;
 
        rcu_read_lock();
@@ -220,7 +221,10 @@ int sctp_copy_local_addr_list(struct net *net, struct sctp_bind_addr *bp,
                     !(copy_flags & SCTP_ADDR6_PEERSUPP)))
                        continue;
 
-               if (sctp_bind_addr_state(bp, &addr->a) != -1)
+               laddr = addr->a;
+               /* also works for setting ipv6 address port */
+               laddr.v4.sin_port = htons(bp->port);
+               if (sctp_bind_addr_state(bp, &laddr) != -1)
                        continue;
 
                error = sctp_add_bind_addr(bp, &addr->a, sizeof(addr->a),
index b5321486fbed300106844584d1764181342d8e35..465a9c8464f9477f827c14cbdab6879ac181a507 100644 (file)
@@ -4862,6 +4862,12 @@ int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp)
        if (!asoc)
                return -EINVAL;
 
+       /* If there is a thread waiting on more sndbuf space for
+        * sending on this asoc, it cannot be peeled.
+        */
+       if (waitqueue_active(&asoc->wait))
+               return -EBUSY;
+
        /* An association cannot be branched off from an already peeled-off
         * socket, nor is this supported for tcp style sockets.
         */
@@ -7599,8 +7605,6 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p,
                 */
                release_sock(sk);
                current_timeo = schedule_timeout(current_timeo);
-               if (sk != asoc->base.sk)
-                       goto do_error;
                lock_sock(sk);
 
                *timeo_p = current_timeo;
index 1530825985221a1aeb5f77ee81f4251acdef9d96..a54a7a3d28f5300e7940769b1b3bc0b5daa7cfbb 100644 (file)
@@ -1489,8 +1489,8 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
        case RPC_GSS_PROC_DESTROY:
                if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq))
                        goto auth_err;
-               rsci->h.expiry_time = seconds_since_boot();
-               set_bit(CACHE_NEGATIVE, &rsci->h.flags);
+               /* Delete the entry from the cache_list and call cache_put */
+               sunrpc_cache_unhash(sn->rsc_cache, &rsci->h);
                if (resv->iov_len + 4 > PAGE_SIZE)
                        goto drop;
                svc_putnl(resv, RPC_SUCCESS);
index f39e3e11f9aa283698ced6a8ca92fed5f68140e5..d8639da06d9cd4815a407ef4dec4340bee68caf4 100644 (file)
@@ -362,11 +362,6 @@ void sunrpc_destroy_cache_detail(struct cache_detail *cd)
        cache_purge(cd);
        spin_lock(&cache_list_lock);
        write_lock(&cd->hash_lock);
-       if (cd->entries) {
-               write_unlock(&cd->hash_lock);
-               spin_unlock(&cache_list_lock);
-               goto out;
-       }
        if (current_detail == cd)
                current_detail = NULL;
        list_del_init(&cd->others);
@@ -376,9 +371,6 @@ void sunrpc_destroy_cache_detail(struct cache_detail *cd)
                /* module must be being unloaded so its safe to kill the worker */
                cancel_delayed_work_sync(&cache_cleaner);
        }
-       return;
-out:
-       printk(KERN_ERR "RPC: failed to unregister %s cache\n", cd->name);
 }
 EXPORT_SYMBOL_GPL(sunrpc_destroy_cache_detail);
 
@@ -497,13 +489,32 @@ EXPORT_SYMBOL_GPL(cache_flush);
 
 void cache_purge(struct cache_detail *detail)
 {
-       time_t now = seconds_since_boot();
-       if (detail->flush_time >= now)
-               now = detail->flush_time + 1;
-       /* 'now' is the maximum value any 'last_refresh' can have */
-       detail->flush_time = now;
-       detail->nextcheck = seconds_since_boot();
-       cache_flush();
+       struct cache_head *ch = NULL;
+       struct hlist_head *head = NULL;
+       struct hlist_node *tmp = NULL;
+       int i = 0;
+
+       write_lock(&detail->hash_lock);
+       if (!detail->entries) {
+               write_unlock(&detail->hash_lock);
+               return;
+       }
+
+       dprintk("RPC: %d entries in %s cache\n", detail->entries, detail->name);
+       for (i = 0; i < detail->hash_size; i++) {
+               head = &detail->hash_table[i];
+               hlist_for_each_entry_safe(ch, tmp, head, cache_list) {
+                       hlist_del_init(&ch->cache_list);
+                       detail->entries--;
+
+                       set_bit(CACHE_CLEANED, &ch->flags);
+                       write_unlock(&detail->hash_lock);
+                       cache_fresh_unlocked(ch, detail);
+                       cache_put(ch, detail);
+                       write_lock(&detail->hash_lock);
+               }
+       }
+       write_unlock(&detail->hash_lock);
 }
 EXPORT_SYMBOL_GPL(cache_purge);
 
@@ -1855,3 +1866,15 @@ void sunrpc_cache_unregister_pipefs(struct cache_detail *cd)
 }
 EXPORT_SYMBOL_GPL(sunrpc_cache_unregister_pipefs);
 
+void sunrpc_cache_unhash(struct cache_detail *cd, struct cache_head *h)
+{
+       write_lock(&cd->hash_lock);
+       if (!hlist_unhashed(&h->cache_list)){
+               hlist_del_init(&h->cache_list);
+               cd->entries--;
+               write_unlock(&cd->hash_lock);
+               cache_put(h, cd);
+       } else
+               write_unlock(&cd->hash_lock);
+}
+EXPORT_SYMBOL_GPL(sunrpc_cache_unhash);
index 2e22889a8837bd6d0dc3be1d1d6f8528829c0080..b94efd93d3e498a94bec4fee5eec8b9748052bdb 100644 (file)
@@ -385,7 +385,7 @@ static int svc_uses_rpcbind(struct svc_serv *serv)
                for (i = 0; i < progp->pg_nvers; i++) {
                        if (progp->pg_vers[i] == NULL)
                                continue;
-                       if (progp->pg_vers[i]->vs_hidden == 0)
+                       if (!progp->pg_vers[i]->vs_hidden)
                                return 1;
                }
        }
@@ -976,6 +976,13 @@ int svc_register(const struct svc_serv *serv, struct net *net,
                        if (vers->vs_hidden)
                                continue;
 
+                       /*
+                        * Don't register a UDP port if we need congestion
+                        * control.
+                        */
+                       if (vers->vs_need_cong_ctrl && proto == IPPROTO_UDP)
+                               continue;
+
                        error = __svc_register(net, progp->pg_name, progp->pg_prog,
                                                i, family, proto, port);
 
@@ -1169,6 +1176,21 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
          !(versp = progp->pg_vers[vers]))
                goto err_bad_vers;
 
+       /*
+        * Some protocol versions (namely NFSv4) require some form of
+        * congestion control.  (See RFC 7530 section 3.1 paragraph 2)
+        * In other words, UDP is not allowed. We mark those when setting
+        * up the svc_xprt, and verify that here.
+        *
+        * The spec is not very clear about what error should be returned
+        * when someone tries to access a server that is listening on UDP
+        * for lower versions. RPC_PROG_MISMATCH seems to be the closest
+        * fit.
+        */
+       if (versp->vs_need_cong_ctrl &&
+           !test_bit(XPT_CONG_CTRL, &rqstp->rq_xprt->xpt_flags))
+               goto err_bad_vers;
+
        procp = versp->vs_proc + proc;
        if (proc >= versp->vs_nproc || !procp->pc_func)
                goto err_bad_proc;
index d227d97f7ad4d3b3102b6c329dcfe925502fc471..8931e33b65412d7b8bbe8b3872e5f7d7b27d92d5 100644 (file)
@@ -1306,6 +1306,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
        svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_tcp_class,
                      &svsk->sk_xprt, serv);
        set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
+       set_bit(XPT_CONG_CTRL, &svsk->sk_xprt.xpt_flags);
        if (sk->sk_state == TCP_LISTEN) {
                dprintk("setting up TCP socket for listening\n");
                set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags);
index cb1e48e54eb1440181976a352229783f202f896d..ff1df40f0d261bc956f1af3410d8780f4c582b83 100644 (file)
@@ -201,19 +201,20 @@ rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst)
 {
        struct rpc_xprt *xprt = rqst->rq_xprt;
        struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
-       struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)rqst->rq_buffer;
+       __be32 *p;
        int rc;
 
        /* Space in the send buffer for an RPC/RDMA header is reserved
         * via xprt->tsh_size.
         */
-       headerp->rm_xid = rqst->rq_xid;
-       headerp->rm_vers = rpcrdma_version;
-       headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests);
-       headerp->rm_type = rdma_msg;
-       headerp->rm_body.rm_chunks[0] = xdr_zero;
-       headerp->rm_body.rm_chunks[1] = xdr_zero;
-       headerp->rm_body.rm_chunks[2] = xdr_zero;
+       p = rqst->rq_buffer;
+       *p++ = rqst->rq_xid;
+       *p++ = rpcrdma_version;
+       *p++ = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests);
+       *p++ = rdma_msg;
+       *p++ = xdr_zero;
+       *p++ = xdr_zero;
+       *p   = xdr_zero;
 
 #ifdef SVCRDMA_BACKCHANNEL_DEBUG
        pr_info("%s: %*ph\n", __func__, 64, rqst->rq_buffer);
index 0ba9887f3e22bab9a1e3e809df5c4e2c23a510fe..1c4aabf0f65772c13265421262feb030ab4a58ca 100644 (file)
@@ -1,4 +1,5 @@
 /*
+ * Copyright (c) 2016 Oracle. All rights reserved.
  * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
  *
  * This software is available to you under a choice of one of two
 
 #define RPCDBG_FACILITY        RPCDBG_SVCXPRT
 
-/*
- * Decodes a read chunk list. The expected format is as follows:
- *    descrim  : xdr_one
- *    position : __be32 offset into XDR stream
- *    handle   : __be32 RKEY
- *    . . .
- *  end-of-list: xdr_zero
- */
-static __be32 *decode_read_list(__be32 *va, __be32 *vaend)
+static __be32 *xdr_check_read_list(__be32 *p, __be32 *end)
 {
-       struct rpcrdma_read_chunk *ch = (struct rpcrdma_read_chunk *)va;
+       __be32 *next;
 
-       while (ch->rc_discrim != xdr_zero) {
-               if (((unsigned long)ch + sizeof(struct rpcrdma_read_chunk)) >
-                   (unsigned long)vaend) {
-                       dprintk("svcrdma: vaend=%p, ch=%p\n", vaend, ch);
+       while (*p++ != xdr_zero) {
+               next = p + rpcrdma_readchunk_maxsz - 1;
+               if (next > end)
                        return NULL;
-               }
-               ch++;
+               p = next;
        }
-       return &ch->rc_position;
+       return p;
 }
 
-/*
- * Decodes a write chunk list. The expected format is as follows:
- *    descrim  : xdr_one
- *    nchunks  : <count>
- *       handle   : __be32 RKEY           ---+
- *       length   : __be32 <len of segment>  |
- *       offset   : remove va                + <count>
- *       . . .                               |
- *                                        ---+
- */
-static __be32 *decode_write_list(__be32 *va, __be32 *vaend)
+static __be32 *xdr_check_write_list(__be32 *p, __be32 *end)
 {
-       unsigned long start, end;
-       int nchunks;
-
-       struct rpcrdma_write_array *ary =
-               (struct rpcrdma_write_array *)va;
+       __be32 *next;
 
-       /* Check for not write-array */
-       if (ary->wc_discrim == xdr_zero)
-               return &ary->wc_nchunks;
-
-       if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) >
-           (unsigned long)vaend) {
-               dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
-               return NULL;
-       }
-       nchunks = be32_to_cpu(ary->wc_nchunks);
-
-       start = (unsigned long)&ary->wc_array[0];
-       end = (unsigned long)vaend;
-       if (nchunks < 0 ||
-           nchunks > (SIZE_MAX - start) / sizeof(struct rpcrdma_write_chunk) ||
-           (start + (sizeof(struct rpcrdma_write_chunk) * nchunks)) > end) {
-               dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n",
-                       ary, nchunks, vaend);
-               return NULL;
+       while (*p++ != xdr_zero) {
+               next = p + 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz;
+               if (next > end)
+                       return NULL;
+               p = next;
        }
-       /*
-        * rs_length is the 2nd 4B field in wc_target and taking its
-        * address skips the list terminator
-        */
-       return &ary->wc_array[nchunks].wc_target.rs_length;
+       return p;
 }
 
-static __be32 *decode_reply_array(__be32 *va, __be32 *vaend)
+static __be32 *xdr_check_reply_chunk(__be32 *p, __be32 *end)
 {
-       unsigned long start, end;
-       int nchunks;
-       struct rpcrdma_write_array *ary =
-               (struct rpcrdma_write_array *)va;
-
-       /* Check for no reply-array */
-       if (ary->wc_discrim == xdr_zero)
-               return &ary->wc_nchunks;
-
-       if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) >
-           (unsigned long)vaend) {
-               dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
-               return NULL;
-       }
-       nchunks = be32_to_cpu(ary->wc_nchunks);
-
-       start = (unsigned long)&ary->wc_array[0];
-       end = (unsigned long)vaend;
-       if (nchunks < 0 ||
-           nchunks > (SIZE_MAX - start) / sizeof(struct rpcrdma_write_chunk) ||
-           (start + (sizeof(struct rpcrdma_write_chunk) * nchunks)) > end) {
-               dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n",
-                       ary, nchunks, vaend);
-               return NULL;
+       __be32 *next;
+
+       if (*p++ != xdr_zero) {
+               next = p + 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz;
+               if (next > end)
+                       return NULL;
+               p = next;
        }
-       return (__be32 *)&ary->wc_array[nchunks];
+       return p;
 }
 
 /**
@@ -158,87 +100,71 @@ static __be32 *decode_reply_array(__be32 *va, __be32 *vaend)
  */
 int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg)
 {
-       struct rpcrdma_msg *rmsgp;
-       __be32 *va, *vaend;
-       unsigned int len;
-       u32 hdr_len;
+       __be32 *p, *end, *rdma_argp;
+       unsigned int hdr_len;
 
        /* Verify that there's enough bytes for header + something */
-       if (rq_arg->len <= RPCRDMA_HDRLEN_ERR) {
-               dprintk("svcrdma: header too short = %d\n",
-                       rq_arg->len);
-               return -EINVAL;
-       }
+       if (rq_arg->len <= RPCRDMA_HDRLEN_ERR)
+               goto out_short;
 
-       rmsgp = (struct rpcrdma_msg *)rq_arg->head[0].iov_base;
-       if (rmsgp->rm_vers != rpcrdma_version) {
-               dprintk("%s: bad version %u\n", __func__,
-                       be32_to_cpu(rmsgp->rm_vers));
-               return -EPROTONOSUPPORT;
-       }
+       rdma_argp = rq_arg->head[0].iov_base;
+       if (*(rdma_argp + 1) != rpcrdma_version)
+               goto out_version;
 
-       switch (be32_to_cpu(rmsgp->rm_type)) {
-       case RDMA_MSG:
-       case RDMA_NOMSG:
+       switch (*(rdma_argp + 3)) {
+       case rdma_msg:
+       case rdma_nomsg:
                break;
 
-       case RDMA_DONE:
-               /* Just drop it */
-               dprintk("svcrdma: dropping RDMA_DONE message\n");
-               return 0;
-
-       case RDMA_ERROR:
-               /* Possible if this is a backchannel reply.
-                * XXX: We should cancel this XID, though.
-                */
-               dprintk("svcrdma: dropping RDMA_ERROR message\n");
-               return 0;
-
-       case RDMA_MSGP:
-               /* Pull in the extra for the padded case, bump our pointer */
-               rmsgp->rm_body.rm_padded.rm_align =
-                       be32_to_cpu(rmsgp->rm_body.rm_padded.rm_align);
-               rmsgp->rm_body.rm_padded.rm_thresh =
-                       be32_to_cpu(rmsgp->rm_body.rm_padded.rm_thresh);
-
-               va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
-               rq_arg->head[0].iov_base = va;
-               len = (u32)((unsigned long)va - (unsigned long)rmsgp);
-               rq_arg->head[0].iov_len -= len;
-               if (len > rq_arg->len)
-                       return -EINVAL;
-               return len;
-       default:
-               dprintk("svcrdma: bad rdma procedure (%u)\n",
-                       be32_to_cpu(rmsgp->rm_type));
-               return -EINVAL;
-       }
+       case rdma_done:
+               goto out_drop;
 
-       /* The chunk list may contain either a read chunk list or a write
-        * chunk list and a reply chunk list.
-        */
-       va = &rmsgp->rm_body.rm_chunks[0];
-       vaend = (__be32 *)((unsigned long)rmsgp + rq_arg->len);
-       va = decode_read_list(va, vaend);
-       if (!va) {
-               dprintk("svcrdma: failed to decode read list\n");
-               return -EINVAL;
-       }
-       va = decode_write_list(va, vaend);
-       if (!va) {
-               dprintk("svcrdma: failed to decode write list\n");
-               return -EINVAL;
-       }
-       va = decode_reply_array(va, vaend);
-       if (!va) {
-               dprintk("svcrdma: failed to decode reply chunk\n");
-               return -EINVAL;
+       case rdma_error:
+               goto out_drop;
+
+       default:
+               goto out_proc;
        }
 
-       rq_arg->head[0].iov_base = va;
-       hdr_len = (unsigned long)va - (unsigned long)rmsgp;
+       end = (__be32 *)((unsigned long)rdma_argp + rq_arg->len);
+       p = xdr_check_read_list(rdma_argp + 4, end);
+       if (!p)
+               goto out_inval;
+       p = xdr_check_write_list(p, end);
+       if (!p)
+               goto out_inval;
+       p = xdr_check_reply_chunk(p, end);
+       if (!p)
+               goto out_inval;
+       if (p > end)
+               goto out_inval;
+
+       rq_arg->head[0].iov_base = p;
+       hdr_len = (unsigned long)p - (unsigned long)rdma_argp;
        rq_arg->head[0].iov_len -= hdr_len;
        return hdr_len;
+
+out_short:
+       dprintk("svcrdma: header too short = %d\n", rq_arg->len);
+       return -EINVAL;
+
+out_version:
+       dprintk("svcrdma: bad xprt version: %u\n",
+               be32_to_cpup(rdma_argp + 1));
+       return -EPROTONOSUPPORT;
+
+out_drop:
+       dprintk("svcrdma: dropping RDMA_DONE/ERROR message\n");
+       return 0;
+
+out_proc:
+       dprintk("svcrdma: bad rdma procedure (%u)\n",
+               be32_to_cpup(rdma_argp + 3));
+       return -EINVAL;
+
+out_inval:
+       dprintk("svcrdma: failed to parse transport header\n");
+       return -EINVAL;
 }
 
 int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
@@ -249,7 +175,7 @@ int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
 
        *va++ = rmsgp->rm_xid;
        *va++ = rmsgp->rm_vers;
-       *va++ = cpu_to_be32(xprt->sc_max_requests);
+       *va++ = xprt->sc_fc_credits;
        *va++ = rdma_error;
        *va++ = cpu_to_be32(err);
        if (err == ERR_VERS) {
@@ -260,32 +186,35 @@ int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
        return (int)((unsigned long)va - (unsigned long)startp);
 }
 
-int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *rmsgp)
+/**
+ * svc_rdma_xdr_get_reply_hdr_length - Get length of Reply transport header
+ * @rdma_resp: buffer containing Reply transport header
+ *
+ * Returns length of transport header, in bytes.
+ */
+unsigned int svc_rdma_xdr_get_reply_hdr_len(__be32 *rdma_resp)
 {
-       struct rpcrdma_write_array *wr_ary;
+       unsigned int nsegs;
+       __be32 *p;
 
-       /* There is no read-list in a reply */
+       p = rdma_resp;
 
-       /* skip write list */
-       wr_ary = (struct rpcrdma_write_array *)
-               &rmsgp->rm_body.rm_chunks[1];
-       if (wr_ary->wc_discrim)
-               wr_ary = (struct rpcrdma_write_array *)
-                       &wr_ary->wc_array[be32_to_cpu(wr_ary->wc_nchunks)].
-                       wc_target.rs_length;
-       else
-               wr_ary = (struct rpcrdma_write_array *)
-                       &wr_ary->wc_nchunks;
-
-       /* skip reply array */
-       if (wr_ary->wc_discrim)
-               wr_ary = (struct rpcrdma_write_array *)
-                       &wr_ary->wc_array[be32_to_cpu(wr_ary->wc_nchunks)];
-       else
-               wr_ary = (struct rpcrdma_write_array *)
-                       &wr_ary->wc_nchunks;
-
-       return (unsigned long) wr_ary - (unsigned long) rmsgp;
+       /* RPC-over-RDMA V1 replies never have a Read list. */
+       p += rpcrdma_fixed_maxsz + 1;
+
+       /* Skip Write list. */
+       while (*p++ != xdr_zero) {
+               nsegs = be32_to_cpup(p++);
+               p += nsegs * rpcrdma_segment_maxsz;
+       }
+
+       /* Skip Reply chunk. */
+       if (*p++ != xdr_zero) {
+               nsegs = be32_to_cpup(p++);
+               p += nsegs * rpcrdma_segment_maxsz;
+       }
+
+       return (unsigned long)p - (unsigned long)rdma_resp;
 }
 
 void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks)
@@ -326,19 +255,3 @@ void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary,
        seg->rs_offset = rs_offset;
        seg->rs_length = cpu_to_be32(write_len);
 }
-
-void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *xprt,
-                                 struct rpcrdma_msg *rdma_argp,
-                                 struct rpcrdma_msg *rdma_resp,
-                                 enum rpcrdma_proc rdma_type)
-{
-       rdma_resp->rm_xid = rdma_argp->rm_xid;
-       rdma_resp->rm_vers = rdma_argp->rm_vers;
-       rdma_resp->rm_credit = cpu_to_be32(xprt->sc_max_requests);
-       rdma_resp->rm_type = cpu_to_be32(rdma_type);
-
-       /* Encode <nul> chunks lists */
-       rdma_resp->rm_body.rm_chunks[0] = xdr_zero;
-       rdma_resp->rm_body.rm_chunks[1] = xdr_zero;
-       rdma_resp->rm_body.rm_chunks[2] = xdr_zero;
-}
index 172b537f8cfc942ef62574b74cff7ac5f421fba9..f7b2daf72a86582807798379ac3be336b061a958 100644 (file)
@@ -606,26 +606,24 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
 
        dprintk("svcrdma: rqstp=%p\n", rqstp);
 
-       spin_lock_bh(&rdma_xprt->sc_rq_dto_lock);
+       spin_lock(&rdma_xprt->sc_rq_dto_lock);
        if (!list_empty(&rdma_xprt->sc_read_complete_q)) {
-               ctxt = list_entry(rdma_xprt->sc_read_complete_q.next,
-                                 struct svc_rdma_op_ctxt,
-                                 dto_q);
-               list_del_init(&ctxt->dto_q);
-               spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
+               ctxt = list_first_entry(&rdma_xprt->sc_read_complete_q,
+                                       struct svc_rdma_op_ctxt, list);
+               list_del(&ctxt->list);
+               spin_unlock(&rdma_xprt->sc_rq_dto_lock);
                rdma_read_complete(rqstp, ctxt);
                goto complete;
        } else if (!list_empty(&rdma_xprt->sc_rq_dto_q)) {
-               ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next,
-                                 struct svc_rdma_op_ctxt,
-                                 dto_q);
-               list_del_init(&ctxt->dto_q);
+               ctxt = list_first_entry(&rdma_xprt->sc_rq_dto_q,
+                                       struct svc_rdma_op_ctxt, list);
+               list_del(&ctxt->list);
        } else {
                atomic_inc(&rdma_stat_rq_starve);
                clear_bit(XPT_DATA, &xprt->xpt_flags);
                ctxt = NULL;
        }
-       spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
+       spin_unlock(&rdma_xprt->sc_rq_dto_lock);
        if (!ctxt) {
                /* This is the EAGAIN path. The svc_recv routine will
                 * return -EAGAIN, the nfsd thread will go to call into
index ad4d286a83c5195fe663dd581cd49a5c9f9a6166..515221b16d0956ea027e91985c89606c403d5109 100644 (file)
@@ -476,7 +476,8 @@ static int send_reply(struct svcxprt_rdma *rdma,
 
        /* Prepare the SGE for the RPCRDMA Header */
        ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey;
-       ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp);
+       ctxt->sge[0].length =
+           svc_rdma_xdr_get_reply_hdr_len((__be32 *)rdma_resp);
        ctxt->sge[0].addr =
            ib_dma_map_page(rdma->sc_cm_id->device, page, 0,
                            ctxt->sge[0].length, DMA_TO_DEVICE);
@@ -559,12 +560,12 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
        struct rpcrdma_msg *rdma_argp;
        struct rpcrdma_msg *rdma_resp;
        struct rpcrdma_write_array *wr_ary, *rp_ary;
-       enum rpcrdma_proc reply_type;
        int ret;
        int inline_bytes;
        struct page *res_page;
        struct svc_rdma_req_map *vec;
        u32 inv_rkey;
+       __be32 *p;
 
        dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
 
@@ -596,12 +597,17 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
        if (!res_page)
                goto err0;
        rdma_resp = page_address(res_page);
-       if (rp_ary)
-               reply_type = RDMA_NOMSG;
-       else
-               reply_type = RDMA_MSG;
-       svc_rdma_xdr_encode_reply_header(rdma, rdma_argp,
-                                        rdma_resp, reply_type);
+
+       p = &rdma_resp->rm_xid;
+       *p++ = rdma_argp->rm_xid;
+       *p++ = rdma_argp->rm_vers;
+       *p++ = rdma->sc_fc_credits;
+       *p++ = rp_ary ? rdma_nomsg : rdma_msg;
+
+       /* Start with empty chunks */
+       *p++ = xdr_zero;
+       *p++ = xdr_zero;
+       *p   = xdr_zero;
 
        /* Send any write-chunk data and build resp write-list */
        if (wr_ary) {
index 39652d390a9c60bc026199a7dcb5ef996bcd65ab..c13a5c35ce14d992515fa99e456976ed0cd1c382 100644 (file)
@@ -157,8 +157,7 @@ static struct svc_rdma_op_ctxt *alloc_ctxt(struct svcxprt_rdma *xprt,
        ctxt = kmalloc(sizeof(*ctxt), flags);
        if (ctxt) {
                ctxt->xprt = xprt;
-               INIT_LIST_HEAD(&ctxt->free);
-               INIT_LIST_HEAD(&ctxt->dto_q);
+               INIT_LIST_HEAD(&ctxt->list);
        }
        return ctxt;
 }
@@ -180,7 +179,7 @@ static bool svc_rdma_prealloc_ctxts(struct svcxprt_rdma *xprt)
                        dprintk("svcrdma: No memory for RDMA ctxt\n");
                        return false;
                }
-               list_add(&ctxt->free, &xprt->sc_ctxts);
+               list_add(&ctxt->list, &xprt->sc_ctxts);
        }
        return true;
 }
@@ -189,15 +188,15 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
 {
        struct svc_rdma_op_ctxt *ctxt = NULL;
 
-       spin_lock_bh(&xprt->sc_ctxt_lock);
+       spin_lock(&xprt->sc_ctxt_lock);
        xprt->sc_ctxt_used++;
        if (list_empty(&xprt->sc_ctxts))
                goto out_empty;
 
        ctxt = list_first_entry(&xprt->sc_ctxts,
-                               struct svc_rdma_op_ctxt, free);
-       list_del_init(&ctxt->free);
-       spin_unlock_bh(&xprt->sc_ctxt_lock);
+                               struct svc_rdma_op_ctxt, list);
+       list_del(&ctxt->list);
+       spin_unlock(&xprt->sc_ctxt_lock);
 
 out:
        ctxt->count = 0;
@@ -209,15 +208,15 @@ out_empty:
        /* Either pre-allocation missed the mark, or send
         * queue accounting is broken.
         */
-       spin_unlock_bh(&xprt->sc_ctxt_lock);
+       spin_unlock(&xprt->sc_ctxt_lock);
 
        ctxt = alloc_ctxt(xprt, GFP_NOIO);
        if (ctxt)
                goto out;
 
-       spin_lock_bh(&xprt->sc_ctxt_lock);
+       spin_lock(&xprt->sc_ctxt_lock);
        xprt->sc_ctxt_used--;
-       spin_unlock_bh(&xprt->sc_ctxt_lock);
+       spin_unlock(&xprt->sc_ctxt_lock);
        WARN_ONCE(1, "svcrdma: empty RDMA ctxt list?\n");
        return NULL;
 }
@@ -254,10 +253,10 @@ void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
                for (i = 0; i < ctxt->count; i++)
                        put_page(ctxt->pages[i]);
 
-       spin_lock_bh(&xprt->sc_ctxt_lock);
+       spin_lock(&xprt->sc_ctxt_lock);
        xprt->sc_ctxt_used--;
-       list_add(&ctxt->free, &xprt->sc_ctxts);
-       spin_unlock_bh(&xprt->sc_ctxt_lock);
+       list_add(&ctxt->list, &xprt->sc_ctxts);
+       spin_unlock(&xprt->sc_ctxt_lock);
 }
 
 static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt)
@@ -266,8 +265,8 @@ static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt)
                struct svc_rdma_op_ctxt *ctxt;
 
                ctxt = list_first_entry(&xprt->sc_ctxts,
-                                       struct svc_rdma_op_ctxt, free);
-               list_del(&ctxt->free);
+                                       struct svc_rdma_op_ctxt, list);
+               list_del(&ctxt->list);
                kfree(ctxt);
        }
 }
@@ -404,7 +403,7 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
        /* All wc fields are now known to be valid */
        ctxt->byte_len = wc->byte_len;
        spin_lock(&xprt->sc_rq_dto_lock);
-       list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q);
+       list_add_tail(&ctxt->list, &xprt->sc_rq_dto_q);
        spin_unlock(&xprt->sc_rq_dto_lock);
 
        set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
@@ -525,7 +524,7 @@ void svc_rdma_wc_read(struct ib_cq *cq, struct ib_wc *wc)
 
                read_hdr = ctxt->read_hdr;
                spin_lock(&xprt->sc_rq_dto_lock);
-               list_add_tail(&read_hdr->dto_q,
+               list_add_tail(&read_hdr->list,
                              &xprt->sc_read_complete_q);
                spin_unlock(&xprt->sc_rq_dto_lock);
 
@@ -557,7 +556,6 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
                return NULL;
        svc_xprt_init(&init_net, &svc_rdma_class, &cma_xprt->sc_xprt, serv);
        INIT_LIST_HEAD(&cma_xprt->sc_accept_q);
-       INIT_LIST_HEAD(&cma_xprt->sc_dto_q);
        INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
        INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
        INIT_LIST_HEAD(&cma_xprt->sc_frmr_q);
@@ -571,6 +569,14 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
        spin_lock_init(&cma_xprt->sc_ctxt_lock);
        spin_lock_init(&cma_xprt->sc_map_lock);
 
+       /*
+        * Note that this implies that the underlying transport support
+        * has some form of congestion control (see RFC 7530 section 3.1
+        * paragraph 2). For now, we assume that all supported RDMA
+        * transports are suitable here.
+        */
+       set_bit(XPT_CONG_CTRL, &cma_xprt->sc_xprt.xpt_flags);
+
        if (listener)
                set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
 
@@ -923,14 +929,14 @@ struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *rdma)
 {
        struct svc_rdma_fastreg_mr *frmr = NULL;
 
-       spin_lock_bh(&rdma->sc_frmr_q_lock);
+       spin_lock(&rdma->sc_frmr_q_lock);
        if (!list_empty(&rdma->sc_frmr_q)) {
                frmr = list_entry(rdma->sc_frmr_q.next,
                                  struct svc_rdma_fastreg_mr, frmr_list);
                list_del_init(&frmr->frmr_list);
                frmr->sg_nents = 0;
        }
-       spin_unlock_bh(&rdma->sc_frmr_q_lock);
+       spin_unlock(&rdma->sc_frmr_q_lock);
        if (frmr)
                return frmr;
 
@@ -943,10 +949,10 @@ void svc_rdma_put_frmr(struct svcxprt_rdma *rdma,
        if (frmr) {
                ib_dma_unmap_sg(rdma->sc_cm_id->device,
                                frmr->sg, frmr->sg_nents, frmr->direction);
-               spin_lock_bh(&rdma->sc_frmr_q_lock);
+               spin_lock(&rdma->sc_frmr_q_lock);
                WARN_ON_ONCE(!list_empty(&frmr->frmr_list));
                list_add(&frmr->frmr_list, &rdma->sc_frmr_q);
-               spin_unlock_bh(&rdma->sc_frmr_q_lock);
+               spin_unlock(&rdma->sc_frmr_q_lock);
        }
 }
 
@@ -1002,6 +1008,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
        newxprt->sc_max_req_size = svcrdma_max_req_size;
        newxprt->sc_max_requests = min_t(u32, dev->attrs.max_qp_wr,
                                         svcrdma_max_requests);
+       newxprt->sc_fc_credits = cpu_to_be32(newxprt->sc_max_requests);
        newxprt->sc_max_bc_requests = min_t(u32, dev->attrs.max_qp_wr,
                                            svcrdma_max_bc_requests);
        newxprt->sc_rq_depth = newxprt->sc_max_requests +
@@ -1027,13 +1034,13 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
                goto errout;
        }
        newxprt->sc_sq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_sq_depth,
-                                       0, IB_POLL_SOFTIRQ);
+                                       0, IB_POLL_WORKQUEUE);
        if (IS_ERR(newxprt->sc_sq_cq)) {
                dprintk("svcrdma: error creating SQ CQ for connect request\n");
                goto errout;
        }
        newxprt->sc_rq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_rq_depth,
-                                       0, IB_POLL_SOFTIRQ);
+                                       0, IB_POLL_WORKQUEUE);
        if (IS_ERR(newxprt->sc_rq_cq)) {
                dprintk("svcrdma: error creating RQ CQ for connect request\n");
                goto errout;
@@ -1213,20 +1220,18 @@ static void __svc_rdma_free(struct work_struct *work)
         */
        while (!list_empty(&rdma->sc_read_complete_q)) {
                struct svc_rdma_op_ctxt *ctxt;
-               ctxt = list_entry(rdma->sc_read_complete_q.next,
-                                 struct svc_rdma_op_ctxt,
-                                 dto_q);
-               list_del_init(&ctxt->dto_q);
+               ctxt = list_first_entry(&rdma->sc_read_complete_q,
+                                       struct svc_rdma_op_ctxt, list);
+               list_del(&ctxt->list);
                svc_rdma_put_context(ctxt, 1);
        }
 
        /* Destroy queued, but not processed recv completions */
        while (!list_empty(&rdma->sc_rq_dto_q)) {
                struct svc_rdma_op_ctxt *ctxt;
-               ctxt = list_entry(rdma->sc_rq_dto_q.next,
-                                 struct svc_rdma_op_ctxt,
-                                 dto_q);
-               list_del_init(&ctxt->dto_q);
+               ctxt = list_first_entry(&rdma->sc_rq_dto_q,
+                                       struct svc_rdma_op_ctxt, list);
+               list_del(&ctxt->list);
                svc_rdma_put_context(ctxt, 1);
        }
 
index e9295fa3a554c860120f8e22a96f16c9f7e74745..4512e83652b16da259db9327fe0958c3642e70f1 100644 (file)
@@ -1505,19 +1505,21 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b)
 {
        struct sk_buff_head xmitq;
        struct tipc_node *n;
-       struct tipc_msg *hdr = buf_msg(skb);
-       int usr = msg_user(hdr);
+       struct tipc_msg *hdr;
        int bearer_id = b->identity;
        struct tipc_link_entry *le;
-       u16 bc_ack = msg_bcast_ack(hdr);
        u32 self = tipc_own_addr(net);
-       int rc = 0;
+       int usr, rc = 0;
+       u16 bc_ack;
 
        __skb_queue_head_init(&xmitq);
 
-       /* Ensure message is well-formed */
+       /* Ensure message is well-formed before touching the header */
        if (unlikely(!tipc_msg_validate(skb)))
                goto discard;
+       hdr = buf_msg(skb);
+       usr = msg_user(hdr);
+       bc_ack = msg_bcast_ack(hdr);
 
        /* Handle arrival of discovery or broadcast packet */
        if (unlikely(msg_non_seq(hdr))) {
index 5f3e87866438f5320aaa1030329ff5ab78a8ba13..0806dccdf5078451e0dd9c5b5573d040ab21c831 100644 (file)
@@ -2836,14 +2836,8 @@ static unsigned int xfrm_mtu(const struct dst_entry *dst)
        return mtu ? : dst_mtu(dst->path);
 }
 
-static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst,
-                                          struct sk_buff *skb,
-                                          const void *daddr)
-{
-       return dst->path->ops->neigh_lookup(dst, skb, daddr);
-}
-
-static void xfrm_confirm_neigh(const struct dst_entry *dst, const void *daddr)
+static const void *xfrm_get_dst_nexthop(const struct dst_entry *dst,
+                                       const void *daddr)
 {
        const struct dst_entry *path = dst->path;
 
@@ -2857,6 +2851,25 @@ static void xfrm_confirm_neigh(const struct dst_entry *dst, const void *daddr)
                else if (!(xfrm->type->flags & XFRM_TYPE_LOCAL_COADDR))
                        daddr = &xfrm->id.daddr;
        }
+       return daddr;
+}
+
+static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst,
+                                          struct sk_buff *skb,
+                                          const void *daddr)
+{
+       const struct dst_entry *path = dst->path;
+
+       if (!skb)
+               daddr = xfrm_get_dst_nexthop(dst, daddr);
+       return path->ops->neigh_lookup(path, skb, daddr);
+}
+
+static void xfrm_confirm_neigh(const struct dst_entry *dst, const void *daddr)
+{
+       const struct dst_entry *path = dst->path;
+
+       daddr = xfrm_get_dst_nexthop(dst, daddr);
        path->ops->confirm_neigh(path, daddr);
 }
 
index aaf7ed329a453a87b91a38f73b6cc25c27764c82..477f00eda59184ce9e844310fc04027706838cae 100644 (file)
@@ -35,8 +35,8 @@ all: $(OUTPUT)fixdep
 
 clean:
        $(call QUIET_CLEAN, fixdep)
-       $(Q)find . -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete
-       $(Q)rm -f fixdep
+       $(Q)find $(if $(OUTPUT),$(OUTPUT),.) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete
+       $(Q)rm -f $(OUTPUT)fixdep
 
 $(OUTPUT)fixdep-in.o: FORCE
        $(Q)$(MAKE) $(build)=fixdep
index ad22e4e7bc593960dcd2eab055b1658227b7bf56..d360f39a445b0ef8edde51a4b2171b437b09d368 100644 (file)
@@ -3,4 +3,7 @@ build := -f $(srctree)/tools/build/Makefile.build dir=. obj
 fixdep:
        $(Q)$(MAKE) -C $(srctree)/tools/build CFLAGS= LDFLAGS= $(OUTPUT)fixdep
 
+fixdep-clean:
+       $(Q)$(MAKE) -C $(srctree)/tools/build clean
+
 .PHONY: fixdep
index f2ea78021450a53390b6754a78d25851317a1d2a..7ce724fc054478c923605385fd00d8471de831e6 100644 (file)
@@ -5225,13 +5225,13 @@ int pevent_data_pid(struct pevent *pevent, struct pevent_record *rec)
 }
 
 /**
- * pevent_data_prempt_count - parse the preempt count from the record
+ * pevent_data_preempt_count - parse the preempt count from the record
  * @pevent: a handle to the pevent
  * @rec: the record to parse
  *
  * This returns the preempt count from a record.
  */
-int pevent_data_prempt_count(struct pevent *pevent, struct pevent_record *rec)
+int pevent_data_preempt_count(struct pevent *pevent, struct pevent_record *rec)
 {
        return parse_common_pc(pevent, rec->data);
 }
index 74cecba87daaa654504a7c66154b70490c23e8bd..66342804161c80ea611b3dfa554a602fadc4213e 100644 (file)
@@ -710,7 +710,7 @@ void pevent_data_lat_fmt(struct pevent *pevent,
 int pevent_data_type(struct pevent *pevent, struct pevent_record *rec);
 struct event_format *pevent_data_event_from_type(struct pevent *pevent, int type);
 int pevent_data_pid(struct pevent *pevent, struct pevent_record *rec);
-int pevent_data_prempt_count(struct pevent *pevent, struct pevent_record *rec);
+int pevent_data_preempt_count(struct pevent *pevent, struct pevent_record *rec);
 int pevent_data_flags(struct pevent *pevent, struct pevent_record *rec);
 const char *pevent_data_comm_from_pid(struct pevent *pevent, int pid);
 struct cmdline;
index f7350fcedc70dc6c0ef04159d9fa6e33cc76bcdc..a59e061c0b4a0abcf3e9fc7a76e462fd2c41954b 100644 (file)
@@ -31,9 +31,8 @@
 #define INSN_CALL_DYNAMIC      8
 #define INSN_RETURN            9
 #define INSN_CONTEXT_SWITCH    10
-#define INSN_BUG               11
-#define INSN_NOP               12
-#define INSN_OTHER             13
+#define INSN_NOP               11
+#define INSN_OTHER             12
 #define INSN_LAST              INSN_OTHER
 
 int arch_decode_instruction(struct elf *elf, struct section *sec,
index 039636ffb6c8a3edb6c14fd9a2b3a854ab84f982..6ac99e3266eb8218aa485081501e9e0da17c4c77 100644 (file)
@@ -118,9 +118,6 @@ int arch_decode_instruction(struct elf *elf, struct section *sec,
                         op2 == 0x35)
                        /* sysenter, sysret */
                        *type = INSN_CONTEXT_SWITCH;
-               else if (op2 == 0x0b || op2 == 0xb9)
-                       /* ud2 */
-                       *type = INSN_BUG;
                else if (op2 == 0x0d || op2 == 0x1f)
                        /* nopl/nopw */
                        *type = INSN_NOP;
index e8a1f699058a29ba695bfbf24781562c665e4525..5fc52ee3264c1ad9b191c454c404b5df21486beb 100644 (file)
@@ -51,7 +51,7 @@ struct instruction {
        unsigned int len, state;
        unsigned char type;
        unsigned long immediate;
-       bool alt_group, visited;
+       bool alt_group, visited, dead_end;
        struct symbol *call_dest;
        struct instruction *jump_dest;
        struct list_head alts;
@@ -329,6 +329,54 @@ static int decode_instructions(struct objtool_file *file)
        return 0;
 }
 
+/*
+ * Find all uses of the unreachable() macro, which are code path dead ends.
+ */
+static int add_dead_ends(struct objtool_file *file)
+{
+       struct section *sec;
+       struct rela *rela;
+       struct instruction *insn;
+       bool found;
+
+       sec = find_section_by_name(file->elf, ".rela__unreachable");
+       if (!sec)
+               return 0;
+
+       list_for_each_entry(rela, &sec->rela_list, list) {
+               if (rela->sym->type != STT_SECTION) {
+                       WARN("unexpected relocation symbol type in .rela__unreachable");
+                       return -1;
+               }
+               insn = find_insn(file, rela->sym->sec, rela->addend);
+               if (insn)
+                       insn = list_prev_entry(insn, list);
+               else if (rela->addend == rela->sym->sec->len) {
+                       found = false;
+                       list_for_each_entry_reverse(insn, &file->insn_list, list) {
+                               if (insn->sec == rela->sym->sec) {
+                                       found = true;
+                                       break;
+                               }
+                       }
+
+                       if (!found) {
+                               WARN("can't find unreachable insn at %s+0x%x",
+                                    rela->sym->sec->name, rela->addend);
+                               return -1;
+                       }
+               } else {
+                       WARN("can't find unreachable insn at %s+0x%x",
+                            rela->sym->sec->name, rela->addend);
+                       return -1;
+               }
+
+               insn->dead_end = true;
+       }
+
+       return 0;
+}
+
 /*
  * Warnings shouldn't be reported for ignored functions.
  */
@@ -843,6 +891,10 @@ static int decode_sections(struct objtool_file *file)
        if (ret)
                return ret;
 
+       ret = add_dead_ends(file);
+       if (ret)
+               return ret;
+
        add_ignores(file);
 
        ret = add_jump_destinations(file);
@@ -1037,13 +1089,13 @@ static int validate_branch(struct objtool_file *file,
 
                        return 0;
 
-               case INSN_BUG:
-                       return 0;
-
                default:
                        break;
                }
 
+               if (insn->dead_end)
+                       return 0;
+
                insn = next_insn_same_sec(file, insn);
                if (!insn) {
                        WARN("%s: unexpected end of section", sec->name);
index 8ffbd272952d2e57d21451408579c684a551aa65..a89273d8e74417b8d7feb4b96e7fab6a52513339 100644 (file)
@@ -39,6 +39,10 @@ OPTIONS
 --verbose::
         Be more verbose. (Show symbol address, etc)
 
+-q::
+--quiet::
+       Do not show any message.  (Suppress -v)
+
 -D::
 --dump-raw-trace::
         Dump raw trace in ASCII.
index 66dbe3dee74bcaae5b523bf3f80741b8608c87e2..a79c84ae61aaf616b931385c31969911d1d530bd 100644 (file)
@@ -73,6 +73,10 @@ OPTIONS
        Be verbose, for instance, show the raw counts in addition to the
        diff.
 
+-q::
+--quiet::
+       Do not show any message.  (Suppress -v)
+
 -f::
 --force::
         Don't do ownership validation.
index 27256bc68eda0268fc79d9d9eca06bb349a8212e..b16003ec14a743bcdcc8473798388b0849aa3523 100644 (file)
@@ -157,7 +157,7 @@ OPTIONS
 
 -a::
 --all-cpus::
-        System-wide collection from all CPUs.
+        System-wide collection from all CPUs (default if no target is specified).
 
 -p::
 --pid=::
index f2914f03ae7bb83f11a49ab194b224566e676967..c04cc0647c16e6d8bbb458d655ab8ddbec3262c9 100644 (file)
@@ -25,6 +25,10 @@ OPTIONS
 --verbose::
         Be more verbose. (show symbol address, etc)
 
+-q::
+--quiet::
+       Do not show any message.  (Suppress -v)
+
 -n::
 --show-nr-samples::
        Show the number of samples for each symbol
index d96ccd4844df9a49f33b05c6c0384b5f8e6eef05..aecf2a87e7d60bf4a759f47d7c60aac1b12aec07 100644 (file)
@@ -63,7 +63,7 @@ report::
 
 -a::
 --all-cpus::
-        system-wide collection from all CPUs
+        system-wide collection from all CPUs (default if no target is specified)
 
 -c::
 --scale::
index 2b941efadb04e34779f00c4b6348262da0fb3dd6..27c9fbca7bd9c79eb703ad2d37d4280f9d286cc4 100644 (file)
@@ -175,6 +175,10 @@ PYTHON_CONFIG_SQ := $(call shell-sq,$(PYTHON_CONFIG))
 PYTHON_EMBED_LDOPTS := $(shell $(PYTHON_CONFIG_SQ) --ldflags 2>/dev/null)
 PYTHON_EMBED_CCOPTS := $(shell $(PYTHON_CONFIG_SQ) --cflags 2>/dev/null)
 
+ifeq ($(CC), clang)
+  PYTHON_EMBED_CCOPTS := $(filter-out -specs=%,$(PYTHON_EMBED_CCOPTS))
+endif
+
 FEATURE_CHECK_CFLAGS-libpython := $(PYTHON_EMBED_CCOPTS)
 FEATURE_CHECK_LDFLAGS-libpython := $(PYTHON_EMBED_LDOPTS)
 FEATURE_CHECK_CFLAGS-libpython-version := $(PYTHON_EMBED_CCOPTS)
@@ -601,6 +605,9 @@ else
       PYTHON_EMBED_LDFLAGS := $(call strip-libs,$(PYTHON_EMBED_LDOPTS))
       PYTHON_EMBED_LIBADD := $(call grep-libs,$(PYTHON_EMBED_LDOPTS)) -lutil
       PYTHON_EMBED_CCOPTS := $(shell $(PYTHON_CONFIG_SQ) --cflags 2>/dev/null)
+      ifeq ($(CC), clang)
+        PYTHON_EMBED_CCOPTS := $(filter-out -specs=%,$(PYTHON_EMBED_CCOPTS))
+      endif
       FLAGS_PYTHON_EMBED := $(PYTHON_EMBED_CCOPTS) $(PYTHON_EMBED_LDOPTS)
 
       ifneq ($(feature-libpython), 1)
index 4da19b6ba94acd03c9764733f84d089def765dbf..79fe31f20a17644e416642bb9d3a213c97286479 100644 (file)
@@ -726,13 +726,13 @@ config-clean:
        $(call QUIET_CLEAN, config)
        $(Q)$(MAKE) -C $(srctree)/tools/build/feature/ $(if $(OUTPUT),OUTPUT=$(OUTPUT)feature/,) clean >/dev/null
 
-clean:: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean config-clean
+clean:: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean config-clean fixdep-clean
        $(call QUIET_CLEAN, core-objs)  $(RM) $(LIB_FILE) $(OUTPUT)perf-archive $(OUTPUT)perf-with-kcore $(LANG_BINDINGS)
        $(Q)find $(if $(OUTPUT),$(OUTPUT),.) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete
        $(Q)$(RM) $(OUTPUT).config-detected
        $(call QUIET_CLEAN, core-progs) $(RM) $(ALL_PROGRAMS) perf perf-read-vdso32 perf-read-vdsox32 $(OUTPUT)pmu-events/jevents $(OUTPUT)$(LIBJVMTI).so
        $(call QUIET_CLEAN, core-gen)   $(RM)  *.spec *.pyc *.pyo */*.pyc */*.pyo $(OUTPUT)common-cmds.h TAGS tags cscope* $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)FEATURE-DUMP $(OUTPUT)util/*-bison* $(OUTPUT)util/*-flex* \
-               $(OUTPUT)util/intel-pt-decoder/inat-tables.c $(OUTPUT)fixdep \
+               $(OUTPUT)util/intel-pt-decoder/inat-tables.c \
                $(OUTPUT)tests/llvm-src-{base,kbuild,prologue,relocation}.c \
                $(OUTPUT)pmu-events/pmu-events.c
        $(QUIET_SUBDIR0)Documentation $(QUIET_SUBDIR1) clean
index ebb628332a6e59a938347eca53c3da81c859aee5..4f52d85f5ebc574daa91f29b1a3d24758c3d276d 100644 (file)
@@ -410,6 +410,7 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __maybe_unused)
        OPT_BOOLEAN('f', "force", &file.force, "don't complain, do it"),
        OPT_INCR('v', "verbose", &verbose,
                    "be more verbose (show symbol address, etc)"),
+       OPT_BOOLEAN('q', "quiet", &quiet, "do now show any message"),
        OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
                    "dump raw trace in ASCII"),
        OPT_BOOLEAN(0, "gtk", &annotate.use_gtk, "Use the GTK interface"),
@@ -463,6 +464,9 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __maybe_unused)
                annotate.sym_hist_filter = argv[0];
        }
 
+       if (quiet)
+               perf_quiet_option();
+
        file.path  = input_name;
 
        annotate.session = perf_session__new(&file, false, &annotate.tool);
index 70a2893475912e3eee09f5b8787136329dec2116..1b96a3122228f913af671d315ad0a71cedb6c11d 100644 (file)
@@ -691,7 +691,7 @@ static void hists__process(struct hists *hists)
        hists__precompute(hists);
        hists__output_resort(hists, NULL);
 
-       hists__fprintf(hists, true, 0, 0, 0, stdout,
+       hists__fprintf(hists, !quiet, 0, 0, 0, stdout,
                       symbol_conf.use_callchain);
 }
 
@@ -739,12 +739,14 @@ static void data_process(void)
                                hists__link(hists_base, hists);
                }
 
-               fprintf(stdout, "%s# Event '%s'\n#\n", first ? "" : "\n",
-                       perf_evsel__name(evsel_base));
+               if (!quiet) {
+                       fprintf(stdout, "%s# Event '%s'\n#\n", first ? "" : "\n",
+                               perf_evsel__name(evsel_base));
+               }
 
                first = false;
 
-               if (verbose || data__files_cnt > 2)
+               if (verbose > 0 || ((data__files_cnt > 2) && !quiet))
                        data__fprintf();
 
                /* Don't sort callchain for perf diff */
@@ -807,6 +809,7 @@ static const char * const diff_usage[] = {
 static const struct option options[] = {
        OPT_INCR('v', "verbose", &verbose,
                    "be more verbose (show symbol address, etc)"),
+       OPT_BOOLEAN('q', "quiet", &quiet, "Do not show any message"),
        OPT_BOOLEAN('b', "baseline-only", &show_baseline_only,
                    "Show only items with match in baseline"),
        OPT_CALLBACK('c', "compute", &compute,
@@ -1328,6 +1331,9 @@ int cmd_diff(int argc, const char **argv, const char *prefix __maybe_unused)
 
        argc = parse_options(argc, argv, options, diff_usage, 0);
 
+       if (quiet)
+               perf_quiet_option();
+
        if (symbol__init(NULL) < 0)
                return -1;
 
index cd7bc4d104e27e878e1ed694bf6be1d9b89d0a9d..6114e07ca6131ca94ed1a6107fb69ae3ccbab145 100644 (file)
@@ -42,8 +42,8 @@ static int parse_record_events(const struct option *opt,
 
                fprintf(stderr, "%-13s%-*s%s\n",
                        e->tag,
-                       verbose ? 25 : 0,
-                       verbose ? perf_mem_events__name(j) : "",
+                       verbose > 0 ? 25 : 0,
+                       verbose > 0 ? perf_mem_events__name(j) : "",
                        e->supported ? ": available" : "");
        }
        exit(0);
index 6cd6776052e7a940f78c78d43e623709256a0711..bc84a375295d7cb4920df6a4be1f91403bcdc04b 100644 (file)
@@ -432,7 +432,7 @@ static int record__open(struct record *rec)
 try_again:
                if (perf_evsel__open(pos, pos->cpus, pos->threads) < 0) {
                        if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
-                               if (verbose)
+                               if (verbose > 0)
                                        ui__warning("%s\n", msg);
                                goto try_again;
                        }
@@ -1677,8 +1677,12 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
 
        argc = parse_options(argc, argv, record_options, record_usage,
                            PARSE_OPT_STOP_AT_NON_OPTION);
+       if (quiet)
+               perf_quiet_option();
+
+       /* Make system wide (-a) the default target. */
        if (!argc && target__none(&rec->opts.target))
-               usage_with_options(record_usage, record_options);
+               rec->opts.target.system_wide = true;
 
        if (nr_cgroups && !rec->opts.target.system_wide) {
                usage_with_options_msg(record_usage, record_options,
index dbd7fa0288616e3c29003d9de62e8a62bf068051..0a88670e56f35f6d8c5397e80264001acaf35e94 100644 (file)
@@ -320,6 +320,9 @@ static size_t hists__fprintf_nr_sample_events(struct hists *hists, struct report
        size_t size = sizeof(buf);
        int socked_id = hists->socket_filter;
 
+       if (quiet)
+               return 0;
+
        if (symbol_conf.filter_relative) {
                nr_samples = hists->stats.nr_non_filtered_samples;
                nr_events = hists->stats.total_non_filtered_period;
@@ -372,7 +375,11 @@ static int perf_evlist__tty_browse_hists(struct perf_evlist *evlist,
 {
        struct perf_evsel *pos;
 
-       fprintf(stdout, "#\n# Total Lost Samples: %" PRIu64 "\n#\n", evlist->stats.total_lost_samples);
+       if (!quiet) {
+               fprintf(stdout, "#\n# Total Lost Samples: %" PRIu64 "\n#\n",
+                       evlist->stats.total_lost_samples);
+       }
+
        evlist__for_each_entry(evlist, pos) {
                struct hists *hists = evsel__hists(pos);
                const char *evname = perf_evsel__name(pos);
@@ -382,7 +389,7 @@ static int perf_evlist__tty_browse_hists(struct perf_evlist *evlist,
                        continue;
 
                hists__fprintf_nr_sample_events(hists, rep, evname, stdout);
-               hists__fprintf(hists, true, 0, 0, rep->min_percent, stdout,
+               hists__fprintf(hists, !quiet, 0, 0, rep->min_percent, stdout,
                               symbol_conf.use_callchain);
                fprintf(stdout, "\n\n");
        }
@@ -716,6 +723,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
                    "input file name"),
        OPT_INCR('v', "verbose", &verbose,
                    "be more verbose (show symbol address, etc)"),
+       OPT_BOOLEAN('q', "quiet", &quiet, "Do not show any message"),
        OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
                    "dump raw trace in ASCII"),
        OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
@@ -863,6 +871,9 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
                report.symbol_filter_str = argv[0];
        }
 
+       if (quiet)
+               perf_quiet_option();
+
        if (symbol_conf.vmlinux_name &&
            access(symbol_conf.vmlinux_name, R_OK)) {
                pr_err("Invalid file: %s\n", symbol_conf.vmlinux_name);
@@ -983,14 +994,14 @@ repeat:
                goto error;
        }
 
-       if (report.header || report.header_only) {
+       if ((report.header || report.header_only) && !quiet) {
                perf_session__fprintf_info(session, stdout,
                                           report.show_full_info);
                if (report.header_only) {
                        ret = 0;
                        goto error;
                }
-       } else if (use_browser == 0) {
+       } else if (use_browser == 0 && !quiet) {
                fputs("# To display the perf.data header info, please use --header/--header-only options.\n#\n",
                      stdout);
        }
@@ -1009,7 +1020,7 @@ repeat:
                 * providing it only in verbose mode not to bloat too
                 * much struct symbol.
                 */
-               if (verbose) {
+               if (verbose > 0) {
                        /*
                         * XXX: Need to provide a less kludgy way to ask for
                         * more space per symbol, the u32 is for the index on
index 270eb2d8ca6b24bb6b7c74ff0b068417e655d8e7..b94cf0de715ab9a2d6205c12053916c31d276a13 100644 (file)
@@ -460,7 +460,7 @@ static struct task_desc *register_pid(struct perf_sched *sched,
        BUG_ON(!sched->tasks);
        sched->tasks[task->nr] = task;
 
-       if (verbose)
+       if (verbose > 0)
                printf("registered task #%ld, PID %ld (%s)\n", sched->nr_tasks, pid, comm);
 
        return task;
@@ -794,7 +794,7 @@ replay_wakeup_event(struct perf_sched *sched,
        const u32 pid    = perf_evsel__intval(evsel, sample, "pid");
        struct task_desc *waker, *wakee;
 
-       if (verbose) {
+       if (verbose > 0) {
                printf("sched_wakeup event %p\n", evsel);
 
                printf(" ... pid %d woke up %s/%d\n", sample->tid, comm, pid);
@@ -822,7 +822,7 @@ static int replay_switch_event(struct perf_sched *sched,
        int cpu = sample->cpu;
        s64 delta;
 
-       if (verbose)
+       if (verbose > 0)
                printf("sched_switch event %p\n", evsel);
 
        if (cpu >= MAX_CPUS || cpu < 0)
@@ -870,7 +870,7 @@ static int replay_fork_event(struct perf_sched *sched,
                goto out_put;
        }
 
-       if (verbose) {
+       if (verbose > 0) {
                printf("fork event\n");
                printf("... parent: %s/%d\n", thread__comm_str(parent), parent->tid);
                printf("...  child: %s/%d\n", thread__comm_str(child), child->tid);
@@ -1573,7 +1573,7 @@ static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel,
 
        timestamp__scnprintf_usec(timestamp, stimestamp, sizeof(stimestamp));
        color_fprintf(stdout, color, "  %12s secs ", stimestamp);
-       if (new_shortname || (verbose && sched_in->tid)) {
+       if (new_shortname || (verbose > 0 && sched_in->tid)) {
                const char *pid_color = color;
 
                if (thread__has_color(sched_in))
@@ -2050,7 +2050,7 @@ static void save_task_callchain(struct perf_sched *sched,
 
        if (thread__resolve_callchain(thread, cursor, evsel, sample,
                                      NULL, NULL, sched->max_stack + 2) != 0) {
-               if (verbose)
+               if (verbose > 0)
                        error("Failed to resolve callchain. Skipping\n");
 
                return;
index f28719178b519b92be214b2fe0405205e6776652..13b54999ad79ecd4f765d557bf57764486f1fac8 100644 (file)
@@ -573,7 +573,7 @@ try_again:
                        if (errno == EINVAL || errno == ENOSYS ||
                            errno == ENOENT || errno == EOPNOTSUPP ||
                            errno == ENXIO) {
-                               if (verbose)
+                               if (verbose > 0)
                                        ui__warning("%s event is not supported by the kernel.\n",
                                                    perf_evsel__name(counter));
                                counter->supported = false;
@@ -582,7 +582,7 @@ try_again:
                                    !(counter->leader->nr_members > 1))
                                        continue;
                        } else if (perf_evsel__fallback(counter, errno, msg, sizeof(msg))) {
-                                if (verbose)
+                                if (verbose > 0)
                                         ui__warning("%s\n", msg);
                                 goto try_again;
                         }
@@ -1765,7 +1765,7 @@ static inline int perf_env__get_cpu(struct perf_env *env, struct cpu_map *map, i
 
        cpu = map->map[idx];
 
-       if (cpu >= env->nr_cpus_online)
+       if (cpu >= env->nr_cpus_avail)
                return -1;
 
        return cpu;
@@ -2445,8 +2445,9 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
        } else if (big_num_opt == 0) /* User passed --no-big-num */
                big_num = false;
 
+       /* Make system wide (-a) the default target. */
        if (!argc && target__none(&target))
-               usage_with_options(stat_usage, stat_options);
+               target.system_wide = true;
 
        if (run_count < 0) {
                pr_err("Run count must be a positive number\n");
@@ -2538,7 +2539,7 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
 
        status = 0;
        for (run_idx = 0; forever || run_idx < run_count; run_idx++) {
-               if (run_count != 1 && verbose)
+               if (run_count != 1 && verbose > 0)
                        fprintf(output, "[ perf stat: executing run #%d ... ]\n",
                                run_idx + 1);
 
index 5a7fd7af3a6de39d4a0d5c5ed5a758c92aeb88f3..ab9077915763f19a71b1b36a8b50135e6a0ffbb3 100644 (file)
@@ -871,7 +871,7 @@ try_again:
                if (perf_evsel__open(counter, top->evlist->cpus,
                                     top->evlist->threads) < 0) {
                        if (perf_evsel__fallback(counter, errno, msg, sizeof(msg))) {
-                               if (verbose)
+                               if (verbose > 0)
                                        ui__warning("%s\n", msg);
                                goto try_again;
                        }
index 40ef9b293d1b4ffa0213108ca3ec391621c1267d..256f1fac6f7e0069ebb047cf080fa55999dd0ae1 100644 (file)
@@ -1399,7 +1399,7 @@ static struct syscall *trace__syscall_info(struct trace *trace,
        return &trace->syscalls.table[id];
 
 out_cant_read:
-       if (verbose) {
+       if (verbose > 0) {
                fprintf(trace->output, "Problems reading syscall %d", id);
                if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
                        fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
@@ -1801,10 +1801,10 @@ static void print_location(FILE *f, struct perf_sample *sample,
                           bool print_dso, bool print_sym)
 {
 
-       if ((verbose || print_dso) && al->map)
+       if ((verbose > 0 || print_dso) && al->map)
                fprintf(f, "%s@", al->map->dso->long_name);
 
-       if ((verbose || print_sym) && al->sym)
+       if ((verbose > 0 || print_sym) && al->sym)
                fprintf(f, "%s+0x%" PRIx64, al->sym->name,
                        al->addr - al->sym->start);
        else if (al->map)
index f67bbb0aa36e572c46b5847736cc3d3efa188f8f..0544398d6e2dd599e92a3771a72a52138a72d2f7 100644 (file)
@@ -49,7 +49,7 @@ static char *mapfile(const char *fn, size_t *size)
        int err;
        int fd = open(fn, O_RDONLY);
 
-       if (fd < 0 && verbose && fn) {
+       if (fd < 0 && verbose > 0 && fn) {
                pr_err("Error opening events file '%s': %s\n", fn,
                                strerror(errno));
        }
index 28d1605b033896aa4b87844eeffa6d9dc1646e6b..88dc51f4c27b2df8c5d8dc482a189b9831cb2832 100644 (file)
@@ -144,7 +144,7 @@ static int run_dir(const char *d, const char *perf)
        int vcnt = min(verbose, (int) sizeof(v) - 1);
        char cmd[3*PATH_MAX];
 
-       if (verbose)
+       if (verbose > 0)
                vcnt++;
 
        snprintf(cmd, 3*PATH_MAX, PYTHON " %s/attr.py -d %s/attr/ -p %s %.*s",
index 37e326bfd2dc3a273032eeac68de3c8d4383104f..83c4669cbc5b9e30576321026dd38d68aa117fde 100644 (file)
@@ -299,7 +299,7 @@ static int run_test(struct test *test, int subtest)
                if (!dont_fork) {
                        pr_debug("test child forked, pid %d\n", getpid());
 
-                       if (!verbose) {
+                       if (verbose <= 0) {
                                int nullfd = open("/dev/null", O_WRONLY);
 
                                if (nullfd >= 0) {
index ff5bc6363a79de05084aca11ec856f468ce9903d..d1f693041324a8a6670ff56081537c3ed3f528f0 100644 (file)
@@ -599,7 +599,7 @@ static int do_test_code_reading(bool try_kcore)
                                continue;
                        }
 
-                       if (verbose) {
+                       if (verbose > 0) {
                                char errbuf[512];
                                perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
                                pr_debug("perf_evlist__open() failed!\n%s\n", errbuf);
index a2b5ff9bf83d615b67d97a00ec38e04e3f8f6291..bc5982f42dc3a173d81e5100172578ac4bdb280b 100644 (file)
@@ -19,7 +19,7 @@ static int fdarray__fprintf_prefix(struct fdarray *fda, const char *prefix, FILE
 {
        int printed = 0;
 
-       if (!verbose)
+       if (verbose <= 0)
                return 0;
 
        printed += fprintf(fp, "\n%s: ", prefix);
index d357dab72e68862e90d916092755a12f535ec1a4..482b5365e68d85b3ea4915631f398873e575d72b 100644 (file)
@@ -76,7 +76,7 @@ test_llvm__fetch_bpf_obj(void **p_obj_buf,
         * Skip this test if user's .perfconfig doesn't set [llvm] section
         * and clang is not found in $PATH, and this is not perf test -v
         */
-       if (!force && (verbose == 0 &&
+       if (!force && (verbose <= 0 &&
                       !llvm_param.user_set_param &&
                       llvm__search_clang())) {
                pr_debug("No clang and no verbosive, skip this test\n");
index aa9276bfe3e9b7b6ed1def3889142c645c1e8e30..1dc8380144220bd2c37f5c6338ad9a38e1c641bf 100644 (file)
@@ -1808,7 +1808,7 @@ static void debug_warn(const char *warn, va_list params)
 {
        char msg[1024];
 
-       if (!verbose)
+       if (verbose <= 0)
                return;
 
        vsnprintf(msg, sizeof(msg), warn, params);
index 541da7a68f91fc4631c0c20f313a9b58f720771b..87893f3ba5f1766cffa093ce44db50de45526279 100644 (file)
@@ -172,13 +172,13 @@ int test__PERF_RECORD(int subtest __maybe_unused)
 
                                err = perf_evlist__parse_sample(evlist, event, &sample);
                                if (err < 0) {
-                                       if (verbose)
+                                       if (verbose > 0)
                                                perf_event__fprintf(event, stderr);
                                        pr_debug("Couldn't parse sample\n");
                                        goto out_delete_evlist;
                                }
 
-                               if (verbose) {
+                               if (verbose > 0) {
                                        pr_info("%" PRIu64" %d ", sample.time, sample.cpu);
                                        perf_event__fprintf(event, stderr);
                                }
index 7a52834ee0d0e848802b104719ee7726e3b97aae..fa79509da535403888c41cfa33a3ca3ff89033fd 100644 (file)
@@ -15,7 +15,7 @@ int test__python_use(int subtest __maybe_unused)
        int ret;
 
        if (asprintf(&cmd, "echo \"import sys ; sys.path.append('%s'); import perf\" | %s %s",
-                    PYTHONPATH, PYTHON, verbose ? "" : "2> /dev/null") < 0)
+                    PYTHONPATH, PYTHON, verbose > 0 ? "" : "2> /dev/null") < 0)
                return -1;
 
        ret = system(cmd) ? -1 : 0;
index a4a4b4625ac3d8864531b781c747945b53f1f849..f2d2e542d0ee77a8abf7bfc6074f0934da85b5c4 100644 (file)
@@ -109,7 +109,7 @@ int test__thread_map_remove(int subtest __maybe_unused)
        TEST_ASSERT_VAL("failed to allocate thread_map",
                        threads);
 
-       if (verbose)
+       if (verbose > 0)
                thread_map__fprintf(threads, stderr);
 
        TEST_ASSERT_VAL("failed to remove thread",
@@ -117,7 +117,7 @@ int test__thread_map_remove(int subtest __maybe_unused)
 
        TEST_ASSERT_VAL("thread_map count != 1", threads->nr == 1);
 
-       if (verbose)
+       if (verbose > 0)
                thread_map__fprintf(threads, stderr);
 
        TEST_ASSERT_VAL("failed to remove thread",
@@ -125,7 +125,7 @@ int test__thread_map_remove(int subtest __maybe_unused)
 
        TEST_ASSERT_VAL("thread_map count != 0", threads->nr == 0);
 
-       if (verbose)
+       if (verbose > 0)
                thread_map__fprintf(threads, stderr);
 
        TEST_ASSERT_VAL("failed to not remove thread",
index 98fe69ac553c8462f4fe1dcbf610ff67e3c54948..803f893550d64c03c6354acf88045821cc7fe171 100644 (file)
@@ -65,7 +65,9 @@ static int check_cpu_topology(char *path, struct cpu_map *map)
        session = perf_session__new(&file, false, NULL);
        TEST_ASSERT_VAL("can't get session", session);
 
-       for (i = 0; i < session->header.env.nr_cpus_online; i++) {
+       for (i = 0; i < session->header.env.nr_cpus_avail; i++) {
+               if (!cpu_map__has(map, i))
+                       continue;
                pr_debug("CPU %d, core %d, socket %d\n", i,
                         session->header.env.cpu[i].core_id,
                         session->header.env.cpu[i].socket_id);
index a5082331f2464929ed4ff6d3cae9869c41b1772e..862b043e59243588671c75298b337d8cd4e29c3b 100644 (file)
@@ -168,7 +168,7 @@ next_pair:
                err = -1;
        }
 
-       if (!verbose)
+       if (verbose <= 0)
                goto out;
 
        header_printed = false;
index 98a34664bb7eb16667ee7169129e7e5b5b8022db..9ce142de536d0dcbb4017d2d80a40a9a668775c4 100644 (file)
@@ -73,7 +73,7 @@ static int map_browser__run(struct map_browser *browser)
 
        if (ui_browser__show(&browser->b, browser->map->dso->long_name,
                             "Press ESC to exit, %s / to search",
-                            verbose ? "" : "restart with -v to use") < 0)
+                            verbose > 0 ? "" : "restart with -v to use") < 0)
                return -1;
 
        while (1) {
@@ -81,7 +81,7 @@ static int map_browser__run(struct map_browser *browser)
 
                switch (key) {
                case '/':
-                       if (verbose)
+                       if (verbose > 0)
                                map_browser__search(browser);
                default:
                        break;
@@ -117,7 +117,7 @@ int map__browse(struct map *map)
 
                if (maxaddr < pos->end)
                        maxaddr = pos->end;
-               if (verbose) {
+               if (verbose > 0) {
                        u32 *idx = symbol__browser_index(pos);
                        *idx = mb.b.nr_entries;
                }
index 18cfcdc90356f89f75b328870cc5fcc621b2005f..5d632dca672aef2851778fb50be672e39c99bf50 100644 (file)
@@ -648,7 +648,7 @@ unsigned int hists__sort_list_width(struct hists *hists)
                ret += fmt->width(fmt, &dummy_hpp, hists);
        }
 
-       if (verbose && hists__has(hists, sym)) /* Addr + origin */
+       if (verbose > 0 && hists__has(hists, sym)) /* Addr + origin */
                ret += 3 + BITS_PER_LONG / 4;
 
        return ret;
index 06cc04e5806a2692fffabbc2c038b82380dfcafd..273f21fa32b55999ab1271e6d3cc316c3a257a41 100644 (file)
@@ -1768,7 +1768,7 @@ int symbol__annotate_printf(struct symbol *sym, struct map *map,
        printf("%-*.*s----\n",
               graph_dotted_len, graph_dotted_len, graph_dotted_line);
 
-       if (verbose)
+       if (verbose > 0)
                symbol__annotate_hits(sym, evsel);
 
        list_for_each_entry(pos, &notes->src->source, node) {
index 2c0b52264a468103e9bbd32b8631d76fae585ff4..8c750493911369976d0fa171e74f3b678f62c054 100644 (file)
@@ -9,6 +9,7 @@
 #include "asm/bug.h"
 
 static int max_cpu_num;
+static int max_present_cpu_num;
 static int max_node_num;
 static int *cpunode_map;
 
@@ -442,6 +443,7 @@ static void set_max_cpu_num(void)
 
        /* set up default */
        max_cpu_num = 4096;
+       max_present_cpu_num = 4096;
 
        mnt = sysfs__mountpoint();
        if (!mnt)
@@ -455,6 +457,17 @@ static void set_max_cpu_num(void)
        }
 
        ret = get_max_num(path, &max_cpu_num);
+       if (ret)
+               goto out;
+
+       /* get the highest present cpu number for a sparse allocation */
+       ret = snprintf(path, PATH_MAX, "%s/devices/system/cpu/present", mnt);
+       if (ret == PATH_MAX) {
+               pr_err("sysfs path crossed PATH_MAX(%d) size\n", PATH_MAX);
+               goto out;
+       }
+
+       ret = get_max_num(path, &max_present_cpu_num);
 
 out:
        if (ret)
@@ -505,6 +518,15 @@ int cpu__max_cpu(void)
        return max_cpu_num;
 }
 
+int cpu__max_present_cpu(void)
+{
+       if (unlikely(!max_present_cpu_num))
+               set_max_cpu_num();
+
+       return max_present_cpu_num;
+}
+
+
 int cpu__get_node(int cpu)
 {
        if (unlikely(cpunode_map == NULL)) {
index 06bd689f598972fa4e57add5487ad62ac685a22f..1a0549af8f5c944b4fc2b8b619164a107dcf613d 100644 (file)
@@ -62,6 +62,7 @@ int cpu__setup_cpunode_map(void);
 
 int cpu__max_node(void);
 int cpu__max_cpu(void);
+int cpu__max_present_cpu(void);
 int cpu__get_node(int cpu);
 
 int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res,
index c1838b643108bda4d6fc536ae232f20e643a9ce2..03eb81f30d0d0d471fdff09e8779ed02fd73eb36 100644 (file)
@@ -203,11 +203,28 @@ int perf_debug_option(const char *str)
                v = (v < 0) || (v > 10) ? 0 : v;
        }
 
+       if (quiet)
+               v = -1;
+
        *var->ptr = v;
        free(s);
        return 0;
 }
 
+int perf_quiet_option(void)
+{
+       struct debug_variable *var = &debug_variables[0];
+
+       /* disable all debug messages */
+       while (var->name) {
+               *var->ptr = -1;
+               var++;
+       }
+
+       quiet = true;
+       return 0;
+}
+
 #define DEBUG_WRAPPER(__n, __l)                                \
 static int pr_ ## __n ## _wrapper(const char *fmt, ...)        \
 {                                                      \
index d242adc3d5a2ac7217879a37d0f5ed26b344351a..98832f5531d3d0931b8a21560e56330bf1db5cdc 100644 (file)
@@ -54,5 +54,6 @@ int veprintf(int level, int var, const char *fmt, va_list args);
 
 int perf_debug_option(const char *str);
 void perf_debug_setup(void);
+int perf_quiet_option(void);
 
 #endif /* __PERF_DEBUG_H */
index 3abe3373ce90f1e8a8089d253fdc6226bcb182d0..d38b62a700ca126c293756baa83dbc27df61e53a 100644 (file)
@@ -1058,7 +1058,7 @@ int dso__name_len(const struct dso *dso)
 {
        if (!dso)
                return strlen("[unknown]");
-       if (verbose)
+       if (verbose > 0)
                return dso->long_name_len;
 
        return dso->short_name_len;
index bb964e86b09de18bb1eafe3765e5fccb1e87e448..075fc77286bf05feb5cca14fc3825f34d0fdaed1 100644 (file)
@@ -66,7 +66,7 @@ int perf_env__read_cpu_topology_map(struct perf_env *env)
                return 0;
 
        if (env->nr_cpus_avail == 0)
-               env->nr_cpus_avail = sysconf(_SC_NPROCESSORS_CONF);
+               env->nr_cpus_avail = cpu__max_present_cpu();
 
        nr_cpus = env->nr_cpus_avail;
        if (nr_cpus == -1)
index 3d12c16e51034a8591eba97221d20b37603dd40a..05714d548584b30297892854469c584d280f3a8b 100644 (file)
@@ -295,11 +295,7 @@ static int write_nrcpus(int fd, struct perf_header *h __maybe_unused,
        u32 nrc, nra;
        int ret;
 
-       nr = sysconf(_SC_NPROCESSORS_CONF);
-       if (nr < 0)
-               return -1;
-
-       nrc = (u32)(nr & UINT_MAX);
+       nrc = cpu__max_present_cpu();
 
        nr = sysconf(_SC_NPROCESSORS_ONLN);
        if (nr < 0)
@@ -505,24 +501,29 @@ static void free_cpu_topo(struct cpu_topo *tp)
 
 static struct cpu_topo *build_cpu_topology(void)
 {
-       struct cpu_topo *tp;
+       struct cpu_topo *tp = NULL;
        void *addr;
        u32 nr, i;
        size_t sz;
        long ncpus;
        int ret = -1;
+       struct cpu_map *map;
 
-       ncpus = sysconf(_SC_NPROCESSORS_CONF);
-       if (ncpus < 0)
+       ncpus = cpu__max_present_cpu();
+
+       /* build online CPU map */
+       map = cpu_map__new(NULL);
+       if (map == NULL) {
+               pr_debug("failed to get system cpumap\n");
                return NULL;
+       }
 
        nr = (u32)(ncpus & UINT_MAX);
 
        sz = nr * sizeof(char *);
-
        addr = calloc(1, sizeof(*tp) + 2 * sz);
        if (!addr)
-               return NULL;
+               goto out_free;
 
        tp = addr;
        tp->cpu_nr = nr;
@@ -532,10 +533,16 @@ static struct cpu_topo *build_cpu_topology(void)
        tp->thread_siblings = addr;
 
        for (i = 0; i < nr; i++) {
+               if (!cpu_map__has(map, i))
+                       continue;
+
                ret = build_cpu_topo(tp, i);
                if (ret < 0)
                        break;
        }
+
+out_free:
+       cpu_map__put(map);
        if (ret) {
                free_cpu_topo(tp);
                tp = NULL;
@@ -1126,7 +1133,7 @@ static void print_cpu_topology(struct perf_header *ph, int fd __maybe_unused,
 {
        int nr, i;
        char *str;
-       int cpu_nr = ph->env.nr_cpus_online;
+       int cpu_nr = ph->env.nr_cpus_avail;
 
        nr = ph->env.nr_sibling_cores;
        str = ph->env.sibling_cores;
@@ -1781,7 +1788,7 @@ static int process_cpu_topology(struct perf_file_section *section,
        u32 nr, i;
        char *str;
        struct strbuf sb;
-       int cpu_nr = ph->env.nr_cpus_online;
+       int cpu_nr = ph->env.nr_cpus_avail;
        u64 size = 0;
 
        ph->env.cpu = calloc(cpu_nr, sizeof(*ph->env.cpu));
@@ -1862,7 +1869,7 @@ static int process_cpu_topology(struct perf_file_section *section,
                if (ph->needs_swap)
                        nr = bswap_32(nr);
 
-               if (nr > (u32)cpu_nr) {
+               if (nr != (u32)-1 && nr > (u32)cpu_nr) {
                        pr_debug("socket_id number is too big."
                                 "You may need to upgrade the perf tool.\n");
                        goto free_cpu;
index 32c6a939e4cc6879d872574e27b7dab28970cb2c..eaf72a938fb423ed4ba46982c69324d2341839bc 100644 (file)
@@ -69,7 +69,7 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
         */
        if (h->ms.sym) {
                symlen = h->ms.sym->namelen + 4;
-               if (verbose)
+               if (verbose > 0)
                        symlen += BITS_PER_LONG / 4 + 2 + 3;
                hists__new_col_len(hists, HISTC_SYMBOL, symlen);
        } else {
@@ -93,7 +93,7 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
        if (h->branch_info) {
                if (h->branch_info->from.sym) {
                        symlen = (int)h->branch_info->from.sym->namelen + 4;
-                       if (verbose)
+                       if (verbose > 0)
                                symlen += BITS_PER_LONG / 4 + 2 + 3;
                        hists__new_col_len(hists, HISTC_SYMBOL_FROM, symlen);
 
@@ -107,7 +107,7 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
 
                if (h->branch_info->to.sym) {
                        symlen = (int)h->branch_info->to.sym->namelen + 4;
-                       if (verbose)
+                       if (verbose > 0)
                                symlen += BITS_PER_LONG / 4 + 2 + 3;
                        hists__new_col_len(hists, HISTC_SYMBOL_TO, symlen);
 
index 281e44af31e2fb0b2e32928290f3c4cdfd03cccf..67a8aebc67ab492a9936ad56ae9655526f9d9781 100644 (file)
@@ -2318,24 +2318,20 @@ int parse_events__is_hardcoded_term(struct parse_events_term *term)
        return term->type_term != PARSE_EVENTS__TERM_TYPE_USER;
 }
 
-static int new_term(struct parse_events_term **_term, int type_val,
-                   int type_term, char *config,
-                   char *str, u64 num, int err_term, int err_val)
+static int new_term(struct parse_events_term **_term,
+                   struct parse_events_term *temp,
+                   char *str, u64 num)
 {
        struct parse_events_term *term;
 
-       term = zalloc(sizeof(*term));
+       term = malloc(sizeof(*term));
        if (!term)
                return -ENOMEM;
 
+       *term = *temp;
        INIT_LIST_HEAD(&term->list);
-       term->type_val  = type_val;
-       term->type_term = type_term;
-       term->config = config;
-       term->err_term = err_term;
-       term->err_val  = err_val;
 
-       switch (type_val) {
+       switch (term->type_val) {
        case PARSE_EVENTS__TERM_TYPE_NUM:
                term->val.num = num;
                break;
@@ -2353,15 +2349,22 @@ static int new_term(struct parse_events_term **_term, int type_val,
 
 int parse_events_term__num(struct parse_events_term **term,
                           int type_term, char *config, u64 num,
+                          bool no_value,
                           void *loc_term_, void *loc_val_)
 {
        YYLTYPE *loc_term = loc_term_;
        YYLTYPE *loc_val = loc_val_;
 
-       return new_term(term, PARSE_EVENTS__TERM_TYPE_NUM, type_term,
-                       config, NULL, num,
-                       loc_term ? loc_term->first_column : 0,
-                       loc_val ? loc_val->first_column : 0);
+       struct parse_events_term temp = {
+               .type_val  = PARSE_EVENTS__TERM_TYPE_NUM,
+               .type_term = type_term,
+               .config    = config,
+               .no_value  = no_value,
+               .err_term  = loc_term ? loc_term->first_column : 0,
+               .err_val   = loc_val  ? loc_val->first_column  : 0,
+       };
+
+       return new_term(term, &temp, NULL, num);
 }
 
 int parse_events_term__str(struct parse_events_term **term,
@@ -2371,37 +2374,45 @@ int parse_events_term__str(struct parse_events_term **term,
        YYLTYPE *loc_term = loc_term_;
        YYLTYPE *loc_val = loc_val_;
 
-       return new_term(term, PARSE_EVENTS__TERM_TYPE_STR, type_term,
-                       config, str, 0,
-                       loc_term ? loc_term->first_column : 0,
-                       loc_val ? loc_val->first_column : 0);
+       struct parse_events_term temp = {
+               .type_val  = PARSE_EVENTS__TERM_TYPE_STR,
+               .type_term = type_term,
+               .config    = config,
+               .err_term  = loc_term ? loc_term->first_column : 0,
+               .err_val   = loc_val  ? loc_val->first_column  : 0,
+       };
+
+       return new_term(term, &temp, str, 0);
 }
 
 int parse_events_term__sym_hw(struct parse_events_term **term,
                              char *config, unsigned idx)
 {
        struct event_symbol *sym;
+       struct parse_events_term temp = {
+               .type_val  = PARSE_EVENTS__TERM_TYPE_STR,
+               .type_term = PARSE_EVENTS__TERM_TYPE_USER,
+               .config    = config ?: (char *) "event",
+       };
 
        BUG_ON(idx >= PERF_COUNT_HW_MAX);
        sym = &event_symbols_hw[idx];
 
-       if (config)
-               return new_term(term, PARSE_EVENTS__TERM_TYPE_STR,
-                               PARSE_EVENTS__TERM_TYPE_USER, config,
-                               (char *) sym->symbol, 0, 0, 0);
-       else
-               return new_term(term, PARSE_EVENTS__TERM_TYPE_STR,
-                               PARSE_EVENTS__TERM_TYPE_USER,
-                               (char *) "event", (char *) sym->symbol,
-                               0, 0, 0);
+       return new_term(term, &temp, (char *) sym->symbol, 0);
 }
 
 int parse_events_term__clone(struct parse_events_term **new,
                             struct parse_events_term *term)
 {
-       return new_term(new, term->type_val, term->type_term, term->config,
-                       term->val.str, term->val.num,
-                       term->err_term, term->err_val);
+       struct parse_events_term temp = {
+               .type_val  = term->type_val,
+               .type_term = term->type_term,
+               .config    = term->config,
+               .err_term  = term->err_term,
+               .err_val   = term->err_val,
+       };
+
+       return new_term(new, &temp, term->val.str, term->val.num);
 }
 
 void parse_events_terms__purge(struct list_head *terms)
index da246a3ddb69f7700316dedf0b00beb054046f15..1af6a267c21bfd3a437e3bed1f2122f96c4dced7 100644 (file)
@@ -94,6 +94,7 @@ struct parse_events_term {
        int type_term;
        struct list_head list;
        bool used;
+       bool no_value;
 
        /* error string indexes for within parsed string */
        int err_term;
@@ -122,6 +123,7 @@ void parse_events__shrink_config_terms(void);
 int parse_events__is_hardcoded_term(struct parse_events_term *term);
 int parse_events_term__num(struct parse_events_term **term,
                           int type_term, char *config, u64 num,
+                          bool novalue,
                           void *loc_term, void *loc_val);
 int parse_events_term__str(struct parse_events_term **term,
                           int type_term, char *config, char *str,
index a14b47ab3879bd67db8854d495ba2e195c4f9068..30f018ea137096c22d6798e4bcd7723373db26af 100644 (file)
@@ -252,7 +252,7 @@ PE_KERNEL_PMU_EVENT sep_dc
                        if (!strcasecmp(alias->name, $1)) {
                                ALLOC_LIST(head);
                                ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER,
-                                       $1, 1, &@1, NULL));
+                                       $1, 1, false, &@1, NULL));
                                list_add_tail(&term->list, head);
 
                                if (!parse_events_add_pmu(data, list,
@@ -282,7 +282,7 @@ PE_PMU_EVENT_PRE '-' PE_PMU_EVENT_SUF sep_dc
 
        ALLOC_LIST(head);
        ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER,
-                                       &pmu_name, 1, &@1, NULL));
+                                       &pmu_name, 1, false, &@1, NULL));
        list_add_tail(&term->list, head);
 
        ALLOC_LIST(list);
@@ -548,7 +548,7 @@ PE_NAME '=' PE_VALUE
        struct parse_events_term *term;
 
        ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER,
-                                       $1, $3, &@1, &@3));
+                                       $1, $3, false, &@1, &@3));
        $$ = term;
 }
 |
@@ -566,7 +566,7 @@ PE_NAME
        struct parse_events_term *term;
 
        ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER,
-                                       $1, 1, &@1, NULL));
+                                       $1, 1, true, &@1, NULL));
        $$ = term;
 }
 |
@@ -591,7 +591,7 @@ PE_TERM '=' PE_VALUE
 {
        struct parse_events_term *term;
 
-       ABORT_ON(parse_events_term__num(&term, (int)$1, NULL, $3, &@1, &@3));
+       ABORT_ON(parse_events_term__num(&term, (int)$1, NULL, $3, false, &@1, &@3));
        $$ = term;
 }
 |
@@ -599,7 +599,7 @@ PE_TERM
 {
        struct parse_events_term *term;
 
-       ABORT_ON(parse_events_term__num(&term, (int)$1, NULL, 1, &@1, NULL));
+       ABORT_ON(parse_events_term__num(&term, (int)$1, NULL, 1, true, &@1, NULL));
        $$ = term;
 }
 |
@@ -620,7 +620,7 @@ PE_NAME array '=' PE_VALUE
        struct parse_events_term *term;
 
        ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER,
-                                       $1, $4, &@1, &@4));
+                                       $1, $4, false, &@1, &@4));
        term->array = $2;
        $$ = term;
 }
index 49bfee0e3d9ed0b483180a74b7bfd96954f4968b..12f84dd2ac5dfddc9f7e124fa25b5da4bb2be021 100644 (file)
@@ -745,7 +745,7 @@ static int pmu_resolve_param_term(struct parse_events_term *term,
                }
        }
 
-       if (verbose)
+       if (verbose > 0)
                printf("Required parameter '%s' not specified\n", term->config);
 
        return -1;
@@ -803,7 +803,7 @@ static int pmu_config_term(struct list_head *formats,
 
        format = pmu_find_format(formats, term->config);
        if (!format) {
-               if (verbose)
+               if (verbose > 0)
                        printf("Invalid event/parameter '%s'\n", term->config);
                if (err) {
                        char *pmu_term = pmu_formats_string(formats);
@@ -834,11 +834,20 @@ static int pmu_config_term(struct list_head *formats,
         * Either directly use a numeric term, or try to translate string terms
         * using event parameters.
         */
-       if (term->type_val == PARSE_EVENTS__TERM_TYPE_NUM)
+       if (term->type_val == PARSE_EVENTS__TERM_TYPE_NUM) {
+               if (term->no_value &&
+                   bitmap_weight(format->bits, PERF_PMU_FORMAT_BITS) > 1) {
+                       if (err) {
+                               err->idx = term->err_val;
+                               err->str = strdup("no value assigned for term");
+                       }
+                       return -EINVAL;
+               }
+
                val = term->val.num;
-       else if (term->type_val == PARSE_EVENTS__TERM_TYPE_STR) {
+       else if (term->type_val == PARSE_EVENTS__TERM_TYPE_STR) {
                if (strcmp(term->val.str, "?")) {
-                       if (verbose) {
+                       if (verbose > 0) {
                                pr_info("Invalid sysfs entry %s=%s\n",
                                                term->config, term->val.str);
                        }
@@ -1223,7 +1232,7 @@ void print_pmu_events(const char *event_glob, bool name_only, bool quiet_flag,
                        printf("%*s", 8, "[");
                        wordwrap(aliases[j].desc, 8, columns, 0);
                        printf("]\n");
-                       if (verbose)
+                       if (verbose > 0)
                                printf("%*s%s/%s/\n", 8, "", aliases[j].pmu, aliases[j].str);
                } else
                        printf("  %-50s [Kernel PMU event]\n", aliases[j].name);
index 35f5b7b7715c39e054591644d88b369bbf35de5e..28fb62c32678483cd6d54a9c88e620022e1f9d08 100644 (file)
@@ -594,7 +594,7 @@ static int find_perf_probe_point_from_dwarf(struct probe_trace_point *tp,
        pr_debug("try to find information at %" PRIx64 " in %s\n", addr,
                 tp->module ? : "kernel");
 
-       dinfo = debuginfo_cache__open(tp->module, verbose == 0);
+       dinfo = debuginfo_cache__open(tp->module, verbose <= 0);
        if (dinfo)
                ret = debuginfo__find_probe_point(dinfo,
                                                 (unsigned long)addr, pp);
index 581e0efd6356839567a9ae5303a3fa6581b8ee3b..783326cfbaa6bfdaeef277dc9545f55213ffa942 100644 (file)
@@ -369,10 +369,10 @@ static PyObject *python_process_callchain(struct perf_sample *sample,
                if (node->map) {
                        struct map *map = node->map;
                        const char *dsoname = "[unknown]";
-                       if (map && map->dso && (map->dso->name || map->dso->long_name)) {
+                       if (map && map->dso) {
                                if (symbol_conf.show_kernel_path && map->dso->long_name)
                                        dsoname = map->dso->long_name;
-                               else if (map->dso->name)
+                               else
                                        dsoname = map->dso->name;
                        }
                        pydict_set_item_string_decref(pyelem, "dso",
index 4cdbc8f5f14dbf0dff51304ffb59812ce7273ea6..1dd617d116b5d844f23c88592f9dac9f061a7ff6 100644 (file)
@@ -932,7 +932,7 @@ static void branch_stack__printf(struct perf_sample *sample)
 
                printf("..... %2"PRIu64": %016" PRIx64 " -> %016" PRIx64 " %hu cycles %s%s%s%s %x\n",
                        i, e->from, e->to,
-                       e->flags.cycles,
+                       (unsigned short)e->flags.cycles,
                        e->flags.mispred ? "M" : " ",
                        e->flags.predicted ? "P" : " ",
                        e->flags.abort ? "A" : " ",
index c8680984d2d6680f56a974a1b54056a5e74263e1..af415febbc46e65fd0bdda5cbf1c93f7fa3f9d76 100644 (file)
@@ -1,8 +1,15 @@
 #!/usr/bin/python2
 
-from distutils.core import setup, Extension
 from os import getenv
 
+cc = getenv("CC")
+if cc == "clang":
+    from _sysconfigdata import build_time_vars
+    from re import sub
+    build_time_vars["CFLAGS"] = sub("-specs=[^ ]+", "", build_time_vars["CFLAGS"])
+
+from distutils.core import setup, Extension
+
 from distutils.command.build_ext   import build_ext   as _build_ext
 from distutils.command.install_lib import install_lib as _install_lib
 
index df622f4e301e2a2284ee5e728b09ade055680796..0ff622288d243c4edad03a904b6da3a52cf4da50 100644 (file)
@@ -151,7 +151,7 @@ static int64_t _sort__dso_cmp(struct map *map_l, struct map *map_r)
        if (!dso_l || !dso_r)
                return cmp_null(dso_r, dso_l);
 
-       if (verbose) {
+       if (verbose > 0) {
                dso_name_l = dso_l->long_name;
                dso_name_r = dso_r->long_name;
        } else {
@@ -172,8 +172,8 @@ static int _hist_entry__dso_snprintf(struct map *map, char *bf,
                                     size_t size, unsigned int width)
 {
        if (map && map->dso) {
-               const char *dso_name = !verbose ? map->dso->short_name :
-                       map->dso->long_name;
+               const char *dso_name = verbose > 0 ? map->dso->long_name :
+                       map->dso->short_name;
                return repsep_snprintf(bf, size, "%-*.*s", width, width, dso_name);
        }
 
@@ -261,7 +261,7 @@ static int _hist_entry__sym_snprintf(struct map *map, struct symbol *sym,
 {
        size_t ret = 0;
 
-       if (verbose) {
+       if (verbose > 0) {
                char o = map ? dso__symtab_origin(map->dso) : '!';
                ret += repsep_snprintf(bf, size, "%-#*llx %c ",
                                       BITS_PER_LONG / 4 + 2, ip, o);
index 39345c2ddfc22edcfde844e5eb95b72930f4eeda..0d51334a9b4628090f35ffe4da921b4e0ecb15b0 100644 (file)
@@ -344,7 +344,7 @@ int perf_stat_process_counter(struct perf_stat_config *config,
        for (i = 0; i < 3; i++)
                update_stats(&ps->res_stats[i], count[i]);
 
-       if (verbose) {
+       if (verbose > 0) {
                fprintf(config->output, "%s: %" PRIu64 " %" PRIu64 " %" PRIu64 "\n",
                        perf_evsel__name(counter), count[0], count[1], count[2]);
        }
index adbc6c02c3aaac757028e6bbe1ae63cd11ee437a..4e59ddeb4eda7cd8e75ec2c0c42e1fec4de0432a 100644 (file)
@@ -213,7 +213,7 @@ static bool want_demangle(bool is_kernel_sym)
 
 static char *demangle_sym(struct dso *dso, int kmodule, const char *elf_name)
 {
-       int demangle_flags = verbose ? (DMGL_PARAMS | DMGL_ANSI) : DMGL_NO_OPTS;
+       int demangle_flags = verbose > 0 ? (DMGL_PARAMS | DMGL_ANSI) : DMGL_NO_OPTS;
        char *demangled = NULL;
 
        /*