size.
rsize=X
- Specify the maximum read size in bytes. By default there is no
- maximum.
+ Specify the maximum read size in bytes. Default: 64 MB.
rasize=X
- Specify the maximum readahead.
+ Specify the maximum readahead. Default: 8 MB.
mount_timeout=X
Specify the timeout value for mount (in seconds), in the case
kernel and the tasks running there get 50% of the cache.
# echo C0 > p0/cpus
+
+4) Locking between applications
+
+Certain operations on the resctrl filesystem, composed of read/writes
+to/from multiple files, must be atomic.
+
+As an example, the allocation of an exclusive reservation of L3 cache
+involves:
+
+ 1. Read the cbmmasks from each directory
+ 2. Find a contiguous set of bits in the global CBM bitmask that is clear
+ in all of the directory cbmmasks
+ 3. Create a new directory
+ 4. Set the bits found in step 2 to the new directory "schemata" file
+
+If two applications attempt to allocate space concurrently then they can
+end up allocating the same bits so the reservations are shared instead of
+exclusive.
+
+To coordinate atomic operations on the resctrlfs and to avoid the problem
+above, the following locking procedure is recommended:
+
+Locking is based on flock, which is available in libc and also as a shell
+command.
+
+Write lock:
+
+ A) Take flock(LOCK_EX) on /sys/fs/resctrl
+ B) Read/write the directory structure.
+ C) Release the lock with flock(LOCK_UN)
+
+Read lock:
+
+ A) Take flock(LOCK_SH) on /sys/fs/resctrl
+ B) On success, read the directory structure.
+ C) Release the lock with flock(LOCK_UN)
+
+Example with bash:
+
+# Atomically read directory structure
+$ flock -s /sys/fs/resctrl/ find /sys/fs/resctrl
+
+# Read directory contents and create new subdirectory
+
+$ cat create-dir.sh
+find /sys/fs/resctrl/ > output.txt
+mask = function-of(output.txt)
+mkdir /sys/fs/resctrl/newres/
+echo mask > /sys/fs/resctrl/newres/schemata
+
+$ flock /sys/fs/resctrl/ ./create-dir.sh
+
+Example with C:
+
+/*
+ * Example code to take advisory locks
+ * before accessing resctrl filesystem
+ */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/file.h>
+
+void resctrl_take_shared_lock(int fd)
+{
+ int ret;
+
+ /*
+ * take shared lock on resctrl filesystem; if flock() fails,
+ * report the error with perror() and exit
+ */
+ ret = flock(fd, LOCK_SH);
+ if (ret) {
+ perror("flock");
+ exit(-1);
+ }
+}
+
+void resctrl_take_exclusive_lock(int fd)
+{
+ int ret;
+
+ /*
+ * take exclusive lock on resctrl filesystem; if flock() fails,
+ * report the error with perror() and exit
+ */
+ ret = flock(fd, LOCK_EX);
+ if (ret) {
+ perror("flock");
+ exit(-1);
+ }
+}
+
+void resctrl_release_lock(int fd)
+{
+ int ret;
+
+ /*
+ * release lock on resctrl filesystem; if flock() fails,
+ * report the error with perror() and exit
+ */
+ ret = flock(fd, LOCK_UN);
+ if (ret) {
+ perror("flock");
+ exit(-1);
+ }
+}
+
+/*
+ * Open the resctrl mount point, then demonstrate both locking modes:
+ * a shared lock around a read-only pass and an exclusive lock around
+ * a read/write pass. Returns 0 on success; exits with -1 if open()
+ * fails (the lock helpers exit on flock() failure).
+ */
+int main(void)
+{
+ int fd;
+
+ fd = open("/sys/fs/resctrl", O_DIRECTORY);
+ if (fd == -1) {
+ perror("open");
+ exit(-1);
+ }
+ resctrl_take_shared_lock(fd);
+ /* code to read directory contents */
+ resctrl_release_lock(fd);
+
+ resctrl_take_exclusive_lock(fd);
+ /* code to read and write directory contents */
+ resctrl_release_lock(fd);
+
+ return 0;
+}
endif
export mod_sign_cmd
+ifdef CONFIG_STACK_VALIDATION
+ has_libelf := $(call try-run,\
+ echo "int main() {}" | $(HOSTCC) -xc -o /dev/null -lelf -,1,0)
+ ifeq ($(has_libelf),1)
+ objtool_target := tools/objtool FORCE
+ else
+ $(warning "Cannot use CONFIG_STACK_VALIDATION, please install libelf-dev, libelf-devel or elfutils-libelf-devel")
+ SKIP_STACK_VALIDATION := 1
+ export SKIP_STACK_VALIDATION
+ endif
+endif
+
ifeq ($(KBUILD_EXTMOD),)
core-y += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ block/
# All the preparing..
prepare: prepare0 prepare-objtool
-ifdef CONFIG_STACK_VALIDATION
- has_libelf := $(call try-run,\
- echo "int main() {}" | $(HOSTCC) -xc -o /dev/null -lelf -,1,0)
- ifeq ($(has_libelf),1)
- objtool_target := tools/objtool FORCE
- else
- $(warning "Cannot use CONFIG_STACK_VALIDATION, please install libelf-dev, libelf-devel or elfutils-libelf-devel")
- SKIP_STACK_VALIDATION := 1
- export SKIP_STACK_VALIDATION
- endif
-endif
-
PHONY += prepare-objtool
prepare-objtool: $(objtool_target)
bool
default y
select ARCH_CLOCKSOURCE_DATA
+ select ARCH_HAS_DEBUG_VIRTUAL
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_SET_MEMORY
used instead of the auto-probing which utilizes the register.
config REMAP_VECTORS_TO_RAM
- bool 'Install vectors to the beginning of RAM' if DRAM_BASE
- depends on DRAM_BASE
+ bool 'Install vectors to the beginning of RAM'
help
The kernel needs to change the hardware exception vectors.
In nommu mode, the hardware exception vectors are normally
/* Not needed, but used in some headers pulled in by decompressors */
extern char * strstr(const char * s1, const char *s2);
+extern size_t strlen(const char *s);
#ifdef CONFIG_KERNEL_GZIP
#include "../../../../lib/decompress_inflate.c"
void mcpm_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr)
{
- unsigned long val = ptr ? virt_to_phys(ptr) : 0;
+ unsigned long val = ptr ? __pa_symbol(ptr) : 0;
mcpm_entry_vectors[cluster][cpu] = val;
sync_cache_w(&mcpm_entry_vectors[cluster][cpu]);
}
* the kernel as if the power_up method just had deasserted reset
* on the CPU.
*/
- phys_reset = (phys_reset_t)(unsigned long)virt_to_phys(cpu_reset);
- phys_reset(virt_to_phys(mcpm_entry_point));
+ phys_reset = (phys_reset_t)(unsigned long)__pa_symbol(cpu_reset);
+ phys_reset(__pa_symbol(mcpm_entry_point));
/* should never get here */
BUG();
__mcpm_outbound_leave_critical(cluster, CLUSTER_DOWN);
__mcpm_cpu_down(cpu, cluster);
- phys_reset = (phys_reset_t)(unsigned long)virt_to_phys(cpu_reset);
- phys_reset(virt_to_phys(mcpm_entry_point));
+ phys_reset = (phys_reset_t)(unsigned long)__pa_symbol(cpu_reset);
+ phys_reset(__pa_symbol(mcpm_entry_point));
BUG();
}
sync_cache_w(&mcpm_sync);
if (power_up_setup) {
- mcpm_power_up_setup_phys = virt_to_phys(power_up_setup);
+ mcpm_power_up_setup_phys = __pa_symbol(power_up_setup);
sync_cache_w(&mcpm_power_up_setup_phys);
}
#ifndef __CACHE_UNIPHIER_H
#define __CACHE_UNIPHIER_H
-#include <linux/types.h>
+#include <linux/errno.h>
#ifdef CONFIG_CACHE_UNIPHIER
int uniphier_cache_init(void);
#define IOREMAP_MAX_ORDER 24
#endif
+#define VECTORS_BASE UL(0xffff0000)
+
#else /* CONFIG_MMU */
+#ifndef __ASSEMBLY__
+extern unsigned long vectors_base;
+#define VECTORS_BASE vectors_base
+#endif
+
/*
* The limitation of user task size can grow up to the end of free ram region.
* It is difficult to define and perhaps will never meet the original meaning
#endif /* !CONFIG_MMU */
+#ifdef CONFIG_XIP_KERNEL
+#define KERNEL_START _sdata
+#else
+#define KERNEL_START _stext
+#endif
+#define KERNEL_END _end
+
/*
* We fix the TCM memories max 32 KiB ITCM resp DTCM at these
* locations
: "r" (x), "I" (__PV_BITS_31_24) \
: "cc")
-static inline phys_addr_t __virt_to_phys(unsigned long x)
+static inline phys_addr_t __virt_to_phys_nodebug(unsigned long x)
{
phys_addr_t t;
#define PHYS_OFFSET PLAT_PHYS_OFFSET
#define PHYS_PFN_OFFSET ((unsigned long)(PHYS_OFFSET >> PAGE_SHIFT))
-static inline phys_addr_t __virt_to_phys(unsigned long x)
+static inline phys_addr_t __virt_to_phys_nodebug(unsigned long x)
{
return (phys_addr_t)x - PAGE_OFFSET + PHYS_OFFSET;
}
((((unsigned long)(kaddr) - PAGE_OFFSET) >> PAGE_SHIFT) + \
PHYS_PFN_OFFSET)
+#define __pa_symbol_nodebug(x) __virt_to_phys_nodebug((x))
+
+#ifdef CONFIG_DEBUG_VIRTUAL
+extern phys_addr_t __virt_to_phys(unsigned long x);
+extern phys_addr_t __phys_addr_symbol(unsigned long x);
+#else
+#define __virt_to_phys(x) __virt_to_phys_nodebug(x)
+#define __phys_addr_symbol(x) __pa_symbol_nodebug(x)
+#endif
+
/*
* These are *only* valid on the kernel direct mapped RAM memory.
* Note: Drivers should NOT use these. They are the wrong
* Drivers should NOT use these either.
*/
#define __pa(x) __virt_to_phys((unsigned long)(x))
+#define __pa_symbol(x) __phys_addr_symbol(RELOC_HIDE((unsigned long)(x), 0))
#define __va(x) ((void *)__phys_to_virt((phys_addr_t)(x)))
#define pfn_to_kaddr(pfn) __va((phys_addr_t)(pfn) << PAGE_SHIFT)
/*
* Mark the prot value as uncacheable and unbufferable.
*/
-#define pgprot_noncached(prot) __pgprot(0)
-#define pgprot_writecombine(prot) __pgprot(0)
-#define pgprot_dmacoherent(prot) __pgprot(0)
+#define pgprot_noncached(prot) (prot)
+#define pgprot_writecombine(prot) (prot)
+#define pgprot_dmacoherent(prot) (prot)
/*
#endif
#ifdef CONFIG_CPU_ICACHE_DISABLE
bic r0, r0, #CR_I
-#endif
-#ifdef CONFIG_CPU_HIGH_VECTOR
- orr r0, r0, #CR_V
-#else
- bic r0, r0, #CR_V
#endif
mcr p15, 0, r0, c1, c0, 0 @ write control reg
#elif defined (CONFIG_CPU_V7M)
break;
case R_ARM_PREL31:
- offset = *(u32 *)loc + sym->st_value - loc;
- *(u32 *)loc = offset & 0x7fffffff;
+ offset = (*(s32 *)loc << 1) >> 1; /* sign extend */
+ offset += sym->st_value - loc;
+ if (offset >= 0x40000000 || offset < -0x40000000) {
+ pr_err("%s: section %u reloc %u sym '%s': relocation %u out of range (%#lx -> %#x)\n",
+ module->name, relindex, i, symname,
+ ELF32_R_TYPE(rel->r_info), loc,
+ sym->st_value);
+ return -ENOEXEC;
+ }
+ *(u32 *)loc &= 0x80000000;
+ *(u32 *)loc |= offset & 0x7fffffff;
break;
case R_ARM_MOVW_ABS_NC:
extern void init_default_cache_policy(unsigned long);
extern void paging_init(const struct machine_desc *desc);
extern void early_paging_init(const struct machine_desc *);
-extern void sanity_check_meminfo(void);
+extern void adjust_lowmem_bounds(void);
extern enum reboot_mode reboot_mode;
extern void setup_dma_zone(const struct machine_desc *desc);
setup_dma_zone(mdesc);
xen_early_init();
efi_init();
- sanity_check_meminfo();
+ /*
+ * Make sure the calculation for lowmem/highmem is set appropriately
+ * before reserving/allocating any memory
+ */
+ adjust_lowmem_bounds();
arm_memblock_init(mdesc);
+ /* Memory may have been removed so recalculate the bounds. */
+ adjust_lowmem_bounds();
early_ioremap_reset();
pr_err("CPU%u: cpu didn't die\n", cpu);
return;
}
- pr_notice("CPU%u: shutdown\n", cpu);
+ pr_debug("CPU%u: shutdown\n", cpu);
/*
* platform_cpu_kill() is generally expected to do the powering off
{
phys_addr_t addr;
- addr = virt_to_phys(secondary_startup);
+ addr = __pa_symbol(secondary_startup);
if (addr > (phys_addr_t)(uint32_t)(-1)) {
pr_err("FAIL: resume address over 32bit (%pa)", &addr);
static void write_release_addr(u32 release_phys)
{
u32 *virt = (u32 *) phys_to_virt(release_phys);
- writel_relaxed(virt_to_phys(secondary_startup), virt);
+ writel_relaxed(__pa_symbol(secondary_startup), virt);
/* Make sure this store is visible to other CPUs */
smp_wmb();
__cpuc_flush_dcache_area(virt, sizeof(u32));
}
/* Write the secondary init routine to the BootLUT reset vector */
- val = virt_to_phys(secondary_startup);
+ val = __pa_symbol(secondary_startup);
writel_relaxed(val, bootlut_base + BOOTLUT_RESET_VECT);
/* Power up the core, will jump straight to its reset vector when we
* Set the reset vector to point to the secondary_startup
* routine
*/
- cpu_set_boot_addr(cpu, virt_to_phys(secondary_startup));
+ cpu_set_boot_addr(cpu, __pa_symbol(secondary_startup));
/* Unhalt the cpu */
cpu_rst_cfg_set(cpu, 0);
return -ENOMEM;
}
- secondary_startup_phy = virt_to_phys(secondary_startup);
+ secondary_startup_phy = __pa_symbol(secondary_startup);
BUG_ON(secondary_startup_phy > (phys_addr_t)U32_MAX);
writel_relaxed(secondary_startup_phy, sku_rom_lut);
* Secondary cores will start in secondary_startup(),
* defined in "arch/arm/kernel/head.S"
*/
- boot_func = virt_to_phys(secondary_startup);
+ boot_func = __pa_symbol(secondary_startup);
BUG_ON(boot_func & BOOT_ADDR_CPUID_MASK);
BUG_ON(boot_func > (phys_addr_t)U32_MAX);
#include <asm/cacheflush.h>
#include <asm/cp15.h>
+#include <asm/memory.h>
#include <asm/smp_plat.h>
#include <asm/smp_scu.h>
if (!cpu_ctrl)
goto unmap_scu;
- vectors_base = ioremap(CONFIG_VECTORS_BASE, SZ_32K);
+ vectors_base = ioremap(VECTORS_BASE, SZ_32K);
if (!vectors_base)
goto unmap_scu;
* Write the secondary startup address into the SW reset address
* vector. This is used by boot_inst.
*/
- writel(virt_to_phys(secondary_startup), vectors_base + SW_RESET_ADDR);
+ writel(__pa_symbol(secondary_startup), vectors_base + SW_RESET_ADDR);
iounmap(vectors_base);
unmap_scu:
case FW_DO_IDLE_AFTR:
if (read_cpuid_part() == ARM_CPU_PART_CORTEX_A9)
exynos_save_cp15();
- writel_relaxed(virt_to_phys(exynos_cpu_resume_ns),
+ writel_relaxed(__pa_symbol(exynos_cpu_resume_ns),
sysram_ns_base_addr + 0x24);
writel_relaxed(EXYNOS_AFTR_MAGIC, sysram_ns_base_addr + 0x20);
if (soc_is_exynos3250()) {
exynos_save_cp15();
writel(EXYNOS_SLEEP_MAGIC, sysram_ns_base_addr + EXYNOS_BOOT_FLAG);
- writel(virt_to_phys(exynos_cpu_resume_ns),
+ writel(__pa_symbol(exynos_cpu_resume_ns),
sysram_ns_base_addr + EXYNOS_BOOT_ADDR);
return cpu_suspend(0, exynos_cpu_suspend);
*/
__raw_writel(0xe59f0000, ns_sram_base_addr); /* ldr r0, [pc, #0] */
__raw_writel(0xe12fff10, ns_sram_base_addr + 4); /* bx r0 */
- __raw_writel(virt_to_phys(mcpm_entry_point), ns_sram_base_addr + 8);
+ __raw_writel(__pa_symbol(mcpm_entry_point), ns_sram_base_addr + 8);
}
static struct syscore_ops exynos_mcpm_syscore_ops = {
smp_rmb();
- boot_addr = virt_to_phys(exynos4_secondary_startup);
+ boot_addr = __pa_symbol(exynos4_secondary_startup);
ret = exynos_set_boot_addr(core_id, boot_addr);
if (ret)
mpidr = cpu_logical_map(i);
core_id = MPIDR_AFFINITY_LEVEL(mpidr, 0);
- boot_addr = virt_to_phys(exynos4_secondary_startup);
+ boot_addr = __pa_symbol(exynos4_secondary_startup);
ret = exynos_set_boot_addr(core_id, boot_addr);
if (ret)
static void exynos_cpu_set_boot_vector(long flags)
{
- writel_relaxed(virt_to_phys(exynos_cpu_resume),
+ writel_relaxed(__pa_symbol(exynos_cpu_resume),
exynos_boot_vector_addr());
writel_relaxed(flags, exynos_boot_vector_flag());
}
abort:
if (cpu_online(1)) {
- unsigned long boot_addr = virt_to_phys(exynos_cpu_resume);
+ unsigned long boot_addr = __pa_symbol(exynos_cpu_resume);
/*
* Set the boot vector to something non-zero
static void exynos_pre_enter_aftr(void)
{
- unsigned long boot_addr = virt_to_phys(exynos_cpu_resume);
+ unsigned long boot_addr = __pa_symbol(exynos_cpu_resume);
(void)exynos_set_boot_addr(1, boot_addr);
}
exynos_pm_enter_sleep_mode();
/* ensure at least INFORM0 has the resume address */
- pmu_raw_writel(virt_to_phys(exynos_cpu_resume), S5P_INFORM0);
+ pmu_raw_writel(__pa_symbol(exynos_cpu_resume), S5P_INFORM0);
}
static void exynos3250_pm_prepare(void)
exynos_pm_enter_sleep_mode();
/* ensure at least INFORM0 has the resume address */
- pmu_raw_writel(virt_to_phys(exynos_cpu_resume), S5P_INFORM0);
+ pmu_raw_writel(__pa_symbol(exynos_cpu_resume), S5P_INFORM0);
}
static void exynos5420_pm_prepare(void)
/* ensure at least INFORM0 has the resume address */
if (IS_ENABLED(CONFIG_EXYNOS5420_MCPM))
- pmu_raw_writel(virt_to_phys(mcpm_entry_point), S5P_INFORM0);
+ pmu_raw_writel(__pa_symbol(mcpm_entry_point), S5P_INFORM0);
tmp = pmu_raw_readl(EXYNOS_L2_OPTION(0));
tmp &= ~EXYNOS_L2_USE_RETENTION;
*/
writel_relaxed(hip04_boot_method[0], relocation);
writel_relaxed(0xa5a5a5a5, relocation + 4); /* magic number */
- writel_relaxed(virt_to_phys(secondary_startup), relocation + 8);
+ writel_relaxed(__pa_symbol(secondary_startup), relocation + 8);
writel_relaxed(0, relocation + 12);
iounmap(relocation);
cpu = cpu_logical_map(cpu);
if (!cpu || !ctrl_base)
return;
- writel_relaxed(virt_to_phys(jump_addr), ctrl_base + ((cpu - 1) << 2));
+ writel_relaxed(__pa_symbol(jump_addr), ctrl_base + ((cpu - 1) << 2));
}
int hi3xxx_get_cpu_jump(int cpu)
{
phys_addr_t jumpaddr;
- jumpaddr = virt_to_phys(secondary_startup);
+ jumpaddr = __pa_symbol(secondary_startup);
hix5hd2_set_scu_boot_addr(HIX5HD2_BOOT_ADDRESS, jumpaddr);
hix5hd2_set_cpu(cpu, true);
arch_send_wakeup_ipi_mask(cpumask_of(cpu));
struct device_node *node;
- jumpaddr = virt_to_phys(secondary_startup);
+ jumpaddr = __pa_symbol(secondary_startup);
hip01_set_boot_addr(HIP01_BOOT_ADDRESS, jumpaddr);
node = of_find_compatible_node(NULL, NULL, "hisilicon,hip01-sysctrl");
dcfg_base = of_iomap(np, 0);
BUG_ON(!dcfg_base);
- paddr = virt_to_phys(secondary_startup);
+ paddr = __pa_symbol(secondary_startup);
writel_relaxed(cpu_to_be32(paddr), dcfg_base + DCFG_CCSR_SCRATCHRW1);
iounmap(dcfg_base);
memset(suspend_ocram_base, 0, sizeof(*pm_info));
pm_info = suspend_ocram_base;
pm_info->pbase = ocram_pbase;
- pm_info->resume_addr = virt_to_phys(v7_cpu_resume);
+ pm_info->resume_addr = __pa_symbol(v7_cpu_resume);
pm_info->pm_info_size = sizeof(*pm_info);
/*
void imx_set_cpu_jump(int cpu, void *jump_addr)
{
cpu = cpu_logical_map(cpu);
- writel_relaxed(virt_to_phys(jump_addr),
+ writel_relaxed(__pa_symbol(jump_addr),
src_base + SRC_GPR1 + cpu * 8);
}
* write the address of slave startup address into the system-wide
* jump register
*/
- writel_relaxed(virt_to_phys(secondary_startup_arm),
+ writel_relaxed(__pa_symbol(secondary_startup_arm),
mtk_smp_base + mtk_smp_info->jump_reg);
}
{
phys_addr_t resume_pc;
- resume_pc = virt_to_phys(armada_370_xp_cpu_resume);
+ resume_pc = __pa_symbol(armada_370_xp_cpu_resume);
/*
* The bootloader expects the first two words to be a magic
void mvebu_pmsu_set_cpu_boot_addr(int hw_cpu, void *boot_addr)
{
- writel(virt_to_phys(boot_addr), pmsu_mp_base +
+ writel(__pa_symbol(boot_addr), pmsu_mp_base +
PMSU_BOOT_ADDR_REDIRECT_OFFSET(hw_cpu));
}
if (of_machine_is_compatible("marvell,armada375"))
mvebu_armada375_smp_wa_init();
- writel(virt_to_phys(boot_addr), system_controller_base +
+ writel(__pa_symbol(boot_addr), system_controller_base +
mvebu_sc->resume_boot_addr);
}
#endif
scratchpad_contents.boot_config_ptr = 0x0;
if (cpu_is_omap3630())
scratchpad_contents.public_restore_ptr =
- virt_to_phys(omap3_restore_3630);
+ __pa_symbol(omap3_restore_3630);
else if (omap_rev() != OMAP3430_REV_ES3_0 &&
omap_rev() != OMAP3430_REV_ES3_1 &&
omap_rev() != OMAP3430_REV_ES3_1_2)
scratchpad_contents.public_restore_ptr =
- virt_to_phys(omap3_restore);
+ __pa_symbol(omap3_restore);
else
scratchpad_contents.public_restore_ptr =
- virt_to_phys(omap3_restore_es3);
+ __pa_symbol(omap3_restore_es3);
if (omap_type() == OMAP2_DEVICE_TYPE_GP)
scratchpad_contents.secure_ram_restore_ptr = 0x0;
sdrc_block_contents.flags = 0x0;
sdrc_block_contents.block_size = 0x0;
- arm_context_addr = virt_to_phys(omap3_arm_context);
+ arm_context_addr = __pa_symbol(omap3_arm_context);
/* Copy all the contents to the scratchpad location */
scratchpad_address = OMAP2_L4_IO_ADDRESS(OMAP343X_SCRATCHPAD);
cpu_clear_prev_logic_pwrst(cpu);
pwrdm_set_next_pwrst(pm_info->pwrdm, power_state);
pwrdm_set_logic_retst(pm_info->pwrdm, cpu_logic_state);
- set_cpu_wakeup_addr(cpu, virt_to_phys(omap_pm_ops.resume));
+ set_cpu_wakeup_addr(cpu, __pa_symbol(omap_pm_ops.resume));
omap_pm_ops.scu_prepare(cpu, power_state);
l2x0_pwrst_prepare(cpu, save_state);
pwrdm_clear_all_prev_pwrst(pm_info->pwrdm);
pwrdm_set_next_pwrst(pm_info->pwrdm, power_state);
- set_cpu_wakeup_addr(cpu, virt_to_phys(omap_pm_ops.hotplug_restart));
+ set_cpu_wakeup_addr(cpu, __pa_symbol(omap_pm_ops.hotplug_restart));
omap_pm_ops.scu_prepare(cpu, power_state);
/*
sar_base = omap4_get_sar_ram_base();
if (cpu_is_omap443x())
- startup_pa = virt_to_phys(omap4_secondary_startup);
+ startup_pa = __pa_symbol(omap4_secondary_startup);
else if (cpu_is_omap446x())
- startup_pa = virt_to_phys(omap4460_secondary_startup);
+ startup_pa = __pa_symbol(omap4460_secondary_startup);
else if ((__boot_cpu_mode & MODE_MASK) == HYP_MODE)
- startup_pa = virt_to_phys(omap5_secondary_hyp_startup);
+ startup_pa = __pa_symbol(omap5_secondary_hyp_startup);
else
- startup_pa = virt_to_phys(omap5_secondary_startup);
+ startup_pa = __pa_symbol(omap5_secondary_startup);
if (cpu_is_omap44xx())
writel_relaxed(startup_pa, sar_base +
* A barrier is added to ensure that write buffer is drained
*/
if (omap_secure_apis_support())
- omap_auxcoreboot_addr(virt_to_phys(cfg.startup_addr));
+ omap_auxcoreboot_addr(__pa_symbol(cfg.startup_addr));
else
- writel_relaxed(virt_to_phys(cfg.startup_addr),
+ writel_relaxed(__pa_symbol(cfg.startup_addr),
base + OMAP_AUX_CORE_BOOT_1);
}
* waiting for. This would wake up the secondary core from WFE
*/
#define SIRFSOC_CPU1_JUMPADDR_OFFSET 0x2bc
- __raw_writel(virt_to_phys(sirfsoc_secondary_startup),
+ __raw_writel(__pa_symbol(sirfsoc_secondary_startup),
clk_base + SIRFSOC_CPU1_JUMPADDR_OFFSET);
#define SIRFSOC_CPU1_WAKEMAGIC_OFFSET 0x2b8
static int sirfsoc_pre_suspend_power_off(void)
{
- u32 wakeup_entry = virt_to_phys(cpu_resume);
+ u32 wakeup_entry = __pa_symbol(cpu_resume);
sirfsoc_rtc_iobrg_writel(wakeup_entry, sirfsoc_pwrc_base +
SIRFSOC_PWRC_SCRATCH_PAD1);
store_ptr = *PALMZ72_SAVE_DWORD;
/* Setting PSPR to a proper value */
- PSPR = virt_to_phys(&palmz72_resume_info);
+ PSPR = __pa_symbol(&palmz72_resume_info);
return 0;
}
static int pxa25x_cpu_pm_prepare(void)
{
/* set resume return address */
- PSPR = virt_to_phys(cpu_resume);
+ PSPR = __pa_symbol(cpu_resume);
return 0;
}
static int pxa27x_cpu_pm_prepare(void)
{
/* set resume return address */
- PSPR = virt_to_phys(cpu_resume);
+ PSPR = __pa_symbol(cpu_resume);
return 0;
}
PSPR = 0x5c014000;
/* overwrite with the resume address */
- *p = virt_to_phys(cpu_resume);
+ *p = __pa_symbol(cpu_resume);
cpu_suspend(0, pxa3xx_finish_suspend);
}
/* Put the boot address in this magic register */
regmap_write(map, REALVIEW_SYS_FLAGSSET_OFFSET,
- virt_to_phys(versatile_secondary_startup));
+ __pa_symbol(versatile_secondary_startup));
}
static const struct smp_operations realview_dt_smp_ops __initconst = {
*/
mdelay(1); /* ensure the cpus other than cpu0 to startup */
- writel(virt_to_phys(secondary_startup), sram_base_addr + 8);
+ writel(__pa_symbol(secondary_startup), sram_base_addr + 8);
writel(0xDEADBEAF, sram_base_addr + 4);
dsb_sev();
}
}
/* set the boot function for the sram code */
- rockchip_boot_fn = virt_to_phys(secondary_startup);
+ rockchip_boot_fn = __pa_symbol(secondary_startup);
/* copy the trampoline to sram, that runs during startup of the core */
memcpy(sram_base_addr, &rockchip_secondary_trampoline, trampoline_sz);
static void rk3288_config_bootdata(void)
{
rkpm_bootdata_cpusp = rk3288_bootram_phy + (SZ_4K - 8);
- rkpm_bootdata_cpu_code = virt_to_phys(cpu_resume);
+ rkpm_bootdata_cpu_code = __pa_symbol(cpu_resume);
rkpm_bootdata_l2ctlr_f = 1;
rkpm_bootdata_l2ctlr = rk3288_l2_config();
* correct address to resume from. */
__raw_writel(0x2BED, S3C2412_INFORM0);
- __raw_writel(virt_to_phys(s3c_cpu_resume), S3C2412_INFORM1);
+ __raw_writel(__pa_symbol(s3c_cpu_resume), S3C2412_INFORM1);
return 0;
}
{
/* ensure at least GSTATUS3 has the resume address */
- __raw_writel(virt_to_phys(s3c_cpu_resume), S3C2410_GSTATUS3);
+ __raw_writel(__pa_symbol(s3c_cpu_resume), S3C2410_GSTATUS3);
S3C_PMDBG("GSTATUS3 0x%08x\n", __raw_readl(S3C2410_GSTATUS3));
S3C_PMDBG("GSTATUS4 0x%08x\n", __raw_readl(S3C2410_GSTATUS4));
* correct address to resume from.
*/
__raw_writel(0x2BED, S3C2412_INFORM0);
- __raw_writel(virt_to_phys(s3c_cpu_resume), S3C2412_INFORM1);
+ __raw_writel(__pa_symbol(s3c_cpu_resume), S3C2412_INFORM1);
}
static int s3c2416_pm_add(struct device *dev, struct subsys_interface *sif)
wake_irqs, ARRAY_SIZE(wake_irqs));
/* store address of resume. */
- __raw_writel(virt_to_phys(s3c_cpu_resume), S3C64XX_INFORM0);
+ __raw_writel(__pa_symbol(s3c_cpu_resume), S3C64XX_INFORM0);
/* ensure previous wakeup state is cleared before sleeping */
__raw_writel(__raw_readl(S3C64XX_WAKEUP_STAT), S3C64XX_WAKEUP_STAT);
__raw_writel(s5pv210_irqwake_intmask, S5P_WAKEUP_MASK);
/* ensure at least INFORM0 has the resume address */
- __raw_writel(virt_to_phys(s5pv210_cpu_resume), S5P_INFORM0);
+ __raw_writel(__pa_symbol(s5pv210_cpu_resume), S5P_INFORM0);
tmp = __raw_readl(S5P_SLEEP_CFG);
tmp &= ~(S5P_SLEEP_CFG_OSC_EN | S5P_SLEEP_CFG_USBOSC_EN);
RCSR = RCSR_HWR | RCSR_SWR | RCSR_WDR | RCSR_SMR;
/* set resume return address */
- PSPR = virt_to_phys(cpu_resume);
+ PSPR = __pa_symbol(cpu_resume);
/* go zzz */
cpu_suspend(0, sa1100_finish_suspend);
static void __init shmobile_smp_apmu_setup_boot(void)
{
/* install boot code shared by all CPUs */
- shmobile_boot_fn = virt_to_phys(shmobile_smp_boot);
+ shmobile_boot_fn = __pa_symbol(shmobile_smp_boot);
}
void __init shmobile_smp_apmu_prepare_cpus(unsigned int max_cpus,
int shmobile_smp_apmu_boot_secondary(unsigned int cpu, struct task_struct *idle)
{
/* For this particular CPU register boot vector */
- shmobile_smp_hook(cpu, virt_to_phys(secondary_startup), 0);
+ shmobile_smp_hook(cpu, __pa_symbol(secondary_startup), 0);
return apmu_wrap(cpu, apmu_power_on);
}
#if defined(CONFIG_SUSPEND)
static int shmobile_smp_apmu_do_suspend(unsigned long cpu)
{
- shmobile_smp_hook(cpu, virt_to_phys(cpu_resume), 0);
+ shmobile_smp_hook(cpu, __pa_symbol(cpu_resume), 0);
shmobile_smp_apmu_cpu_shutdown(cpu);
cpu_do_idle(); /* WFI selects Core Standby */
return 1;
static int shmobile_scu_cpu_prepare(unsigned int cpu)
{
/* For this particular CPU register SCU SMP boot vector */
- shmobile_smp_hook(cpu, virt_to_phys(shmobile_boot_scu),
+ shmobile_smp_hook(cpu, __pa_symbol(shmobile_boot_scu),
shmobile_scu_base_phys);
return 0;
}
unsigned int max_cpus)
{
/* install boot code shared by all CPUs */
- shmobile_boot_fn = virt_to_phys(shmobile_smp_boot);
+ shmobile_boot_fn = __pa_symbol(shmobile_smp_boot);
/* enable SCU and cache coherency on booting CPU */
shmobile_scu_base_phys = scu_base_phys;
memcpy(phys_to_virt(0), &secondary_trampoline, trampoline_size);
- writel(virt_to_phys(secondary_startup),
+ writel(__pa_symbol(secondary_startup),
sys_manager_base_addr + (socfpga_cpu1start_addr & 0x000000ff));
flush_cache_all();
SOCFPGA_A10_RSTMGR_MODMPURST);
memcpy(phys_to_virt(0), &secondary_trampoline, trampoline_size);
- writel(virt_to_phys(secondary_startup),
+ writel(__pa_symbol(secondary_startup),
sys_manager_base_addr + (socfpga_cpu1start_addr & 0x00000fff));
flush_cache_all();
* (presently it is in SRAM). The BootMonitor waits until it receives a
* soft interrupt, and then the secondary CPU branches to this address.
*/
- __raw_writel(virt_to_phys(spear13xx_secondary_startup), SYS_LOCATION);
+ __raw_writel(__pa_symbol(spear13xx_secondary_startup), SYS_LOCATION);
}
const struct smp_operations spear13xx_smp_ops __initconst = {
u32 __iomem *cpu_strt_ptr;
u32 release_phys;
int cpu;
- unsigned long entry_pa = virt_to_phys(sti_secondary_startup);
+ unsigned long entry_pa = __pa_symbol(sti_secondary_startup);
np = of_find_compatible_node(NULL, NULL, "arm,cortex-a9-scu");
spin_lock(&cpu_lock);
/* Set CPU boot address */
- writel(virt_to_phys(secondary_startup),
+ writel(__pa_symbol(secondary_startup),
cpucfg_membase + CPUCFG_PRIVATE0_REG);
/* Assert the CPU core in reset */
spin_lock(&cpu_lock);
/* Set CPU boot address */
- writel(virt_to_phys(secondary_startup),
+ writel(__pa_symbol(secondary_startup),
cpucfg_membase + CPUCFG_PRIVATE0_REG);
/* Assert the CPU core in reset */
static int tango_boot_secondary(unsigned int cpu, struct task_struct *idle)
{
- tango_set_aux_boot_addr(virt_to_phys(secondary_startup));
+ tango_set_aux_boot_addr(__pa_symbol(secondary_startup));
tango_start_aux_core(cpu);
return 0;
}
static int tango_pm_powerdown(unsigned long arg)
{
- tango_suspend(virt_to_phys(cpu_resume));
+ tango_suspend(__pa_symbol(cpu_resume));
return -EIO; /* tango_suspend has failed */
}
__tegra_cpu_reset_handler_data[TEGRA_RESET_MASK_PRESENT] =
*((u32 *)cpu_possible_mask);
__tegra_cpu_reset_handler_data[TEGRA_RESET_STARTUP_SECONDARY] =
- virt_to_phys((void *)secondary_startup);
+ __pa_symbol((void *)secondary_startup);
#endif
#ifdef CONFIG_PM_SLEEP
__tegra_cpu_reset_handler_data[TEGRA_RESET_STARTUP_LP1] =
TEGRA_IRAM_LPx_RESUME_AREA;
__tegra_cpu_reset_handler_data[TEGRA_RESET_STARTUP_LP2] =
- virt_to_phys((void *)tegra_resume);
+ __pa_symbol((void *)tegra_resume);
#endif
tegra_cpu_reset_handler_enable();
* backup ram register at offset 0x1FF0, which is what boot rom code
* is waiting for. This will wake up the secondary core from WFE.
*/
- writel(virt_to_phys(secondary_startup),
+ writel(__pa_symbol(secondary_startup),
backupram + UX500_CPU1_JUMPADDR_OFFSET);
writel(0xA1FEED01,
backupram + UX500_CPU1_WAKEMAGIC_OFFSET);
* Future entries into the kernel can now go
* through the cluster entry vectors.
*/
- vexpress_flags_set(virt_to_phys(mcpm_entry_point));
+ vexpress_flags_set(__pa_symbol(mcpm_entry_point));
return 0;
}
* until it receives a soft interrupt, and then the
* secondary CPU branches to this address.
*/
- vexpress_flags_set(virt_to_phys(versatile_secondary_startup));
+ vexpress_flags_set(__pa_symbol(versatile_secondary_startup));
}
const struct smp_operations vexpress_smp_dt_ops __initconst = {
if (cluster >= TC2_CLUSTERS || cpu >= tc2_nr_cpus[cluster])
return -EINVAL;
ve_spc_set_resume_addr(cluster, cpu,
- virt_to_phys(mcpm_entry_point));
+ __pa_symbol(mcpm_entry_point));
ve_spc_cpu_wakeup_irq(cluster, cpu, true);
return 0;
}
static void tc2_pm_cpu_suspend_prepare(unsigned int cpu, unsigned int cluster)
{
- ve_spc_set_resume_addr(cluster, cpu, virt_to_phys(mcpm_entry_point));
+ ve_spc_set_resume_addr(cluster, cpu, __pa_symbol(mcpm_entry_point));
}
static void tc2_pm_cpu_is_up(unsigned int cpu, unsigned int cluster)
* until it receives a soft interrupt, and then the
* secondary CPU branches to this address.
*/
- __raw_writel(virt_to_phys(zx_secondary_startup),
+ __raw_writel(__pa_symbol(zx_secondary_startup),
aonsysctrl_base + AON_SYS_CTRL_RESERVED1);
iounmap(aonsysctrl_base);
/* Map the first 4 KB IRAM for suspend usage */
sys_iram = __arm_ioremap_exec(ZX_IRAM_BASE, PAGE_SIZE, false);
- zx_secondary_startup_pa = virt_to_phys(zx_secondary_startup);
+ zx_secondary_startup_pa = __pa_symbol(zx_secondary_startup);
fncpy(sys_iram, &zx_resume_jump, zx_suspend_iram_sz);
}
static int zynq_boot_secondary(unsigned int cpu, struct task_struct *idle)
{
- return zynq_cpun_start(virt_to_phys(secondary_startup), cpu);
+ return zynq_cpun_start(__pa_symbol(secondary_startup), cpu);
}
/*
select CPU_COPY_V4WT if MMU
select CPU_CP15_MMU
select CPU_PABRT_LEGACY
+ select CPU_THUMB_CAPABLE
select CPU_TLB_V4WT if MMU
help
A 32-bit RISC processor with 8kByte Cache, Write Buffer and
select CPU_CACHE_V4
select CPU_CP15_MPU
select CPU_PABRT_LEGACY
+ select CPU_THUMB_CAPABLE
help
A 32-bit RISC processor with 8KB cache or 4KB variants,
write buffer and MPU(Protection Unit) built around
select CPU_COPY_V4WB if MMU
select CPU_CP15_MMU
select CPU_PABRT_LEGACY
+ select CPU_THUMB_CAPABLE
select CPU_TLB_V4WBI if MMU
help
The ARM920T is licensed to be produced by numerous vendors,
select CPU_COPY_V4WB if MMU
select CPU_CP15_MMU
select CPU_PABRT_LEGACY
+ select CPU_THUMB_CAPABLE
select CPU_TLB_V4WBI if MMU
help
The ARM922T is a version of the ARM920T, but with smaller
select CPU_COPY_V4WB if MMU
select CPU_CP15_MMU
select CPU_PABRT_LEGACY
+ select CPU_THUMB_CAPABLE
select CPU_TLB_V4WBI if MMU
help
The ARM925T is a mix between the ARM920T and ARM926T, but with
select CPU_COPY_V4WB if MMU
select CPU_CP15_MMU
select CPU_PABRT_LEGACY
+ select CPU_THUMB_CAPABLE
select CPU_TLB_V4WBI if MMU
help
This is a variant of the ARM920. It has slightly different
select CPU_CACHE_VIVT
select CPU_CP15_MPU
select CPU_PABRT_LEGACY
+ select CPU_THUMB_CAPABLE
help
ARM940T is a member of the ARM9TDMI family of general-
purpose microprocessors with MPU and separate 4KB
select CPU_CACHE_VIVT
select CPU_CP15_MPU
select CPU_PABRT_LEGACY
+ select CPU_THUMB_CAPABLE
help
ARM946E-S is a member of the ARM9E-S family of high-
performance, 32-bit system-on-chip processor solutions.
select CPU_COPY_V4WB if MMU
select CPU_CP15_MMU
select CPU_PABRT_LEGACY
+ select CPU_THUMB_CAPABLE
select CPU_TLB_V4WBI if MMU
help
The ARM1020 is the 32K cached version of the ARM10 processor,
select CPU_COPY_V4WB if MMU
select CPU_CP15_MMU
select CPU_PABRT_LEGACY
+ select CPU_THUMB_CAPABLE
select CPU_TLB_V4WBI if MMU
# ARM1022E
select CPU_COPY_V4WB if MMU # can probably do better
select CPU_CP15_MMU
select CPU_PABRT_LEGACY
+ select CPU_THUMB_CAPABLE
select CPU_TLB_V4WBI if MMU
help
The ARM1022E is an implementation of the ARMv5TE architecture
select CPU_COPY_V4WB if MMU # can probably do better
select CPU_CP15_MMU
select CPU_PABRT_LEGACY
+ select CPU_THUMB_CAPABLE
select CPU_TLB_V4WBI if MMU
help
The ARM1026EJ-S is an implementation of the ARMv5TEJ architecture
select CPU_CACHE_VIVT
select CPU_CP15_MMU
select CPU_PABRT_LEGACY
+ select CPU_THUMB_CAPABLE
select CPU_TLB_V4WBI if MMU
# XScale Core Version 3
select CPU_CACHE_VIVT
select CPU_CP15_MMU
select CPU_PABRT_LEGACY
+ select CPU_THUMB_CAPABLE
select CPU_TLB_V4WBI if MMU
select IO_36
select CPU_COPY_V4WB if MMU
select CPU_CP15_MMU
select CPU_PABRT_LEGACY
+ select CPU_THUMB_CAPABLE
select CPU_TLB_V4WBI if MMU
# Feroceon
select CPU_COPY_FEROCEON if MMU
select CPU_CP15_MMU
select CPU_PABRT_LEGACY
+ select CPU_THUMB_CAPABLE
select CPU_TLB_FEROCEON if MMU
config CPU_FEROCEON_OLD_ID
select CPU_CP15_MMU
select CPU_HAS_ASID if MMU
select CPU_PABRT_V6
+ select CPU_THUMB_CAPABLE
select CPU_TLB_V6 if MMU
# ARMv6k
select CPU_CP15_MMU
select CPU_HAS_ASID if MMU
select CPU_PABRT_V6
+ select CPU_THUMB_CAPABLE
select CPU_TLB_V6 if MMU
# ARMv7
select CPU_CP15_MPU if !MMU
select CPU_HAS_ASID if MMU
select CPU_PABRT_V7
+ select CPU_THUMB_CAPABLE
select CPU_TLB_V7 if MMU
# ARMv7M
config CPU_THUMBONLY
bool
+ select CPU_THUMB_CAPABLE
# There are no CPUs available with MMU that don't implement an ARM ISA:
depends on !MMU
help
Select this if your CPU doesn't support the 32 bit ARM instructions.
+config CPU_THUMB_CAPABLE
+ bool
+ help
+ Select this if your CPU can support Thumb mode.
+
# Figure out what processor architecture version we should be using.
# This defines the compiler instruction set which depends on the machine type.
config CPU_32v3
config ARM_THUMB
bool "Support Thumb user binaries" if !CPU_THUMBONLY
- depends on CPU_ARM720T || CPU_ARM740T || CPU_ARM920T || CPU_ARM922T || \
- CPU_ARM925T || CPU_ARM926T || CPU_ARM940T || CPU_ARM946E || \
- CPU_ARM1020 || CPU_ARM1020E || CPU_ARM1022 || CPU_ARM1026 || \
- CPU_XSCALE || CPU_XSC3 || CPU_MOHAWK || CPU_V6 || CPU_V6K || \
- CPU_V7 || CPU_FEROCEON || CPU_V7M
+ depends on CPU_THUMB_CAPABLE
default y
help
Say Y if you want to include kernel support for running user space
obj-$(CONFIG_ARM_PTDUMP) += dump.o
obj-$(CONFIG_MODULES) += proc-syms.o
+obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o
obj-$(CONFIG_ALIGNMENT_TRAP) += alignment.o
obj-$(CONFIG_HIGHMEM) += highmem.o
#define pr_fmt(fmt) "uniphier: " fmt
+#include <linux/bitops.h>
#include <linux/init.h>
#include <linux/io.h>
#include <linux/log2.h>
* @ctrl_base: virtual base address of control registers
* @rev_base: virtual base address of revision registers
* @op_base: virtual base address of operation registers
- * @way_present_mask: each bit specifies if the way is present
- * @way_locked_mask: each bit specifies if the way is locked
+ * @way_mask: each bit specifies if the way is present
* @nsets: number of associativity sets
* @line_size: line size in bytes
* @range_op_max_size: max size that can be handled by a single range operation
void __iomem *rev_base;
void __iomem *op_base;
void __iomem *way_ctrl_base;
- u32 way_present_mask;
- u32 way_locked_mask;
+ u32 way_mask;
u32 nsets;
u32 line_size;
u32 range_op_max_size;
writel_relaxed(val, data->ctrl_base + UNIPHIER_SSCC);
}
-static void __init __uniphier_cache_set_locked_ways(
- struct uniphier_cache_data *data,
- u32 way_mask)
+static void __init __uniphier_cache_set_active_ways(
+ struct uniphier_cache_data *data)
{
unsigned int cpu;
- data->way_locked_mask = way_mask & data->way_present_mask;
-
for_each_possible_cpu(cpu)
- writel_relaxed(~data->way_locked_mask & data->way_present_mask,
- data->way_ctrl_base + 4 * cpu);
+ writel_relaxed(data->way_mask, data->way_ctrl_base + 4 * cpu);
}
static void uniphier_cache_maint_range(unsigned long start, unsigned long end,
list_for_each_entry(data, &uniphier_cache_list, list) {
__uniphier_cache_enable(data, true);
- __uniphier_cache_set_locked_ways(data, 0);
+ __uniphier_cache_set_active_ways(data);
}
}
goto err;
}
- data->way_present_mask =
- ((u32)1 << cache_size / data->nsets / data->line_size) - 1;
+ data->way_mask = GENMASK(cache_size / data->nsets / data->line_size - 1,
+ 0);
data->ctrl_base = of_iomap(np, 0);
if (!data->ctrl_base) {
vma->vm_end - vma->vm_start,
vma->vm_page_prot);
}
+#else
+ ret = vm_iomap_memory(vma, vma->vm_start,
+ (vma->vm_end - vma->vm_start));
#endif /* CONFIG_MMU */
return ret;
#include <linux/seq_file.h>
#include <asm/fixmap.h>
+#include <asm/memory.h>
#include <asm/pgtable.h>
struct addr_marker {
{ 0, "vmalloc() Area" },
{ VMALLOC_END, "vmalloc() End" },
{ FIXADDR_START, "Fixmap Area" },
- { CONFIG_VECTORS_BASE, "Vectors" },
- { CONFIG_VECTORS_BASE + PAGE_SIZE * 2, "Vectors End" },
+ { VECTORS_BASE, "Vectors" },
+ { VECTORS_BASE + PAGE_SIZE * 2, "Vectors End" },
{ -1, NULL },
};
if (page == ZERO_PAGE(0))
return;
+ if (!cache_ops_need_broadcast() && cache_is_vipt_nonaliasing()) {
+ if (test_bit(PG_dcache_clean, &page->flags))
+ clear_bit(PG_dcache_clean, &page->flags);
+ return;
+ }
+
mapping = page_mapping(page);
if (!cache_ops_need_broadcast() &&
#include <asm/cp15.h>
#include <asm/mach-types.h>
#include <asm/memblock.h>
+#include <asm/memory.h>
#include <asm/prom.h>
#include <asm/sections.h>
#include <asm/setup.h>
return phys;
}
-void __init arm_memblock_init(const struct machine_desc *mdesc)
+static void __init arm_initrd_init(void)
{
- /* Register the kernel text, kernel data and initrd with memblock. */
-#ifdef CONFIG_XIP_KERNEL
- memblock_reserve(__pa(_sdata), _end - _sdata);
-#else
- memblock_reserve(__pa(_stext), _end - _stext);
-#endif
#ifdef CONFIG_BLK_DEV_INITRD
+ phys_addr_t start;
+ unsigned long size;
+
/* FDT scan will populate initrd_start */
if (initrd_start && !phys_initrd_size) {
phys_initrd_start = __virt_to_phys(initrd_start);
phys_initrd_size = initrd_end - initrd_start;
}
+
initrd_start = initrd_end = 0;
- if (phys_initrd_size &&
- !memblock_is_region_memory(phys_initrd_start, phys_initrd_size)) {
+
+ if (!phys_initrd_size)
+ return;
+
+ /*
+ * Round the memory region to page boundaries as per free_initrd_mem()
+ * This allows us to detect whether the pages overlapping the initrd
+ * are in use, but more importantly, reserves the entire set of pages
+ * as we don't want these pages allocated for other purposes.
+ */
+ start = round_down(phys_initrd_start, PAGE_SIZE);
+ size = phys_initrd_size + (phys_initrd_start - start);
+ size = round_up(size, PAGE_SIZE);
+
+ if (!memblock_is_region_memory(start, size)) {
pr_err("INITRD: 0x%08llx+0x%08lx is not a memory region - disabling initrd\n",
- (u64)phys_initrd_start, phys_initrd_size);
- phys_initrd_start = phys_initrd_size = 0;
+ (u64)start, size);
+ return;
}
- if (phys_initrd_size &&
- memblock_is_region_reserved(phys_initrd_start, phys_initrd_size)) {
+
+ if (memblock_is_region_reserved(start, size)) {
pr_err("INITRD: 0x%08llx+0x%08lx overlaps in-use memory region - disabling initrd\n",
- (u64)phys_initrd_start, phys_initrd_size);
- phys_initrd_start = phys_initrd_size = 0;
+ (u64)start, size);
+ return;
}
- if (phys_initrd_size) {
- memblock_reserve(phys_initrd_start, phys_initrd_size);
- /* Now convert initrd to virtual addresses */
- initrd_start = __phys_to_virt(phys_initrd_start);
- initrd_end = initrd_start + phys_initrd_size;
- }
+ memblock_reserve(start, size);
+
+ /* Now convert initrd to virtual addresses */
+ initrd_start = __phys_to_virt(phys_initrd_start);
+ initrd_end = initrd_start + phys_initrd_size;
#endif
+}
+
+void __init arm_memblock_init(const struct machine_desc *mdesc)
+{
+ /* Register the kernel text, kernel data and initrd with memblock. */
+ memblock_reserve(__pa(KERNEL_START), KERNEL_END - KERNEL_START);
+
+ arm_initrd_init();
arm_mm_memblock_reserve();
" .data : 0x%p" " - 0x%p" " (%4td kB)\n"
" .bss : 0x%p" " - 0x%p" " (%4td kB)\n",
- MLK(UL(CONFIG_VECTORS_BASE), UL(CONFIG_VECTORS_BASE) +
- (PAGE_SIZE)),
+ MLK(VECTORS_BASE, VECTORS_BASE + PAGE_SIZE),
#ifdef CONFIG_HAVE_TCM
MLK(DTCM_OFFSET, (unsigned long) dtcm_end),
MLK(ITCM_OFFSET, (unsigned long) itcm_end),
phys_addr_t arm_lowmem_limit __initdata = 0;
-void __init sanity_check_meminfo(void)
+void __init adjust_lowmem_bounds(void)
{
phys_addr_t memblock_limit = 0;
- int highmem = 0;
u64 vmalloc_limit;
struct memblock_region *reg;
- bool should_use_highmem = false;
+ phys_addr_t lowmem_limit = 0;
/*
* Let's use our own (unoptimized) equivalent of __pa() that is
for_each_memblock(memory, reg) {
phys_addr_t block_start = reg->base;
phys_addr_t block_end = reg->base + reg->size;
- phys_addr_t size_limit = reg->size;
- if (reg->base >= vmalloc_limit)
- highmem = 1;
- else
- size_limit = vmalloc_limit - reg->base;
-
-
- if (!IS_ENABLED(CONFIG_HIGHMEM) || cache_is_vipt_aliasing()) {
-
- if (highmem) {
- pr_notice("Ignoring RAM at %pa-%pa (!CONFIG_HIGHMEM)\n",
- &block_start, &block_end);
- memblock_remove(reg->base, reg->size);
- should_use_highmem = true;
- continue;
- }
-
- if (reg->size > size_limit) {
- phys_addr_t overlap_size = reg->size - size_limit;
-
- pr_notice("Truncating RAM at %pa-%pa",
- &block_start, &block_end);
- block_end = vmalloc_limit;
- pr_cont(" to -%pa", &block_end);
- memblock_remove(vmalloc_limit, overlap_size);
- should_use_highmem = true;
- }
- }
-
- if (!highmem) {
- if (block_end > arm_lowmem_limit) {
- if (reg->size > size_limit)
- arm_lowmem_limit = vmalloc_limit;
- else
- arm_lowmem_limit = block_end;
- }
+ if (reg->base < vmalloc_limit) {
+ if (block_end > lowmem_limit)
+ /*
+ * Compare as u64 to ensure vmalloc_limit does
+ * not get truncated. block_end should always
+ * fit in phys_addr_t so there should be no
+ * issue with assignment.
+ */
+ lowmem_limit = min_t(u64,
+ vmalloc_limit,
+ block_end);
/*
* Find the first non-pmd-aligned page, and point
if (!IS_ALIGNED(block_start, PMD_SIZE))
memblock_limit = block_start;
else if (!IS_ALIGNED(block_end, PMD_SIZE))
- memblock_limit = arm_lowmem_limit;
+ memblock_limit = lowmem_limit;
}
}
}
- if (should_use_highmem)
- pr_notice("Consider using a HIGHMEM enabled kernel.\n");
+ arm_lowmem_limit = lowmem_limit;
high_memory = __va(arm_lowmem_limit - 1) + 1;
if (!memblock_limit)
memblock_limit = arm_lowmem_limit;
+ if (!IS_ENABLED(CONFIG_HIGHMEM) || cache_is_vipt_aliasing()) {
+ if (memblock_end_of_DRAM() > arm_lowmem_limit) {
+ phys_addr_t end = memblock_end_of_DRAM();
+
+ pr_notice("Ignoring RAM at %pa-%pa\n",
+ &memblock_limit, &end);
+ pr_notice("Consider using a HIGHMEM enabled kernel.\n");
+
+ memblock_remove(memblock_limit, end - memblock_limit);
+ }
+ }
+
memblock_set_current_limit(memblock_limit);
}
static void __init map_lowmem(void)
{
struct memblock_region *reg;
-#ifdef CONFIG_XIP_KERNEL
- phys_addr_t kernel_x_start = round_down(__pa(_sdata), SECTION_SIZE);
-#else
- phys_addr_t kernel_x_start = round_down(__pa(_stext), SECTION_SIZE);
-#endif
+ phys_addr_t kernel_x_start = round_down(__pa(KERNEL_START), SECTION_SIZE);
phys_addr_t kernel_x_end = round_up(__pa(__init_end), SECTION_SIZE);
/* Map all the lowmem memory banks. */
#include <linux/kernel.h>
#include <asm/cacheflush.h>
+#include <asm/cp15.h>
#include <asm/sections.h>
#include <asm/page.h>
#include <asm/setup.h>
#include "mm.h"
+unsigned long vectors_base;
+
#ifdef CONFIG_ARM_MPU
struct mpu_rgn_info mpu_rgn_info;
}
/* MPU initialisation functions */
-void __init sanity_check_meminfo_mpu(void)
+void __init adjust_lowmem_bounds_mpu(void)
{
phys_addr_t phys_offset = PHYS_OFFSET;
phys_addr_t aligned_region_size, specified_mem_size, rounded_mem_size;
}
}
#else
-static void sanity_check_meminfo_mpu(void) {}
+static void adjust_lowmem_bounds_mpu(void) {}
static void __init mpu_setup(void) {}
#endif /* CONFIG_ARM_MPU */
+#ifdef CONFIG_CPU_CP15
+#ifdef CONFIG_CPU_HIGH_VECTOR
+static unsigned long __init setup_vectors_base(void)
+{
+ unsigned long reg = get_cr();
+
+ set_cr(reg | CR_V); /* CONFIG_CPU_HIGH_VECTOR build: write back the get_cr() value with CR_V set */
+ return 0xffff0000; /* vectors base handed back to the caller in this configuration */
+}
+#else /* CONFIG_CPU_HIGH_VECTOR */
+/*
+ * Write exception base address to VBAR.
+ *
+ * @val: value moved to coprocessor p15 register c12, c0, 0 by the mcr
+ *       instruction below.
+ *
+ * The asm statement has no outputs and declares "cc" as clobbered.
+ */
+static inline void set_vbar(unsigned long val)
+{
+ asm("mcr p15, 0, %0, c12, c0, 0" : : "r" (val) : "cc");
+}
+
+/*
+ * Security extensions, bits[7:4], permitted values,
+ * 0b0000 - not implemented, 0b0001/0b0010 - implemented
+ *
+ * Returns true when the field extracted from CPUID_EXT_PFR1 is non-zero.
+ * Despite the name, this reports whether the extensions are implemented
+ * (per the field values listed above); nothing here checks a runtime
+ * enable state.
+ */
+static inline bool security_extensions_enabled(void)
+{
+ return !!cpuid_feature_extract(CPUID_EXT_PFR1, 4);
+}
+
+static unsigned long __init setup_vectors_base(void)
+{
+ unsigned long base = 0, reg = get_cr();
+
+ set_cr(reg & ~CR_V); /* !CONFIG_CPU_HIGH_VECTOR build: write back the get_cr() value with CR_V cleared */
+ if (security_extensions_enabled()) {
+ if (IS_ENABLED(CONFIG_REMAP_VECTORS_TO_RAM))
+ base = CONFIG_DRAM_BASE;
+ set_vbar(base); /* VBAR receives CONFIG_DRAM_BASE when remapping is configured, otherwise 0 */
+ } else if (IS_ENABLED(CONFIG_REMAP_VECTORS_TO_RAM)) {
+ if (CONFIG_DRAM_BASE != 0) /* remap was requested, but VBAR is never written on this path */
+ pr_err("Security extensions not enabled, vectors cannot be remapped to RAM, vectors base will be 0x00000000\n");
+ }
+
+ return base; /* 0 unless the CONFIG_DRAM_BASE assignment above was taken */
+}
+#endif /* CONFIG_CPU_HIGH_VECTOR */
+#endif /* CONFIG_CPU_CP15 */
+
void __init arm_mm_memblock_reserve(void)
{
#ifndef CONFIG_CPU_V7M
+ vectors_base = IS_ENABLED(CONFIG_CPU_CP15) ? setup_vectors_base() : 0;
/*
* Register the exception vector page.
* some architectures which the DRAM is the exception vector to trap,
* alloc_page breaks with error, although it is not NULL, but "0."
*/
- memblock_reserve(CONFIG_VECTORS_BASE, 2 * PAGE_SIZE);
+ memblock_reserve(vectors_base, 2 * PAGE_SIZE);
#else /* ifndef CONFIG_CPU_V7M */
/*
* There is no dedicated vector page on V7-M. So nothing needs to be
#endif
}
-void __init sanity_check_meminfo(void)
+void __init adjust_lowmem_bounds(void)
{
phys_addr_t end;
- sanity_check_meminfo_mpu();
+ adjust_lowmem_bounds_mpu();
end = memblock_end_of_DRAM();
high_memory = __va(end - 1) + 1;
memblock_set_current_limit(end);
*/
void __init paging_init(const struct machine_desc *mdesc)
{
- early_trap_init((void *)CONFIG_VECTORS_BASE);
+ early_trap_init((void *)vectors_base);
mpu_setup();
bootmem_init();
}
--- /dev/null
+#include <linux/bug.h>
+#include <linux/export.h>
+#include <linux/types.h>
+#include <linux/mmdebug.h>
+#include <linux/mm.h>
+
+#include <asm/sections.h>
+#include <asm/memory.h>
+#include <asm/fixmap.h>
+#include <asm/dma.h>
+
+#include "mm.h"
+
+static inline bool __virt_addr_valid(unsigned long x)
+{
+ /*
+ * Returns true when x is an address that __virt_to_phys() below
+ * accepts without warning, false otherwise.
+ *
+ * high_memory does not get immediately defined, and there
+ * are early callers of __pa() against PAGE_OFFSET
+ *
+ * So while high_memory is still unset, anything at or above
+ * PAGE_OFFSET is accepted.
+ */
+ if (!high_memory && x >= PAGE_OFFSET)
+ return true;
+
+ if (high_memory && x >= PAGE_OFFSET && x < (unsigned long)high_memory) /* within [PAGE_OFFSET, high_memory) */
+ return true;
+
+ /*
+ * MAX_DMA_ADDRESS is a virtual address that may not correspond to an
+ * actual physical address. Enough code relies on __pa(MAX_DMA_ADDRESS)
+ * that we just need to work around it and always return true.
+ */
+ if (x == MAX_DMA_ADDRESS)
+ return true;
+
+ return false;
+}
+
+phys_addr_t __virt_to_phys(unsigned long x)
+{
+ WARN(!__virt_addr_valid(x),
+ "virt_to_phys used for non-linear address: %pK (%pS)\n",
+ (void *)x, (void *)x); /* x is printed twice: once as a pointer (%pK), once as a symbol (%pS) */
+
+ return __virt_to_phys_nodebug(x); /* translation is returned whether or not the check above fired */
+}
+EXPORT_SYMBOL(__virt_to_phys);
+
+phys_addr_t __phys_addr_symbol(unsigned long x)
+{
+ /* This is bounds checking against the kernel image only.
+ * __pa_symbol should only be used on kernel symbol addresses.
+ *
+ * The accepted range is inclusive at both ends: the check trips
+ * only for x below KERNEL_START or strictly above KERNEL_END.
+ * The result is whatever __pa_symbol_nodebug(x) yields.
+ */
+ VIRTUAL_BUG_ON(x < (unsigned long)KERNEL_START ||
+ x > (unsigned long)KERNEL_END);
+
+ return __pa_symbol_nodebug(x);
+}
+EXPORT_SYMBOL(__phys_addr_symbol);
"should never happen.\n", vector, smp_processor_id());
}
-__visible void smp_spurious_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs)
{
entering_irq();
__smp_spurious_interrupt(~regs->orig_ax);
exiting_irq();
}
-__visible void smp_trace_spurious_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_trace_spurious_interrupt(struct pt_regs *regs)
{
u8 vector = ~regs->orig_ax;
}
-__visible void smp_error_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs)
{
entering_irq();
__smp_error_interrupt(regs);
exiting_irq();
}
-__visible void smp_trace_error_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_trace_error_interrupt(struct pt_regs *regs)
{
entering_irq();
trace_error_apic_entry(ERROR_APIC_VECTOR);
__send_cleanup_vector(data);
}
-asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
+asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void)
{
unsigned vector, me;
deferred_error_int_vector();
}
-asmlinkage __visible void smp_deferred_error_interrupt(void)
+asmlinkage __visible void __irq_entry smp_deferred_error_interrupt(void)
{
entering_irq();
__smp_deferred_error_interrupt();
exiting_ack_irq();
}
-asmlinkage __visible void smp_trace_deferred_error_interrupt(void)
+asmlinkage __visible void __irq_entry smp_trace_deferred_error_interrupt(void)
{
entering_irq();
trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR);
smp_thermal_vector();
}
-asmlinkage __visible void smp_thermal_interrupt(struct pt_regs *regs)
+asmlinkage __visible void __irq_entry
+smp_thermal_interrupt(struct pt_regs *regs)
{
entering_irq();
__smp_thermal_interrupt();
exiting_ack_irq();
}
-asmlinkage __visible void smp_trace_thermal_interrupt(struct pt_regs *regs)
+asmlinkage __visible void __irq_entry
+smp_trace_thermal_interrupt(struct pt_regs *regs)
{
entering_irq();
trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
mce_threshold_vector();
}
-asmlinkage __visible void smp_threshold_interrupt(void)
+asmlinkage __visible void __irq_entry smp_threshold_interrupt(void)
{
entering_irq();
__smp_threshold_interrupt();
exiting_ack_irq();
}
-asmlinkage __visible void smp_trace_threshold_interrupt(void)
+asmlinkage __visible void __irq_entry smp_trace_threshold_interrupt(void)
{
entering_irq();
trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR);
x86_platform_ipi_callback();
}
-__visible void smp_x86_platform_ipi(struct pt_regs *regs)
+__visible void __irq_entry smp_x86_platform_ipi(struct pt_regs *regs)
{
struct pt_regs *old_regs = set_irq_regs(regs);
}
#endif
-__visible void smp_trace_x86_platform_ipi(struct pt_regs *regs)
+__visible void __irq_entry smp_trace_x86_platform_ipi(struct pt_regs *regs)
{
struct pt_regs *old_regs = set_irq_regs(regs);
#include <linux/hardirq.h>
#include <asm/apic.h>
#include <asm/trace/irq_vectors.h>
+#include <linux/interrupt.h>
static inline void __smp_irq_work_interrupt(void)
{
irq_work_run();
}
-__visible void smp_irq_work_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_irq_work_interrupt(struct pt_regs *regs)
{
ipi_entering_ack_irq();
__smp_irq_work_interrupt();
exiting_irq();
}
-__visible void smp_trace_irq_work_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_trace_irq_work_interrupt(struct pt_regs *regs)
{
ipi_entering_ack_irq();
trace_irq_work_entry(IRQ_WORK_VECTOR);
/* 0 means: find the address automatically */
if (crash_base <= 0) {
/*
- * kexec want bzImage is below CRASH_KERNEL_ADDR_MAX
+ * Set CRASH_ADDR_LOW_MAX upper bound for crash memory,
+ * as old kexec-tools loads bzImage below that, unless
+ * "crashkernel=size[KMG],high" is specified.
*/
crash_base = memblock_find_in_range(CRASH_ALIGN,
high ? CRASH_ADDR_HIGH_MAX
scheduler_ipi();
}
-__visible void smp_reschedule_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_reschedule_interrupt(struct pt_regs *regs)
{
ack_APIC_irq();
__smp_reschedule_interrupt();
*/
}
-__visible void smp_trace_reschedule_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_trace_reschedule_interrupt(struct pt_regs *regs)
{
/*
* Need to call irq_enter() before calling the trace point.
inc_irq_stat(irq_call_count);
}
-__visible void smp_call_function_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_call_function_interrupt(struct pt_regs *regs)
{
ipi_entering_ack_irq();
__smp_call_function_interrupt();
exiting_irq();
}
-__visible void smp_trace_call_function_interrupt(struct pt_regs *regs)
+__visible void __irq_entry
+smp_trace_call_function_interrupt(struct pt_regs *regs)
{
ipi_entering_ack_irq();
trace_call_function_entry(CALL_FUNCTION_VECTOR);
inc_irq_stat(irq_call_count);
}
-__visible void smp_call_function_single_interrupt(struct pt_regs *regs)
+__visible void __irq_entry
+smp_call_function_single_interrupt(struct pt_regs *regs)
{
ipi_entering_ack_irq();
__smp_call_function_single_interrupt();
exiting_irq();
}
-__visible void smp_trace_call_function_single_interrupt(struct pt_regs *regs)
+__visible void __irq_entry
+smp_trace_call_function_single_interrupt(struct pt_regs *regs)
{
ipi_entering_ack_irq();
trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR);
/DISCARD/ : {
*(.eh_frame)
*(__func_stack_frame_non_standard)
+ *(__unreachable)
}
}
#define RBD_FEATURE_LAYERING (1<<0)
#define RBD_FEATURE_STRIPINGV2 (1<<1)
#define RBD_FEATURE_EXCLUSIVE_LOCK (1<<2)
+#define RBD_FEATURE_DATA_POOL (1<<7)
#define RBD_FEATURES_ALL (RBD_FEATURE_LAYERING | \
RBD_FEATURE_STRIPINGV2 | \
- RBD_FEATURE_EXCLUSIVE_LOCK)
+ RBD_FEATURE_EXCLUSIVE_LOCK | \
+ RBD_FEATURE_DATA_POOL)
/* Features supported by this (client software) implementation. */
/* These six fields never change for a given rbd image */
char *object_prefix;
__u8 obj_order;
- __u8 crypt_type;
- __u8 comp_type;
u64 stripe_unit;
u64 stripe_count;
+ s64 data_pool_id;
u64 features; /* Might be changeable someday? */
/* The remaining fields need to be updated occasionally */
};
struct rbd_obj_request {
- const char *object_name;
+ u64 object_no;
u64 offset; /* object start byte */
u64 length; /* bytes from offset */
unsigned long flags;
static struct kmem_cache *rbd_img_request_cache;
static struct kmem_cache *rbd_obj_request_cache;
-static struct kmem_cache *rbd_segment_name_cache;
static int rbd_major;
static DEFINE_IDA(rbd_dev_id_ida);
return true;
}
+/*
+ * returns the size of an object in the image
+ *
+ * The size is 1 shifted left by header->obj_order, evaluated as an
+ * unsigned int (1U) and returned as a u32.
+ */
+static u32 rbd_obj_bytes(struct rbd_image_header *header)
+{
+ return 1U << header->obj_order;
+}
+
+static void rbd_init_layout(struct rbd_device *rbd_dev)
+{
+ if (rbd_dev->header.stripe_unit == 0 ||
+ rbd_dev->header.stripe_count == 0) { /* either striping field is zero: overwrite both in the header */
+ rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
+ rbd_dev->header.stripe_count = 1;
+ }
+
+ rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit; /* layout is filled from the (possibly just adjusted) header */
+ rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
+ rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
+ rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
+ rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id; /* CEPH_NOPOOL data_pool_id: use the pool id from rbd_dev->spec */
+ RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL); /* layout.pool_ns is initialised to NULL */
+}
+
/*
* Fill an rbd image header with information from the given format 1
* on-disk header.
/* Allocate this now to avoid having to handle failure below */
if (first_time) {
- size_t len;
-
- len = strnlen(ondisk->object_prefix,
- sizeof (ondisk->object_prefix));
- object_prefix = kmalloc(len + 1, GFP_KERNEL);
+ object_prefix = kstrndup(ondisk->object_prefix,
+ sizeof(ondisk->object_prefix),
+ GFP_KERNEL);
if (!object_prefix)
return -ENOMEM;
- memcpy(object_prefix, ondisk->object_prefix, len);
- object_prefix[len] = '\0';
}
/* Allocate the snapshot context and fill it in */
if (first_time) {
header->object_prefix = object_prefix;
header->obj_order = ondisk->options.order;
- header->crypt_type = ondisk->options.crypt_type;
- header->comp_type = ondisk->options.comp_type;
- /* The rest aren't used for format 1 images */
- header->stripe_unit = 0;
- header->stripe_count = 0;
- header->features = 0;
+ rbd_init_layout(rbd_dev);
} else {
ceph_put_snap_context(header->snapc);
kfree(header->snap_names);
rbd_dev->mapping.features = 0;
}
-static void rbd_segment_name_free(const char *name)
-{
- /* The explicit cast here is needed to drop the const qualifier */
-
- kmem_cache_free(rbd_segment_name_cache, (void *)name);
-}
-
-static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
-{
- char *name;
- u64 segment;
- int ret;
- char *name_format;
-
- name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO);
- if (!name)
- return NULL;
- segment = offset >> rbd_dev->header.obj_order;
- name_format = "%s.%012llx";
- if (rbd_dev->image_format == 2)
- name_format = "%s.%016llx";
- ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format,
- rbd_dev->header.object_prefix, segment);
- if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) {
- pr_err("error formatting segment name for #%llu (%d)\n",
- segment, ret);
- rbd_segment_name_free(name);
- name = NULL;
- }
-
- return name;
-}
-
static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
{
- u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
+ u64 segment_size = rbd_obj_bytes(&rbd_dev->header);
return offset & (segment_size - 1);
}
static u64 rbd_segment_length(struct rbd_device *rbd_dev,
u64 offset, u64 length)
{
- u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
+ u64 segment_size = rbd_obj_bytes(&rbd_dev->header);
offset &= segment_size - 1;
return length;
}
-/*
- * returns the size of an object in the image
- */
-static u64 rbd_obj_bytes(struct rbd_image_header *header)
-{
- return 1 << header->obj_order;
-}
-
/*
* bio helpers
*/
{
struct ceph_osd_request *osd_req = obj_request->osd_req;
- dout("%s %p osd_req %p\n", __func__, obj_request, osd_req);
+ dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__,
+ obj_request, obj_request->object_no, obj_request->offset,
+ obj_request->length, osd_req);
if (obj_request_img_data_test(obj_request)) {
WARN_ON(obj_request->callback != rbd_img_obj_callback);
rbd_img_request_get(obj_request->img_request);
ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
}
-static void rbd_obj_request_end(struct rbd_obj_request *obj_request)
-{
- dout("%s %p\n", __func__, obj_request);
- ceph_osdc_cancel_request(obj_request->osd_req);
-}
-
-/*
- * Wait for an object request to complete. If interrupted, cancel the
- * underlying osd request.
- *
- * @timeout: in jiffies, 0 means "wait forever"
- */
-static int __rbd_obj_request_wait(struct rbd_obj_request *obj_request,
- unsigned long timeout)
-{
- long ret;
-
- dout("%s %p\n", __func__, obj_request);
- ret = wait_for_completion_interruptible_timeout(
- &obj_request->completion,
- ceph_timeout_jiffies(timeout));
- if (ret <= 0) {
- if (ret == 0)
- ret = -ETIMEDOUT;
- rbd_obj_request_end(obj_request);
- } else {
- ret = 0;
- }
-
- dout("%s %p ret %d\n", __func__, obj_request, (int)ret);
- return ret;
-}
-
-static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
-{
- return __rbd_obj_request_wait(obj_request, 0);
-}
-
static void rbd_img_request_complete(struct rbd_img_request *img_request)
{
rbd_osd_call_callback(obj_request);
break;
default:
- rbd_warn(NULL, "%s: unsupported op %hu",
- obj_request->object_name, (unsigned short) opcode);
+ rbd_warn(NULL, "unexpected OSD op: object_no %016llx opcode %d",
+ obj_request->object_no, opcode);
break;
}
osd_req->r_data_offset = obj_request->offset;
}
+static struct ceph_osd_request *
+__rbd_osd_req_create(struct rbd_device *rbd_dev,
+ struct ceph_snap_context *snapc,
+ int num_ops, unsigned int flags,
+ struct rbd_obj_request *obj_request)
+{
+ struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+ struct ceph_osd_request *req;
+ const char *name_format = rbd_dev->image_format == 1 ?
+ RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT; /* object-name format string picked by image format */
+
+ req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
+ if (!req)
+ return NULL; /* allocation failed: nothing to release yet */
+
+ req->r_flags = flags;
+ req->r_callback = rbd_osd_req_callback;
+ req->r_priv = obj_request; /* obj_request is stored on the request as its private pointer */
+
+ req->r_base_oloc.pool = rbd_dev->layout.pool_id;
+ if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
+ rbd_dev->header.object_prefix, obj_request->object_no)) /* object id built from the image's prefix and this request's object_no */
+ goto err_req;
+
+ if (ceph_osdc_alloc_messages(req, GFP_NOIO))
+ goto err_req;
+
+ return req;
+
+err_req:
+ ceph_osdc_put_request(req); /* release req; every failure is reported to the caller as NULL */
+ return NULL;
+}
+
/*
* Create an osd request. A read request has one osd op (read).
* A write request has either one (watch) or two (hint+write) osd ops.
struct rbd_obj_request *obj_request)
{
struct ceph_snap_context *snapc = NULL;
- struct ceph_osd_client *osdc;
- struct ceph_osd_request *osd_req;
if (obj_request_img_data_test(obj_request) &&
(op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) {
rbd_assert(num_ops == 1 || ((op_type == OBJ_OP_WRITE) && num_ops == 2));
- /* Allocate and initialize the request, for the num_ops ops */
-
- osdc = &rbd_dev->rbd_client->client->osdc;
- osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false,
- GFP_NOIO);
- if (!osd_req)
- goto fail;
-
- if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
- osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
- else
- osd_req->r_flags = CEPH_OSD_FLAG_READ;
-
- osd_req->r_callback = rbd_osd_req_callback;
- osd_req->r_priv = obj_request;
-
- osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id;
- if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
- obj_request->object_name))
- goto fail;
-
- if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO))
- goto fail;
-
- return osd_req;
-
-fail:
- ceph_osdc_put_request(osd_req);
- return NULL;
+ return __rbd_osd_req_create(rbd_dev, snapc, num_ops,
+ (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) ?
+ CEPH_OSD_FLAG_WRITE : CEPH_OSD_FLAG_READ, obj_request);
}
/*
rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
{
struct rbd_img_request *img_request;
- struct ceph_snap_context *snapc;
- struct rbd_device *rbd_dev;
- struct ceph_osd_client *osdc;
- struct ceph_osd_request *osd_req;
int num_osd_ops = 3;
rbd_assert(obj_request_img_data_test(obj_request));
if (img_request_discard_test(img_request))
num_osd_ops = 2;
- /* Allocate and initialize the request, for all the ops */
-
- snapc = img_request->snapc;
- rbd_dev = img_request->rbd_dev;
- osdc = &rbd_dev->rbd_client->client->osdc;
- osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops,
- false, GFP_NOIO);
- if (!osd_req)
- goto fail;
-
- osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
- osd_req->r_callback = rbd_osd_req_callback;
- osd_req->r_priv = obj_request;
-
- osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id;
- if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
- obj_request->object_name))
- goto fail;
-
- if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO))
- goto fail;
-
- return osd_req;
-
-fail:
- ceph_osdc_put_request(osd_req);
- return NULL;
+ return __rbd_osd_req_create(img_request->rbd_dev,
+ img_request->snapc, num_osd_ops,
+ CEPH_OSD_FLAG_WRITE, obj_request);
}
-
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
ceph_osdc_put_request(osd_req);
}
-/* object_name is assumed to be a non-null pointer and NUL-terminated */
-
-static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
- u64 offset, u64 length,
- enum obj_request_type type)
+static struct rbd_obj_request *
+rbd_obj_request_create(enum obj_request_type type)
{
struct rbd_obj_request *obj_request;
- size_t size;
- char *name;
rbd_assert(obj_request_type_valid(type));
- size = strlen(object_name) + 1;
- name = kmalloc(size, GFP_NOIO);
- if (!name)
- return NULL;
-
obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
- if (!obj_request) {
- kfree(name);
+ if (!obj_request)
return NULL;
- }
- obj_request->object_name = memcpy(name, object_name, size);
- obj_request->offset = offset;
- obj_request->length = length;
- obj_request->flags = 0;
obj_request->which = BAD_WHICH;
obj_request->type = type;
INIT_LIST_HEAD(&obj_request->links);
init_completion(&obj_request->completion);
kref_init(&obj_request->kref);
- dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
- offset, length, (int)type, obj_request);
-
+ dout("%s %p\n", __func__, obj_request);
return obj_request;
}
break;
}
- kfree(obj_request->object_name);
- obj_request->object_name = NULL;
kmem_cache_free(rbd_obj_request_cache, obj_request);
}
while (resid) {
struct ceph_osd_request *osd_req;
- const char *object_name;
- u64 offset;
- u64 length;
+ u64 object_no = img_offset >> rbd_dev->header.obj_order;
+ u64 offset = rbd_segment_offset(rbd_dev, img_offset);
+ u64 length = rbd_segment_length(rbd_dev, img_offset, resid);
- object_name = rbd_segment_name(rbd_dev, img_offset);
- if (!object_name)
- goto out_unwind;
- offset = rbd_segment_offset(rbd_dev, img_offset);
- length = rbd_segment_length(rbd_dev, img_offset, resid);
- obj_request = rbd_obj_request_create(object_name,
- offset, length, type);
- /* object request has its own copy of the object name */
- rbd_segment_name_free(object_name);
+ obj_request = rbd_obj_request_create(type);
if (!obj_request)
goto out_unwind;
+ obj_request->object_no = object_no;
+ obj_request->offset = offset;
+ obj_request->length = length;
+
/*
* set obj_request->img_request before creating the
* osd_request so that it gets the right snapc
* child image to which the original request was to be sent.
*/
img_offset = obj_request->img_offset - obj_request->offset;
- length = (u64)1 << rbd_dev->header.obj_order;
+ length = rbd_obj_bytes(&rbd_dev->header);
/*
* There is no defined parent data beyond the parent
size_t size;
int ret;
- stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
- OBJ_REQUEST_PAGES);
+ stat_request = rbd_obj_request_create(OBJ_REQUEST_PAGES);
if (!stat_request)
return -ENOMEM;
+ stat_request->object_no = obj_request->object_no;
+
stat_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
stat_request);
if (!stat_request->osd_req) {
* returned in the outbound buffer, or a negative error code.
*/
static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
- const char *object_name,
- const char *class_name,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
const char *method_name,
const void *outbound,
size_t outbound_size,
void *inbound,
size_t inbound_size)
{
- struct rbd_obj_request *obj_request;
- struct page **pages;
- u32 page_count;
+ struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+ struct page *req_page = NULL;
+ struct page *reply_page;
int ret;
/*
* method. Currently if this is present it will be a
* snapshot id.
*/
- page_count = (u32)calc_pages_for(0, inbound_size);
- pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
- if (IS_ERR(pages))
- return PTR_ERR(pages);
-
- ret = -ENOMEM;
- obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
- OBJ_REQUEST_PAGES);
- if (!obj_request)
- goto out;
+ if (outbound) {
+ if (outbound_size > PAGE_SIZE)
+ return -E2BIG;
- obj_request->pages = pages;
- obj_request->page_count = page_count;
-
- obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
- obj_request);
- if (!obj_request->osd_req)
- goto out;
-
- osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
- class_name, method_name);
- if (outbound_size) {
- struct ceph_pagelist *pagelist;
-
- pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
- if (!pagelist)
- goto out;
+ req_page = alloc_page(GFP_KERNEL);
+ if (!req_page)
+ return -ENOMEM;
- ceph_pagelist_init(pagelist);
- ceph_pagelist_append(pagelist, outbound, outbound_size);
- osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
- pagelist);
+ memcpy(page_address(req_page), outbound, outbound_size);
}
- osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
- obj_request->pages, inbound_size,
- 0, false, false);
-
- rbd_obj_request_submit(obj_request);
- ret = rbd_obj_request_wait(obj_request);
- if (ret)
- goto out;
- ret = obj_request->result;
- if (ret < 0)
- goto out;
+ reply_page = alloc_page(GFP_KERNEL);
+ if (!reply_page) {
+ if (req_page)
+ __free_page(req_page);
+ return -ENOMEM;
+ }
- rbd_assert(obj_request->xferred < (u64)INT_MAX);
- ret = (int)obj_request->xferred;
- ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
-out:
- if (obj_request)
- rbd_obj_request_put(obj_request);
- else
- ceph_release_page_vector(pages, page_count);
+ ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
+ CEPH_OSD_FLAG_READ, req_page, outbound_size,
+ reply_page, &inbound_size);
+ if (!ret) {
+ memcpy(inbound, page_address(reply_page), inbound_size);
+ ret = inbound_size;
+ }
+ if (req_page)
+ __free_page(req_page);
+ __free_page(reply_page);
return ret;
}
}
static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
- const char *object_name,
- u64 offset, u64 length, void *buf)
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ void *buf, int buf_len)
{
- struct rbd_obj_request *obj_request;
- struct page **pages = NULL;
- u32 page_count;
- size_t size;
+ struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+ struct ceph_osd_request *req;
+ struct page **pages;
+ int num_pages = calc_pages_for(0, buf_len);
int ret;
- page_count = (u32) calc_pages_for(offset, length);
- pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
- if (IS_ERR(pages))
- return PTR_ERR(pages);
-
- ret = -ENOMEM;
- obj_request = rbd_obj_request_create(object_name, offset, length,
- OBJ_REQUEST_PAGES);
- if (!obj_request)
- goto out;
-
- obj_request->pages = pages;
- obj_request->page_count = page_count;
-
- obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
- obj_request);
- if (!obj_request->osd_req)
- goto out;
+ req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
+ if (!req)
+ return -ENOMEM;
- osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
- offset, length, 0, 0);
- osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
- obj_request->pages,
- obj_request->length,
- obj_request->offset & ~PAGE_MASK,
- false, false);
+ ceph_oid_copy(&req->r_base_oid, oid);
+ ceph_oloc_copy(&req->r_base_oloc, oloc);
+ req->r_flags = CEPH_OSD_FLAG_READ;
- rbd_obj_request_submit(obj_request);
- ret = rbd_obj_request_wait(obj_request);
+ ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
if (ret)
- goto out;
+ goto out_req;
- ret = obj_request->result;
- if (ret < 0)
- goto out;
+ pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
+ if (IS_ERR(pages)) {
+ ret = PTR_ERR(pages);
+ goto out_req;
+ }
- rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
- size = (size_t) obj_request->xferred;
- ceph_copy_from_page_vector(pages, buf, 0, size);
- rbd_assert(size <= (size_t)INT_MAX);
- ret = (int)size;
-out:
- if (obj_request)
- rbd_obj_request_put(obj_request);
- else
- ceph_release_page_vector(pages, page_count);
+ osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
+ osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
+ true);
+
+ ceph_osdc_start_request(osdc, req, false);
+ ret = ceph_osdc_wait_request(osdc, req);
+ if (ret >= 0)
+ ceph_copy_from_page_vector(pages, buf, 0, ret);
+out_req:
+ ceph_osdc_put_request(req);
return ret;
}
if (!ondisk)
return -ENOMEM;
- ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_oid.name,
- 0, size, ondisk);
+ ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
+ &rbd_dev->header_oloc, ondisk, size);
if (ret < 0)
goto out;
if ((size_t)ret < size) {
static void rbd_dev_release(struct device *dev);
-static struct device_type rbd_device_type = {
+static const struct device_type rbd_device_type = {
.name = "rbd",
.groups = rbd_attr_groups,
.release = rbd_dev_release,
INIT_LIST_HEAD(&rbd_dev->node);
init_rwsem(&rbd_dev->header_rwsem);
+ rbd_dev->header.data_pool_id = CEPH_NOPOOL;
ceph_oid_init(&rbd_dev->header_oid);
- ceph_oloc_init(&rbd_dev->header_oloc);
+ rbd_dev->header_oloc.pool = spec->pool_id;
mutex_init(&rbd_dev->watch_mutex);
rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
rbd_dev->rbd_client = rbdc;
rbd_dev->spec = spec;
- rbd_dev->layout.stripe_unit = 1 << RBD_MAX_OBJ_ORDER;
- rbd_dev->layout.stripe_count = 1;
- rbd_dev->layout.object_size = 1 << RBD_MAX_OBJ_ORDER;
- rbd_dev->layout.pool_id = spec->pool_id;
- RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
-
return rbd_dev;
}
__le64 size;
} __attribute__ ((packed)) size_buf = { 0 };
- ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
- "rbd", "get_size",
- &snapid, sizeof (snapid),
- &size_buf, sizeof (size_buf));
+ ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+ &rbd_dev->header_oloc, "get_size",
+ &snapid, sizeof(snapid),
+ &size_buf, sizeof(size_buf));
dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
if (ret < 0)
return ret;
if (!reply_buf)
return -ENOMEM;
- ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
- "rbd", "get_object_prefix", NULL, 0,
- reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
+ ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+ &rbd_dev->header_oloc, "get_object_prefix",
+ NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
if (ret < 0)
goto out;
u64 unsup;
int ret;
- ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
- "rbd", "get_features",
- &snapid, sizeof (snapid),
- &features_buf, sizeof (features_buf));
+ ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+ &rbd_dev->header_oloc, "get_features",
+ &snapid, sizeof(snapid),
+ &features_buf, sizeof(features_buf));
dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
if (ret < 0)
return ret;
}
snapid = cpu_to_le64(rbd_dev->spec->snap_id);
- ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
- "rbd", "get_parent",
- &snapid, sizeof (snapid),
- reply_buf, size);
+ ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+ &rbd_dev->header_oloc, "get_parent",
+ &snapid, sizeof(snapid), reply_buf, size);
dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
if (ret < 0)
goto out_err;
u64 stripe_count;
int ret;
- ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
- "rbd", "get_stripe_unit_count", NULL, 0,
- (char *)&striping_info_buf, size);
+ ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+ &rbd_dev->header_oloc, "get_stripe_unit_count",
+ NULL, 0, &striping_info_buf, size);
dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
if (ret < 0)
return ret;
* out, and only fail if the image has non-default values.
*/
ret = -EINVAL;
- obj_size = (u64)1 << rbd_dev->header.obj_order;
+ obj_size = rbd_obj_bytes(&rbd_dev->header);
p = &striping_info_buf;
stripe_unit = ceph_decode_64(&p);
if (stripe_unit != obj_size) {
return 0;
}
+static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
+{
+ __le64 data_pool_id;
+ int ret;
+
+ ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+ &rbd_dev->header_oloc, "get_data_pool",
+ NULL, 0, &data_pool_id, sizeof(data_pool_id));
+ if (ret < 0)
+ return ret;
+ if (ret < sizeof(data_pool_id))
+ return -EBADMSG;
+
+ rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
+ WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
+ return 0;
+}
+
static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
{
+ CEPH_DEFINE_OID_ONSTACK(oid);
size_t image_id_size;
char *image_id;
void *p;
if (!reply_buf)
goto out;
- ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
- "rbd", "dir_get_name",
- image_id, image_id_size,
- reply_buf, size);
+ ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
+ ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
+ "dir_get_name", image_id, image_id_size,
+ reply_buf, size);
if (ret < 0)
goto out;
p = reply_buf;
if (!reply_buf)
return -ENOMEM;
- ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
- "rbd", "get_snapcontext", NULL, 0,
- reply_buf, size);
+ ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+ &rbd_dev->header_oloc, "get_snapcontext",
+ NULL, 0, reply_buf, size);
dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
if (ret < 0)
goto out;
return ERR_PTR(-ENOMEM);
snapid = cpu_to_le64(snap_id);
- ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
- "rbd", "get_snapshot_name",
- &snapid, sizeof (snapid),
- reply_buf, size);
+ ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+ &rbd_dev->header_oloc, "get_snapshot_name",
+ &snapid, sizeof(snapid), reply_buf, size);
dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
if (ret < 0) {
snap_name = ERR_PTR(ret);
{
int ret;
size_t size;
- char *object_name;
+ CEPH_DEFINE_OID_ONSTACK(oid);
void *response;
char *image_id;
* First, see if the format 2 image id file exists, and if
* so, get the image's persistent id from it.
*/
- size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
- object_name = kmalloc(size, GFP_NOIO);
- if (!object_name)
- return -ENOMEM;
- sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
- dout("rbd id object name is %s\n", object_name);
+ ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
+ rbd_dev->spec->image_name);
+ if (ret)
+ return ret;
+
+ dout("rbd id object name is %s\n", oid.name);
/* Response will be an encoded string, which includes a length */
/* If it doesn't exist we'll assume it's a format 1 image */
- ret = rbd_obj_method_sync(rbd_dev, object_name,
- "rbd", "get_id", NULL, 0,
- response, RBD_IMAGE_ID_LEN_MAX);
+ ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
+ "get_id", NULL, 0,
+ response, RBD_IMAGE_ID_LEN_MAX);
dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
if (ret == -ENOENT) {
image_id = kstrdup("", GFP_KERNEL);
}
out:
kfree(response);
- kfree(object_name);
-
+ ceph_oid_destroy(&oid);
return ret;
}
if (ret < 0)
goto out_err;
}
- /* No support for crypto and compression type format 2 images */
+ if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
+ ret = rbd_dev_v2_data_pool(rbd_dev);
+ if (ret)
+ goto out_err;
+ }
+
+ rbd_init_layout(rbd_dev);
return 0;
+
out_err:
rbd_dev->header.features = 0;
kfree(rbd_dev->header.object_prefix);
rbd_dev->header.object_prefix = NULL;
-
return ret;
}
/* Record the header object name for this rbd image. */
rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
-
- rbd_dev->header_oloc.pool = rbd_dev->layout.pool_id;
if (rbd_dev->image_format == 1)
ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
spec->image_name, RBD_SUFFIX);
if (!rbd_obj_request_cache)
goto out_err;
- rbd_assert(!rbd_segment_name_cache);
- rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
- CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL);
- if (rbd_segment_name_cache)
- return 0;
-out_err:
- kmem_cache_destroy(rbd_obj_request_cache);
- rbd_obj_request_cache = NULL;
+ return 0;
+out_err:
kmem_cache_destroy(rbd_img_request_cache);
rbd_img_request_cache = NULL;
-
return -ENOMEM;
}
static void rbd_slab_exit(void)
{
- rbd_assert(rbd_segment_name_cache);
- kmem_cache_destroy(rbd_segment_name_cache);
- rbd_segment_name_cache = NULL;
-
rbd_assert(rbd_obj_request_cache);
kmem_cache_destroy(rbd_obj_request_cache);
rbd_obj_request_cache = NULL;
*/
#define RBD_HEADER_PREFIX "rbd_header."
-#define RBD_DATA_PREFIX "rbd_data."
#define RBD_ID_PREFIX "rbd_id."
+#define RBD_V2_DATA_FORMAT "%s.%016llx"
#define RBD_LOCK_NAME "rbd_lock"
#define RBD_LOCK_TAG "internal"
/*
* For format version 1, rbd image 'foo' consists of objects
* foo.rbd - image metadata
- * rb.<idhi>.<idlo>.00000000
- * rb.<idhi>.<idlo>.00000001
+ * rb.<idhi>.<idlo>.<extra>.000000000000
+ * rb.<idhi>.<idlo>.<extra>.000000000001
* ... - data
* There is no notion of a persistent image id in rbd format 1.
*/
#define RBD_SUFFIX ".rbd"
+#define RBD_V1_DATA_FORMAT "%s.%012llx"
#define RBD_DIRECTORY "rbd_directory"
#define RBD_INFO "rbd_info"
#define RBD_MIN_OBJ_ORDER 16
#define RBD_MAX_OBJ_ORDER 30
-#define RBD_COMP_NONE 0
-#define RBD_CRYPT_NONE 0
-
#define RBD_HEADER_TEXT "<<< Rados Block Device Image >>>\n"
#define RBD_HEADER_SIGNATURE "RBD"
#define RBD_HEADER_VERSION "001.005"
.cable_detect = palm_bk3710_cable_detect,
};
-static struct ide_port_info palm_bk3710_port_info = {
+static struct ide_port_info palm_bk3710_port_info __initdata = {
.init_dma = palm_bk3710_init_dma,
.port_ops = &palm_bk3710_ports_ops,
.dma_ops = &sff_dma_ops,
/* blob */
#define NUM_BLOB_BLOCKS FLASH_NUMBLOCKS_16m_PARAM
-#define BLOB_START 0x00000000
-#define BLOB_LEN (NUM_BLOB_BLOCKS * FLASH_BLOCKSIZE_PARAM)
+#define PART_BLOB_START 0x00000000
+#define PART_BLOB_LEN (NUM_BLOB_BLOCKS * FLASH_BLOCKSIZE_PARAM)
/* kernel */
#define NUM_KERNEL_BLOCKS 7
-#define KERNEL_START (BLOB_START + BLOB_LEN)
-#define KERNEL_LEN (NUM_KERNEL_BLOCKS * FLASH_BLOCKSIZE_MAIN)
+#define PART_KERNEL_START (PART_BLOB_START + PART_BLOB_LEN)
+#define PART_KERNEL_LEN (NUM_KERNEL_BLOCKS * FLASH_BLOCKSIZE_MAIN)
/* initial ramdisk */
#define NUM_INITRD_BLOCKS 24
-#define INITRD_START (KERNEL_START + KERNEL_LEN)
-#define INITRD_LEN (NUM_INITRD_BLOCKS * FLASH_BLOCKSIZE_MAIN)
+#define PART_INITRD_START (PART_KERNEL_START + PART_KERNEL_LEN)
+#define PART_INITRD_LEN (NUM_INITRD_BLOCKS * FLASH_BLOCKSIZE_MAIN)
/*
* See section 4.0 in "3 Volt Fast Boot Block Flash Memory" Intel Datasheet
/* blob */
{
.name = "blob",
- .offset = BLOB_START,
- .size = BLOB_LEN,
+ .offset = PART_BLOB_START,
+ .size = PART_BLOB_LEN,
},
/* kernel */
{
.name = "kernel",
- .offset = KERNEL_START, /* MTDPART_OFS_APPEND */
- .size = KERNEL_LEN,
+ .offset = PART_KERNEL_START, /* MTDPART_OFS_APPEND */
+ .size = PART_KERNEL_LEN,
},
/* initial ramdisk / file system */
{
.name = "file system",
- .offset = INITRD_START, /* MTDPART_OFS_APPEND */
- .size = INITRD_LEN, /* MTDPART_SIZ_FULL */
+ .offset = PART_INITRD_START, /* MTDPART_OFS_APPEND */
+ .size = PART_INITRD_LEN, /* MTDPART_SIZ_FULL */
}
};
#define NUM_PARTITIONS ARRAY_SIZE(lart_partitions)
static int xgene_enet_setup_mss(struct net_device *ndev, u32 mss)
{
struct xgene_enet_pdata *pdata = netdev_priv(ndev);
- bool mss_index_found = false;
- int mss_index;
+ int mss_index = -EBUSY;
int i;
spin_lock(&pdata->mss_lock);
/* Reuse the slot if MSS matches */
- for (i = 0; !mss_index_found && i < NUM_MSS_REG; i++) {
+ for (i = 0; mss_index < 0 && i < NUM_MSS_REG; i++) {
if (pdata->mss[i] == mss) {
pdata->mss_refcnt[i]++;
mss_index = i;
- mss_index_found = true;
}
}
/* Overwrite the slot with ref_count = 0 */
- for (i = 0; !mss_index_found && i < NUM_MSS_REG; i++) {
+ for (i = 0; mss_index < 0 && i < NUM_MSS_REG; i++) {
if (!pdata->mss_refcnt[i]) {
pdata->mss_refcnt[i]++;
pdata->mac_ops->set_mss(pdata, mss, i);
pdata->mss[i] = mss;
mss_index = i;
- mss_index_found = true;
}
}
- /* No slots with ref_count = 0 available, return busy */
- if (!mss_index_found)
- mss_index = -EBUSY;
-
spin_unlock(&pdata->mss_lock);
return mss_index;
}
}
+#define MLX4_EN_WRAP_AROUND_SEC 10UL
+/* By scheduling the overflow check every 5 seconds, we have a reasonably
+ * good chance we won't miss a wrap around.
+ * TODO: Use a timer instead of a work queue to increase the guarantee.
+ */
+#define MLX4_EN_OVERFLOW_PERIOD (MLX4_EN_WRAP_AROUND_SEC * HZ / 2)
+
void mlx4_en_ptp_overflow_check(struct mlx4_en_dev *mdev)
{
bool timeout = time_is_before_jiffies(mdev->last_overflow_check +
- mdev->overflow_period);
+ MLX4_EN_OVERFLOW_PERIOD);
unsigned long flags;
if (timeout) {
.enable = mlx4_en_phc_enable,
};
-#define MLX4_EN_WRAP_AROUND_SEC 10ULL
/* This function calculates the max shift that enables the user range
* of MLX4_EN_WRAP_AROUND_SEC values in the cycles register.
{
struct mlx4_dev *dev = mdev->dev;
unsigned long flags;
- u64 ns, zero = 0;
/* mlx4_en_init_timestamp is called for each netdev.
* mdev->ptp_clock is common for all ports, skip initialization if
ktime_to_ns(ktime_get_real()));
write_sequnlock_irqrestore(&mdev->clock_lock, flags);
- /* Calculate period in seconds to call the overflow watchdog - to make
- * sure counter is checked at least once every wrap around.
- */
- ns = cyclecounter_cyc2ns(&mdev->cycles, mdev->cycles.mask, zero, &zero);
- do_div(ns, NSEC_PER_SEC / 2 / HZ);
- mdev->overflow_period = ns;
-
/* Configure the PHC */
mdev->ptp_clock_info = mlx4_en_ptp_clock_info;
snprintf(mdev->ptp_clock_info.name, 16, "mlx4 ptp");
seqlock_t clock_lock;
struct timecounter clock;
unsigned long last_overflow_check;
- unsigned long overflow_period;
struct ptp_clock *ptp_clock;
struct ptp_clock_info ptp_clock_info;
struct notifier_block nb;
* s2io_nic structure.
* @regs : pointer to the structure with parameters given by ethtool for
* dumping the registers.
- * @reg_space: The input argumnet into which all the registers are dumped.
+ * @reg_space: The input argument into which all the registers are dumped.
* Description:
* Dumps the entire register space of xFrame NIC into the user given
* buffer area.
* @dev: device pointer.
* @regs: pointer to the structure with parameters given by ethtool for
* dumping the registers.
- * @reg_space: The input argumnet into which all the registers are dumped.
+ * @reg_space: The input argument into which all the registers are dumped.
*
* Dumps the vpath register space of Titan NIC into the user given
* buffer area.
#define OOO_LB_TC 9
int qed_configure_vport_wfq(struct qed_dev *cdev, u16 vp_id, u32 rate);
-void qed_configure_vp_wfq_on_link_change(struct qed_dev *cdev, u32 min_pf_rate);
+void qed_configure_vp_wfq_on_link_change(struct qed_dev *cdev,
+ struct qed_ptt *p_ptt,
+ u32 min_pf_rate);
void qed_clean_wfq_db(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt);
#define QED_LEADING_HWFN(dev) (&dev->hwfns[0])
}
/* API to configure WFQ from mcp link change */
-void qed_configure_vp_wfq_on_link_change(struct qed_dev *cdev, u32 min_pf_rate)
+void qed_configure_vp_wfq_on_link_change(struct qed_dev *cdev,
+ struct qed_ptt *p_ptt, u32 min_pf_rate)
{
int i;
for_each_hwfn(cdev, i) {
struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
- __qed_configure_vp_wfq_on_link_change(p_hwfn,
- p_hwfn->p_dpc_ptt,
+ __qed_configure_vp_wfq_on_link_change(p_hwfn, p_ptt,
min_pf_rate);
}
}
/* Min bandwidth configuration */
__qed_configure_pf_min_bandwidth(p_hwfn, p_ptt, p_link, min_bw);
- qed_configure_vp_wfq_on_link_change(p_hwfn->cdev, p_link->min_pf_rate);
+ qed_configure_vp_wfq_on_link_change(p_hwfn->cdev, p_ptt,
+ p_link->min_pf_rate);
p_link->an = !!(status & LINK_STATUS_AUTO_NEGOTIATE_ENABLED);
p_link->an_complete = !!(status &
ack_vfs[vfid / 32] |= BIT((vfid % 32));
p_hwfn->pf_iov_info->pending_flr[rel_vf_id / 64] &=
~(1ULL << (rel_vf_id % 64));
- p_hwfn->pf_iov_info->pending_events[rel_vf_id / 64] &=
- ~(1ULL << (rel_vf_id % 64));
+ p_vf->vf_mbx.b_pending_msg = false;
}
return rc;
mbx = &p_vf->vf_mbx;
/* qed_iov_process_mbx_request */
- DP_VERBOSE(p_hwfn, QED_MSG_IOV,
- "VF[%02x]: Processing mailbox message\n", p_vf->abs_vf_id);
+ if (!mbx->b_pending_msg) {
+ DP_NOTICE(p_hwfn,
+ "VF[%02x]: Trying to process mailbox message when none is pending\n",
+ p_vf->abs_vf_id);
+ return;
+ }
+ mbx->b_pending_msg = false;
mbx->first_tlv = mbx->req_virt->first_tlv;
+ DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+ "VF[%02x]: Processing mailbox message [type %04x]\n",
+ p_vf->abs_vf_id, mbx->first_tlv.tl.type);
+
/* check if tlv type is known */
if (qed_iov_tlv_supported(mbx->first_tlv.tl.type) &&
!p_vf->b_malicious) {
}
}
-static void qed_iov_pf_add_pending_events(struct qed_hwfn *p_hwfn, u8 vfid)
+void qed_iov_pf_get_pending_events(struct qed_hwfn *p_hwfn, u64 *events)
{
- u64 add_bit = 1ULL << (vfid % 64);
+ int i;
- p_hwfn->pf_iov_info->pending_events[vfid / 64] |= add_bit;
-}
+ memset(events, 0, sizeof(u64) * QED_VF_ARRAY_LENGTH);
-static void qed_iov_pf_get_and_clear_pending_events(struct qed_hwfn *p_hwfn,
- u64 *events)
-{
- u64 *p_pending_events = p_hwfn->pf_iov_info->pending_events;
+ qed_for_each_vf(p_hwfn, i) {
+ struct qed_vf_info *p_vf;
- memcpy(events, p_pending_events, sizeof(u64) * QED_VF_ARRAY_LENGTH);
- memset(p_pending_events, 0, sizeof(u64) * QED_VF_ARRAY_LENGTH);
+ p_vf = &p_hwfn->pf_iov_info->vfs_array[i];
+ if (p_vf->vf_mbx.b_pending_msg)
+ events[i / 64] |= 1ULL << (i % 64);
+ }
}
static struct qed_vf_info *qed_sriov_get_vf_from_absid(struct qed_hwfn *p_hwfn,
p_vf->vf_mbx.pending_req = (((u64)vf_msg->hi) << 32) | vf_msg->lo;
/* Mark the event and schedule the workqueue */
- qed_iov_pf_add_pending_events(p_hwfn, p_vf->relative_vf_id);
+ p_vf->vf_mbx.b_pending_msg = true;
qed_schedule_iov(p_hwfn, QED_IOV_WQ_MSG_FLAG);
return 0;
return;
}
- qed_iov_pf_get_and_clear_pending_events(hwfn, events);
+ qed_iov_pf_get_pending_events(hwfn, events);
DP_VERBOSE(hwfn, QED_MSG_IOV,
"Event mask of VF events: 0x%llx 0x%llx 0x%llx\n",
/* Address in VF where a pending message is located */
dma_addr_t pending_req;
+ /* Message from VF awaits handling */
+ bool b_pending_msg;
+
u8 *offset;
/* saved VF request header */
*/
struct qed_pf_iov {
struct qed_vf_info vfs_array[MAX_NUM_VFS];
- u64 pending_events[QED_VF_ARRAY_LENGTH];
u64 pending_flr[QED_VF_ARRAY_LENGTH];
/* Allocate message address continuosuly and split to each VF */
/* Configure the AXI Bus Mode Register */
void (*axi)(void __iomem *ioaddr, struct stmmac_axi *axi);
/* Dump DMA registers */
- void (*dump_regs) (void __iomem *ioaddr);
+ void (*dump_regs)(void __iomem *ioaddr, u32 *reg_space);
/* Set tx/rx threshold in the csr6 register
* An invalid value enables the store-and-forward mode */
void (*dma_mode)(void __iomem *ioaddr, int txmode, int rxmode,
/* Enable RX Queues */
void (*rx_queue_enable)(struct mac_device_info *hw, u32 queue);
/* Dump MAC registers */
- void (*dump_regs)(struct mac_device_info *hw);
+ void (*dump_regs)(struct mac_device_info *hw, u32 *reg_space);
/* Handle extra events on specific interrupts hw dependent */
int (*host_irq_status)(struct mac_device_info *hw,
struct stmmac_extra_stats *x);
return !!(value & GMAC_CONTROL_IPC);
}
-static void dwmac1000_dump_regs(struct mac_device_info *hw)
+static void dwmac1000_dump_regs(struct mac_device_info *hw, u32 *reg_space)
{
void __iomem *ioaddr = hw->pcsr;
int i;
- pr_info("\tDWMAC1000 regs (base addr = 0x%p)\n", ioaddr);
- for (i = 0; i < 55; i++) {
- int offset = i * 4;
- pr_info("\tReg No. %d (offset 0x%x): 0x%08x\n", i,
- offset, readl(ioaddr + offset));
- }
+ for (i = 0; i < 55; i++)
+ reg_space[i] = readl(ioaddr + i * 4);
}
static void dwmac1000_set_umac_addr(struct mac_device_info *hw,
writel(csr6, ioaddr + DMA_CONTROL);
}
-static void dwmac1000_dump_dma_regs(void __iomem *ioaddr)
+static void dwmac1000_dump_dma_regs(void __iomem *ioaddr, u32 *reg_space)
{
int i;
- pr_info(" DMA registers\n");
- for (i = 0; i < 22; i++) {
- if ((i < 9) || (i > 17)) {
- int offset = i * 4;
- pr_err("\t Reg No. %d (offset 0x%x): 0x%08x\n", i,
- (DMA_BUS_MODE + offset),
- readl(ioaddr + DMA_BUS_MODE + offset));
- }
- }
+
+ for (i = 0; i < 22; i++)
+ if ((i < 9) || (i > 17))
+ reg_space[DMA_BUS_MODE / 4 + i] =
+ readl(ioaddr + DMA_BUS_MODE + i * 4);
}
static void dwmac1000_get_hw_feature(void __iomem *ioaddr,
#endif
}
-static void dwmac100_dump_mac_regs(struct mac_device_info *hw)
+static void dwmac100_dump_mac_regs(struct mac_device_info *hw, u32 *reg_space)
{
void __iomem *ioaddr = hw->pcsr;
- pr_info("\t----------------------------------------------\n"
- "\t DWMAC 100 CSR (base addr = 0x%p)\n"
- "\t----------------------------------------------\n", ioaddr);
- pr_info("\tcontrol reg (offset 0x%x): 0x%08x\n", MAC_CONTROL,
- readl(ioaddr + MAC_CONTROL));
- pr_info("\taddr HI (offset 0x%x): 0x%08x\n ", MAC_ADDR_HIGH,
- readl(ioaddr + MAC_ADDR_HIGH));
- pr_info("\taddr LO (offset 0x%x): 0x%08x\n", MAC_ADDR_LOW,
- readl(ioaddr + MAC_ADDR_LOW));
- pr_info("\tmulticast hash HI (offset 0x%x): 0x%08x\n",
- MAC_HASH_HIGH, readl(ioaddr + MAC_HASH_HIGH));
- pr_info("\tmulticast hash LO (offset 0x%x): 0x%08x\n",
- MAC_HASH_LOW, readl(ioaddr + MAC_HASH_LOW));
- pr_info("\tflow control (offset 0x%x): 0x%08x\n",
- MAC_FLOW_CTRL, readl(ioaddr + MAC_FLOW_CTRL));
- pr_info("\tVLAN1 tag (offset 0x%x): 0x%08x\n", MAC_VLAN1,
- readl(ioaddr + MAC_VLAN1));
- pr_info("\tVLAN2 tag (offset 0x%x): 0x%08x\n", MAC_VLAN2,
- readl(ioaddr + MAC_VLAN2));
+
+ reg_space[MAC_CONTROL / 4] = readl(ioaddr + MAC_CONTROL);
+ reg_space[MAC_ADDR_HIGH / 4] = readl(ioaddr + MAC_ADDR_HIGH);
+ reg_space[MAC_ADDR_LOW / 4] = readl(ioaddr + MAC_ADDR_LOW);
+ reg_space[MAC_HASH_HIGH / 4] = readl(ioaddr + MAC_HASH_HIGH);
+ reg_space[MAC_HASH_LOW / 4] = readl(ioaddr + MAC_HASH_LOW);
+ reg_space[MAC_FLOW_CTRL / 4] = readl(ioaddr + MAC_FLOW_CTRL);
+ reg_space[MAC_VLAN1 / 4] = readl(ioaddr + MAC_VLAN1);
+ reg_space[MAC_VLAN2 / 4] = readl(ioaddr + MAC_VLAN2);
}
static int dwmac100_rx_ipc_enable(struct mac_device_info *hw)
writel(csr6, ioaddr + DMA_CONTROL);
}
-static void dwmac100_dump_dma_regs(void __iomem *ioaddr)
+static void dwmac100_dump_dma_regs(void __iomem *ioaddr, u32 *reg_space)
{
int i;
- pr_debug("DWMAC 100 DMA CSR\n");
for (i = 0; i < 9; i++)
- pr_debug("\t CSR%d (offset 0x%x): 0x%08x\n", i,
- (DMA_BUS_MODE + i * 4),
- readl(ioaddr + DMA_BUS_MODE + i * 4));
+ reg_space[DMA_BUS_MODE / 4 + i] =
+ readl(ioaddr + DMA_BUS_MODE + i * 4);
- pr_debug("\tCSR20 (0x%x): 0x%08x, CSR21 (0x%x): 0x%08x\n",
- DMA_CUR_TX_BUF_ADDR, readl(ioaddr + DMA_CUR_TX_BUF_ADDR),
- DMA_CUR_RX_BUF_ADDR, readl(ioaddr + DMA_CUR_RX_BUF_ADDR));
+ reg_space[DMA_CUR_TX_BUF_ADDR / 4] =
+ readl(ioaddr + DMA_CUR_TX_BUF_ADDR);
+ reg_space[DMA_CUR_RX_BUF_ADDR / 4] =
+ readl(ioaddr + DMA_CUR_RX_BUF_ADDR);
}
/* DMA controller has two counters to track the number of the missed frames. */
writel(value, ioaddr + GMAC_RXQ_CTRL0);
}
-static void dwmac4_dump_regs(struct mac_device_info *hw)
+static void dwmac4_dump_regs(struct mac_device_info *hw, u32 *reg_space)
{
void __iomem *ioaddr = hw->pcsr;
int i;
- pr_debug("\tDWMAC4 regs (base addr = 0x%p)\n", ioaddr);
-
- for (i = 0; i < GMAC_REG_NUM; i++) {
- int offset = i * 4;
-
- pr_debug("\tReg No. %d (offset 0x%x): 0x%08x\n", i,
- offset, readl(ioaddr + offset));
- }
+ for (i = 0; i < GMAC_REG_NUM; i++)
+ reg_space[i] = readl(ioaddr + i * 4);
}
static int dwmac4_rx_ipc_enable(struct mac_device_info *hw)
dwmac4_dma_init_channel(ioaddr, dma_cfg, dma_tx, dma_rx, i);
}
-static void _dwmac4_dump_dma_regs(void __iomem *ioaddr, u32 channel)
+static void _dwmac4_dump_dma_regs(void __iomem *ioaddr, u32 channel,
+ u32 *reg_space)
{
- pr_debug(" Channel %d\n", channel);
- pr_debug("\tDMA_CHAN_CONTROL, offset: 0x%x, val: 0x%x\n", 0,
- readl(ioaddr + DMA_CHAN_CONTROL(channel)));
- pr_debug("\tDMA_CHAN_TX_CONTROL, offset: 0x%x, val: 0x%x\n", 0x4,
- readl(ioaddr + DMA_CHAN_TX_CONTROL(channel)));
- pr_debug("\tDMA_CHAN_RX_CONTROL, offset: 0x%x, val: 0x%x\n", 0x8,
- readl(ioaddr + DMA_CHAN_RX_CONTROL(channel)));
- pr_debug("\tDMA_CHAN_TX_BASE_ADDR, offset: 0x%x, val: 0x%x\n", 0x14,
- readl(ioaddr + DMA_CHAN_TX_BASE_ADDR(channel)));
- pr_debug("\tDMA_CHAN_RX_BASE_ADDR, offset: 0x%x, val: 0x%x\n", 0x1c,
- readl(ioaddr + DMA_CHAN_RX_BASE_ADDR(channel)));
- pr_debug("\tDMA_CHAN_TX_END_ADDR, offset: 0x%x, val: 0x%x\n", 0x20,
- readl(ioaddr + DMA_CHAN_TX_END_ADDR(channel)));
- pr_debug("\tDMA_CHAN_RX_END_ADDR, offset: 0x%x, val: 0x%x\n", 0x28,
- readl(ioaddr + DMA_CHAN_RX_END_ADDR(channel)));
- pr_debug("\tDMA_CHAN_TX_RING_LEN, offset: 0x%x, val: 0x%x\n", 0x2c,
- readl(ioaddr + DMA_CHAN_TX_RING_LEN(channel)));
- pr_debug("\tDMA_CHAN_RX_RING_LEN, offset: 0x%x, val: 0x%x\n", 0x30,
- readl(ioaddr + DMA_CHAN_RX_RING_LEN(channel)));
- pr_debug("\tDMA_CHAN_INTR_ENA, offset: 0x%x, val: 0x%x\n", 0x34,
- readl(ioaddr + DMA_CHAN_INTR_ENA(channel)));
- pr_debug("\tDMA_CHAN_RX_WATCHDOG, offset: 0x%x, val: 0x%x\n", 0x38,
- readl(ioaddr + DMA_CHAN_RX_WATCHDOG(channel)));
- pr_debug("\tDMA_CHAN_SLOT_CTRL_STATUS, offset: 0x%x, val: 0x%x\n", 0x3c,
- readl(ioaddr + DMA_CHAN_SLOT_CTRL_STATUS(channel)));
- pr_debug("\tDMA_CHAN_CUR_TX_DESC, offset: 0x%x, val: 0x%x\n", 0x44,
- readl(ioaddr + DMA_CHAN_CUR_TX_DESC(channel)));
- pr_debug("\tDMA_CHAN_CUR_RX_DESC, offset: 0x%x, val: 0x%x\n", 0x4c,
- readl(ioaddr + DMA_CHAN_CUR_RX_DESC(channel)));
- pr_debug("\tDMA_CHAN_CUR_TX_BUF_ADDR, offset: 0x%x, val: 0x%x\n", 0x54,
- readl(ioaddr + DMA_CHAN_CUR_TX_BUF_ADDR(channel)));
- pr_debug("\tDMA_CHAN_CUR_RX_BUF_ADDR, offset: 0x%x, val: 0x%x\n", 0x5c,
- readl(ioaddr + DMA_CHAN_CUR_RX_BUF_ADDR(channel)));
- pr_debug("\tDMA_CHAN_STATUS, offset: 0x%x, val: 0x%x\n", 0x60,
- readl(ioaddr + DMA_CHAN_STATUS(channel)));
+ reg_space[DMA_CHAN_CONTROL(channel) / 4] =
+ readl(ioaddr + DMA_CHAN_CONTROL(channel));
+ reg_space[DMA_CHAN_TX_CONTROL(channel) / 4] =
+ readl(ioaddr + DMA_CHAN_TX_CONTROL(channel));
+ reg_space[DMA_CHAN_RX_CONTROL(channel) / 4] =
+ readl(ioaddr + DMA_CHAN_RX_CONTROL(channel));
+ reg_space[DMA_CHAN_TX_BASE_ADDR(channel) / 4] =
+ readl(ioaddr + DMA_CHAN_TX_BASE_ADDR(channel));
+ reg_space[DMA_CHAN_RX_BASE_ADDR(channel) / 4] =
+ readl(ioaddr + DMA_CHAN_RX_BASE_ADDR(channel));
+ reg_space[DMA_CHAN_TX_END_ADDR(channel) / 4] =
+ readl(ioaddr + DMA_CHAN_TX_END_ADDR(channel));
+ reg_space[DMA_CHAN_RX_END_ADDR(channel) / 4] =
+ readl(ioaddr + DMA_CHAN_RX_END_ADDR(channel));
+ reg_space[DMA_CHAN_TX_RING_LEN(channel) / 4] =
+ readl(ioaddr + DMA_CHAN_TX_RING_LEN(channel));
+ reg_space[DMA_CHAN_RX_RING_LEN(channel) / 4] =
+ readl(ioaddr + DMA_CHAN_RX_RING_LEN(channel));
+ reg_space[DMA_CHAN_INTR_ENA(channel) / 4] =
+ readl(ioaddr + DMA_CHAN_INTR_ENA(channel));
+ reg_space[DMA_CHAN_RX_WATCHDOG(channel) / 4] =
+ readl(ioaddr + DMA_CHAN_RX_WATCHDOG(channel));
+ reg_space[DMA_CHAN_SLOT_CTRL_STATUS(channel) / 4] =
+ readl(ioaddr + DMA_CHAN_SLOT_CTRL_STATUS(channel));
+ reg_space[DMA_CHAN_CUR_TX_DESC(channel) / 4] =
+ readl(ioaddr + DMA_CHAN_CUR_TX_DESC(channel));
+ reg_space[DMA_CHAN_CUR_RX_DESC(channel) / 4] =
+ readl(ioaddr + DMA_CHAN_CUR_RX_DESC(channel));
+ reg_space[DMA_CHAN_CUR_TX_BUF_ADDR(channel) / 4] =
+ readl(ioaddr + DMA_CHAN_CUR_TX_BUF_ADDR(channel));
+ reg_space[DMA_CHAN_CUR_RX_BUF_ADDR(channel) / 4] =
+ readl(ioaddr + DMA_CHAN_CUR_RX_BUF_ADDR(channel));
+ reg_space[DMA_CHAN_STATUS(channel) / 4] =
+ readl(ioaddr + DMA_CHAN_STATUS(channel));
}
-static void dwmac4_dump_dma_regs(void __iomem *ioaddr)
+static void dwmac4_dump_dma_regs(void __iomem *ioaddr, u32 *reg_space)
{
int i;
- pr_debug(" GMAC4 DMA registers\n");
-
for (i = 0; i < DMA_CHANNEL_NB_MAX; i++)
- _dwmac4_dump_dma_regs(ioaddr, i);
+ _dwmac4_dump_dma_regs(ioaddr, i, reg_space);
}
static void dwmac4_rx_watchdog(void __iomem *ioaddr, u32 riwt)
static void stmmac_ethtool_gregs(struct net_device *dev,
struct ethtool_regs *regs, void *space)
{
- int i;
u32 *reg_space = (u32 *) space;
struct stmmac_priv *priv = netdev_priv(dev);
memset(reg_space, 0x0, REG_SPACE_SIZE);
- if (priv->plat->has_gmac || priv->plat->has_gmac4) {
- /* MAC registers */
- for (i = 0; i < 55; i++)
- reg_space[i] = readl(priv->ioaddr + (i * 4));
- /* DMA registers */
- for (i = 0; i < 22; i++)
- reg_space[i + 55] =
- readl(priv->ioaddr + (DMA_BUS_MODE + (i * 4)));
- } else {
- /* MAC registers */
- for (i = 0; i < 12; i++)
- reg_space[i] = readl(priv->ioaddr + (i * 4));
- /* DMA registers */
- for (i = 0; i < 9; i++)
- reg_space[i + 12] =
- readl(priv->ioaddr + (DMA_BUS_MODE + (i * 4)));
- reg_space[22] = readl(priv->ioaddr + DMA_CUR_TX_BUF_ADDR);
- reg_space[23] = readl(priv->ioaddr + DMA_CUR_RX_BUF_ADDR);
- }
+ priv->hw->mac->dump_regs(priv->hw, reg_space);
+ priv->hw->dma->dump_regs(priv->ioaddr, reg_space);
}
static void
priv->hw->dma->start_tx(priv->ioaddr);
priv->hw->dma->start_rx(priv->ioaddr);
- /* Dump DMA/MAC registers */
- if (netif_msg_hw(priv)) {
- priv->hw->mac->dump_regs(priv->hw);
- priv->hw->dma->dump_regs(priv->ioaddr);
- }
priv->tx_lpi_timer = STMMAC_DEFAULT_TWT_LS;
if ((priv->use_riwt) && (priv->hw->dma->rx_watchdog)) {
*/
int phy_aneg_done(struct phy_device *phydev)
{
- if (phydev->drv->aneg_done)
+ if (phydev->drv && phydev->drv->aneg_done)
return phydev->drv->aneg_done(phydev);
return genphy_aneg_done(phydev);
const struct iphdr *old_iph = ip_hdr(skb);
union vxlan_addr *dst;
union vxlan_addr remote_ip, local_ip;
- union vxlan_addr *src;
struct vxlan_metadata _md;
struct vxlan_metadata *md = &_md;
__be16 src_port = 0, dst_port;
dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port;
vni = (rdst->remote_vni) ? : default_vni;
- src = &vxlan->cfg.saddr;
+ local_ip = vxlan->cfg.saddr;
dst_cache = &rdst->dst_cache;
md->gbp = skb->mark;
ttl = vxlan->cfg.ttl;
dst = &remote_ip;
dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port;
vni = tunnel_id_to_key32(info->key.tun_id);
- src = &local_ip;
dst_cache = &info->dst_cache;
if (info->options_len)
md = ip_tunnel_info_opts(info);
rt = vxlan_get_route(vxlan, dev, sock4, skb,
rdst ? rdst->remote_ifindex : 0, tos,
dst->sin.sin_addr.s_addr,
- &src->sin.sin_addr.s_addr,
+ &local_ip.sin.sin_addr.s_addr,
dst_port, src_port,
dst_cache, info);
if (IS_ERR(rt)) {
if (err < 0)
goto tx_error;
- udp_tunnel_xmit_skb(rt, sock4->sock->sk, skb, src->sin.sin_addr.s_addr,
+ udp_tunnel_xmit_skb(rt, sock4->sock->sk, skb, local_ip.sin.sin_addr.s_addr,
dst->sin.sin_addr.s_addr, tos, ttl, df,
src_port, dst_port, xnet, !udp_sum);
#if IS_ENABLED(CONFIG_IPV6)
ndst = vxlan6_get_route(vxlan, dev, sock6, skb,
rdst ? rdst->remote_ifindex : 0, tos,
label, &dst->sin6.sin6_addr,
- &src->sin6.sin6_addr,
+ &local_ip.sin6.sin6_addr,
dst_port, src_port,
dst_cache, info);
if (IS_ERR(ndst)) {
goto tx_error;
udp_tunnel6_xmit_skb(ndst, sock6->sock->sk, skb, dev,
- &src->sin6.sin6_addr,
+ &local_ip.sin6.sin6_addr,
&dst->sin6.sin6_addr, tos, ttl,
label, src_port, dst_port, !udp_sum);
#endif
if (data[IFLA_VXLAN_ID]) {
__u32 id = nla_get_u32(data[IFLA_VXLAN_ID]);
- if (id >= VXLAN_VID_MASK)
+ if (id >= VXLAN_N_VID)
return -ERANGE;
}
nr_pages = i;
if (nr_pages > 0) {
len = nr_pages << PAGE_SHIFT;
+ osd_req_op_extent_update(req, 0, len);
break;
}
goto out_pages;
wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
(wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
- if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+ if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
if (ci->i_wrbuffer_ref > 0) {
pr_warn_ratelimited(
"writepage_start %p %lld forced umount\n",
&ci->i_layout, vino,
offset, &len, 0, num_ops,
CEPH_OSD_OP_WRITE,
- CEPH_OSD_FLAG_WRITE |
- CEPH_OSD_FLAG_ONDISK,
+ CEPH_OSD_FLAG_WRITE,
snapc, truncate_seq,
truncate_size, false);
if (IS_ERR(req)) {
min(num_ops,
CEPH_OSD_SLAB_OPS),
CEPH_OSD_OP_WRITE,
- CEPH_OSD_FLAG_WRITE |
- CEPH_OSD_FLAG_ONDISK,
+ CEPH_OSD_FLAG_WRITE,
snapc, truncate_seq,
truncate_size, true);
BUG_ON(IS_ERR(req));
int r;
struct ceph_snap_context *snapc, *oldest;
- if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+ if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
dout(" page %p forced umount\n", page);
unlock_page(page);
return -EIO;
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
ceph_vino(inode), 0, &len, 0, 1,
- CEPH_OSD_OP_CREATE,
- CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
+ CEPH_OSD_OP_CREATE, CEPH_OSD_FLAG_WRITE,
NULL, 0, 0, false);
if (IS_ERR(req)) {
err = PTR_ERR(req);
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
ceph_vino(inode), 0, &len, 1, 3,
- CEPH_OSD_OP_WRITE,
- CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
+ CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
NULL, ci->i_truncate_seq,
ci->i_truncate_size, false);
if (IS_ERR(req)) {
goto out_unlock;
}
- wr_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK;
+ wr_req->r_flags = CEPH_OSD_FLAG_WRITE;
osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL);
ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc);
ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid);
fscache_enable_cookie(ci->fscache, ceph_fscache_can_enable,
inode);
if (fscache_cookie_enabled(ci->fscache)) {
- dout("fscache_file_set_cookie %p %p enabing cache\n",
+ dout("fscache_file_set_cookie %p %p enabling cache\n",
inode, filp);
}
}
/*
* Return caps we have registered with the MDS(s) as 'wanted'.
*/
-int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
+int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check)
{
struct ceph_cap *cap;
struct rb_node *p;
for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
cap = rb_entry(p, struct ceph_cap, ci_node);
- if (!__cap_is_valid(cap))
+ if (check && !__cap_is_valid(cap))
continue;
if (cap == ci->i_auth_cap)
mds_wanted |= cap->mds_wanted;
delayed = 1;
}
ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
+ if (want & ~cap->mds_wanted) {
+ /* user space may open/close single file frequently.
+ * This avoids dropping mds_wanted immediately after
+ * requesting new mds_wanted.
+ */
+ __cap_set_timeouts(mdsc, ci);
+ }
cap->issued &= retain; /* drop bits we don't want */
if (cap->implemented & ~cap->issued) {
dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
- ceph_sync_write_wait(inode);
-
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (ret < 0)
goto out;
if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) {
int mds_wanted;
- if (ACCESS_ONCE(mdsc->fsc->mount_state) ==
+ if (READ_ONCE(mdsc->fsc->mount_state) ==
CEPH_MOUNT_SHUTDOWN) {
dout("get_cap_refs %p forced umount\n", inode);
*err = -EIO;
ret = 1;
goto out_unlock;
}
- mds_wanted = __ceph_caps_mds_wanted(ci);
- if ((mds_wanted & need) != need) {
+ mds_wanted = __ceph_caps_mds_wanted(ci, false);
+ if (need & ~(mds_wanted & need)) {
dout("get_cap_refs %p caps were dropped"
" (session killed?)\n", inode);
*err = -ESTALE;
ret = 1;
goto out_unlock;
}
- if ((mds_wanted & file_wanted) ==
- (file_wanted & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
+ if (!(file_wanted & ~mds_wanted))
ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED;
}
tcap->implemented |= issued;
if (cap == ci->i_auth_cap)
ci->i_auth_cap = tcap;
+
if (!list_empty(&ci->i_cap_flush_list) &&
ci->i_auth_cap == tcap) {
spin_lock(&mdsc->cap_dirty_lock);
} else if (tsession) {
/* add placeholder for the export tagert */
int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
+ tcap = new_cap;
ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0,
t_seq - 1, t_mseq, (u64)-1, flag, &new_cap);
+ if (!list_empty(&ci->i_cap_flush_list) &&
+ ci->i_auth_cap == tcap) {
+ spin_lock(&mdsc->cap_dirty_lock);
+ list_move_tail(&ci->i_flushing_item,
+ &tcap->session->s_cap_flushing);
+ spin_unlock(&mdsc->cap_dirty_lock);
+ }
+
__ceph_remove_cap(cap, false);
goto out_unlock;
}
}
int ceph_encode_dentry_release(void **p, struct dentry *dentry,
+ struct inode *dir,
int mds, int drop, int unless)
{
- struct inode *dir = d_inode(dentry->d_parent);
+ struct dentry *parent = NULL;
struct ceph_mds_request_release *rel = *p;
struct ceph_dentry_info *di = ceph_dentry(dentry);
int force = 0;
spin_lock(&dentry->d_lock);
if (di->lease_session && di->lease_session->s_mds == mds)
force = 1;
+ if (!dir) {
+ parent = dget(dentry->d_parent);
+ dir = d_inode(parent);
+ }
spin_unlock(&dentry->d_lock);
ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
+ dput(parent);
spin_lock(&dentry->d_lock);
if (ret && di->lease_session && di->lease_session->s_mds == mds) {
seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
- if (req->r_got_unsafe)
+ if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
seq_puts(s, "\t(unsafe)");
else
seq_puts(s, "\t");
/* hints to request -> mds selection code */
req->r_direct_mode = USE_AUTH_MDS;
req->r_direct_hash = ceph_frag_value(frag);
- req->r_direct_is_hash = true;
+ __set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
if (fi->last_name) {
req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL);
if (!req->r_path2) {
fi->frag = frag;
fi->last_readdir = req;
- if (req->r_did_prepopulate) {
+ if (test_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags)) {
fi->readdir_cache_idx = req->r_readdir_cache_idx;
if (fi->readdir_cache_idx < 0) {
/* preclude from marking dir ordered */
mask |= CEPH_CAP_XATTR_SHARED;
req->r_args.getattr.mask = cpu_to_le32(mask);
- req->r_locked_dir = dir;
+ req->r_parent = dir;
+ set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
err = ceph_mdsc_do_request(mdsc, NULL, req);
err = ceph_handle_snapdir(req, dentry, err);
dentry = ceph_finish_lookup(req, dentry, err);
}
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
- req->r_locked_dir = dir;
+ req->r_parent = dir;
+ set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
req->r_args.mknod.mode = cpu_to_le32(mode);
req->r_args.mknod.rdev = cpu_to_le32(rdev);
req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
ceph_mdsc_put_request(req);
goto out;
}
- req->r_locked_dir = dir;
+ req->r_parent = dir;
+ set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
- req->r_locked_dir = dir;
+ req->r_parent = dir;
+ set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
req->r_args.mkdir.mode = cpu_to_le32(mode);
req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
req->r_old_dentry = dget(old_dentry);
- req->r_locked_dir = dir;
+ req->r_parent = dir;
+ set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
/* release LINK_SHARED on source inode (mds will lock it) */
}
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
- req->r_locked_dir = dir;
+ req->r_parent = dir;
+ set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
req->r_inode_drop = drop_caps_for_unlink(inode);
req->r_num_caps = 2;
req->r_old_dentry = dget(old_dentry);
req->r_old_dentry_dir = old_dir;
- req->r_locked_dir = new_dir;
+ req->r_parent = new_dir;
+ set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
struct inode *dir;
if (flags & LOOKUP_RCU) {
- parent = ACCESS_ONCE(dentry->d_parent);
+ parent = READ_ONCE(dentry->d_parent);
dir = d_inode_rcu(parent);
if (!dir)
return -ECHILD;
return -ECHILD;
op = ceph_snap(dir) == CEPH_SNAPDIR ?
- CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_GETATTR;
+ CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
if (!IS_ERR(req)) {
req->r_dentry = dget(dentry);
- req->r_num_caps = op == CEPH_MDS_OP_GETATTR ? 1 : 2;
+ req->r_num_caps = 2;
+ req->r_parent = dir;
mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
if (ceph_security_xattr_wanted(dir))
req->r_inode = d_inode(child);
ihold(d_inode(child));
req->r_ino2 = ceph_vino(d_inode(parent));
- req->r_locked_dir = d_inode(parent);
+ req->r_parent = d_inode(parent);
+ set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
req->r_num_caps = 2;
err = ceph_mdsc_do_request(mdsc, NULL, req);
spin_lock(&ci->i_ceph_lock);
if (__ceph_is_any_real_caps(ci) &&
(((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) {
- int mds_wanted = __ceph_caps_mds_wanted(ci);
+ int mds_wanted = __ceph_caps_mds_wanted(ci, true);
int issued = __ceph_caps_issued(ci, NULL);
dout("open %p fmode %d want %s issued %s using existing\n",
mask |= CEPH_CAP_XATTR_SHARED;
req->r_args.open.mask = cpu_to_le32(mask);
- req->r_locked_dir = dir; /* caller holds dir->i_mutex */
+ req->r_parent = dir;
+ set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
err = ceph_mdsc_do_request(mdsc,
(flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
req);
goto out;
}
- req->r_flags = CEPH_OSD_FLAG_ORDERSNAP |
- CEPH_OSD_FLAG_ONDISK |
- CEPH_OSD_FLAG_WRITE;
+ req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE;
ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc);
ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid);
kfree(aio_work);
}
-/*
- * Write commit request unsafe callback, called to tell us when a
- * request is unsafe (that is, in flight--has been handed to the
- * messenger to send to its target osd). It is called again when
- * we've received a response message indicating the request is
- * "safe" (its CEPH_OSD_FLAG_ONDISK flag is set), or when a request
- * is completed early (and unsuccessfully) due to a timeout or
- * interrupt.
- *
- * This is used if we requested both an ACK and ONDISK commit reply
- * from the OSD.
- */
-static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
-{
- struct ceph_inode_info *ci = ceph_inode(req->r_inode);
-
- dout("%s %p tid %llu %ssafe\n", __func__, req, req->r_tid,
- unsafe ? "un" : "");
- if (unsafe) {
- ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
- spin_lock(&ci->i_unsafe_lock);
- list_add_tail(&req->r_unsafe_item,
- &ci->i_unsafe_writes);
- spin_unlock(&ci->i_unsafe_lock);
-
- complete_all(&req->r_completion);
- } else {
- spin_lock(&ci->i_unsafe_lock);
- list_del_init(&req->r_unsafe_item);
- spin_unlock(&ci->i_unsafe_lock);
- ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
- }
-}
-
-/*
- * Wait on any unsafe replies for the given inode. First wait on the
- * newest request, and make that the upper bound. Then, if there are
- * more requests, keep waiting on the oldest as long as it is still older
- * than the original request.
- */
-void ceph_sync_write_wait(struct inode *inode)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct list_head *head = &ci->i_unsafe_writes;
- struct ceph_osd_request *req;
- u64 last_tid;
-
- if (!S_ISREG(inode->i_mode))
- return;
-
- spin_lock(&ci->i_unsafe_lock);
- if (list_empty(head))
- goto out;
-
- /* set upper bound as _last_ entry in chain */
-
- req = list_last_entry(head, struct ceph_osd_request,
- r_unsafe_item);
- last_tid = req->r_tid;
-
- do {
- ceph_osdc_get_request(req);
- spin_unlock(&ci->i_unsafe_lock);
-
- dout("sync_write_wait on tid %llu (until %llu)\n",
- req->r_tid, last_tid);
- wait_for_completion(&req->r_done_completion);
- ceph_osdc_put_request(req);
-
- spin_lock(&ci->i_unsafe_lock);
- /*
- * from here on look at first entry in chain, since we
- * only want to wait for anything older than last_tid
- */
- if (list_empty(head))
- break;
- req = list_first_entry(head, struct ceph_osd_request,
- r_unsafe_item);
- } while (req->r_tid < last_tid);
-out:
- spin_unlock(&ci->i_unsafe_lock);
-}
-
static ssize_t
ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
struct ceph_snap_context *snapc,
if (ret2 < 0)
dout("invalidate_inode_pages2_range returned %d\n", ret2);
- flags = CEPH_OSD_FLAG_ORDERSNAP |
- CEPH_OSD_FLAG_ONDISK |
- CEPH_OSD_FLAG_WRITE;
+ flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE;
} else {
flags = CEPH_OSD_FLAG_READ;
}
if (ret < 0)
dout("invalidate_inode_pages2_range returned %d\n", ret);
- flags = CEPH_OSD_FLAG_ORDERSNAP |
- CEPH_OSD_FLAG_ONDISK |
- CEPH_OSD_FLAG_WRITE |
- CEPH_OSD_FLAG_ACK;
+ flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE;
while ((len = iov_iter_count(from)) > 0) {
size_t left;
goto out;
}
- /* get a second commit callback */
- req->r_unsafe_callback = ceph_sync_write_unsafe;
req->r_inode = inode;
osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
ceph_vino(inode),
offset, length,
0, 1, op,
- CEPH_OSD_FLAG_WRITE |
- CEPH_OSD_FLAG_ONDISK,
+ CEPH_OSD_FLAG_WRITE,
NULL, 0, 0, false);
if (IS_ERR(req)) {
ret = PTR_ERR(req);
ci->i_rdcache_gen = 0;
ci->i_rdcache_revoking = 0;
- INIT_LIST_HEAD(&ci->i_unsafe_writes);
INIT_LIST_HEAD(&ci->i_unsafe_dirops);
INIT_LIST_HEAD(&ci->i_unsafe_iops);
spin_lock_init(&ci->i_unsafe_lock);
return 1;
}
-void ceph_evict_inode(struct inode *inode)
-{
- /* wait unsafe sync writes */
- ceph_sync_write_wait(inode);
- truncate_inode_pages_final(&inode->i_data);
- clear_inode(inode);
-}
-
static inline blkcnt_t calc_inode_blocks(u64 size)
{
return (size + (1<<9) - 1) >> 9;
static void update_dentry_lease(struct dentry *dentry,
struct ceph_mds_reply_lease *lease,
struct ceph_mds_session *session,
- unsigned long from_time)
+ unsigned long from_time,
+ struct ceph_vino *tgt_vino,
+ struct ceph_vino *dir_vino)
{
struct ceph_dentry_info *di = ceph_dentry(dentry);
long unsigned duration = le32_to_cpu(lease->duration_ms);
long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
struct inode *dir;
+ /*
+ * Make sure dentry's inode matches tgt_vino. NULL tgt_vino means that
+ * we expect a negative dentry.
+ */
+ if (!tgt_vino && d_really_is_positive(dentry))
+ return;
+
+ if (tgt_vino && (d_really_is_negative(dentry) ||
+ !ceph_ino_compare(d_inode(dentry), tgt_vino)))
+ return;
+
spin_lock(&dentry->d_lock);
dout("update_dentry_lease %p duration %lu ms ttl %lu\n",
dentry, duration, ttl);
- /* make lease_rdcache_gen match directory */
dir = d_inode(dentry->d_parent);
+ /* make sure parent matches dir_vino */
+ if (!ceph_ino_compare(dir, dir_vino))
+ goto out_unlock;
+
/* only track leases on regular dentries */
if (ceph_snap(dir) != CEPH_NOSNAP)
goto out_unlock;
*
* Called with snap_rwsem (read).
*/
-int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
- struct ceph_mds_session *session)
+int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
{
+ struct ceph_mds_session *session = req->r_session;
struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
struct inode *in = NULL;
- struct ceph_vino vino;
+ struct ceph_vino tvino, dvino;
struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
int err = 0;
dout("fill_trace %p is_dentry %d is_target %d\n", req,
rinfo->head->is_dentry, rinfo->head->is_target);
-#if 0
- /*
- * Debugging hook:
- *
- * If we resend completed ops to a recovering mds, we get no
- * trace. Since that is very rare, pretend this is the case
- * to ensure the 'no trace' handlers in the callers behave.
- *
- * Fill in inodes unconditionally to avoid breaking cap
- * invariants.
- */
- if (rinfo->head->op & CEPH_MDS_OP_WRITE) {
- pr_info("fill_trace faking empty trace on %lld %s\n",
- req->r_tid, ceph_mds_op_name(rinfo->head->op));
- if (rinfo->head->is_dentry) {
- rinfo->head->is_dentry = 0;
- err = fill_inode(req->r_locked_dir,
- &rinfo->diri, rinfo->dirfrag,
- session, req->r_request_started, -1);
- }
- if (rinfo->head->is_target) {
- rinfo->head->is_target = 0;
- ininfo = rinfo->targeti.in;
- vino.ino = le64_to_cpu(ininfo->ino);
- vino.snap = le64_to_cpu(ininfo->snapid);
- in = ceph_get_inode(sb, vino);
- err = fill_inode(in, &rinfo->targeti, NULL,
- session, req->r_request_started,
- req->r_fmode);
- iput(in);
- }
- }
-#endif
-
if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
dout("fill_trace reply is empty!\n");
- if (rinfo->head->result == 0 && req->r_locked_dir)
+ if (rinfo->head->result == 0 && req->r_parent)
ceph_invalidate_dir_request(req);
return 0;
}
if (rinfo->head->is_dentry) {
- struct inode *dir = req->r_locked_dir;
+ struct inode *dir = req->r_parent;
if (dir) {
err = fill_inode(dir, NULL,
dname.name = rinfo->dname;
dname.len = rinfo->dname_len;
dname.hash = full_name_hash(parent, dname.name, dname.len);
- vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
- vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+ tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+ tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
retry_lookup:
dn = d_lookup(parent, &dname);
dout("d_lookup on parent=%p name=%.*s got %p\n",
}
err = 0;
} else if (d_really_is_positive(dn) &&
- (ceph_ino(d_inode(dn)) != vino.ino ||
- ceph_snap(d_inode(dn)) != vino.snap)) {
+ (ceph_ino(d_inode(dn)) != tvino.ino ||
+ ceph_snap(d_inode(dn)) != tvino.snap)) {
dout(" dn %p points to wrong inode %p\n",
dn, d_inode(dn));
d_delete(dn);
}
if (rinfo->head->is_target) {
- vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
- vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+ tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+ tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
- in = ceph_get_inode(sb, vino);
+ in = ceph_get_inode(sb, tvino);
if (IS_ERR(in)) {
err = PTR_ERR(in);
goto done;
err = fill_inode(in, req->r_locked_page, &rinfo->targeti, NULL,
session, req->r_request_started,
- (!req->r_aborted && rinfo->head->result == 0) ?
- req->r_fmode : -1,
+ (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
+ rinfo->head->result == 0) ? req->r_fmode : -1,
&req->r_caps_reservation);
if (err < 0) {
pr_err("fill_inode badness %p %llx.%llx\n",
* ignore null lease/binding on snapdir ENOENT, or else we
* will have trouble splicing in the virtual snapdir later
*/
- if (rinfo->head->is_dentry && !req->r_aborted &&
- req->r_locked_dir &&
+ if (rinfo->head->is_dentry &&
+ !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
+ test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
(rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
fsc->mount_options->snapdir_name,
req->r_dentry->d_name.len))) {
* mknod symlink mkdir : null -> new inode
* unlink : linked -> null
*/
- struct inode *dir = req->r_locked_dir;
+ struct inode *dir = req->r_parent;
struct dentry *dn = req->r_dentry;
bool have_dir_cap, have_lease;
BUG_ON(!dn);
BUG_ON(!dir);
BUG_ON(d_inode(dn->d_parent) != dir);
- BUG_ON(ceph_ino(dir) !=
- le64_to_cpu(rinfo->diri.in->ino));
- BUG_ON(ceph_snap(dir) !=
- le64_to_cpu(rinfo->diri.in->snapid));
+
+ dvino.ino = le64_to_cpu(rinfo->diri.in->ino);
+ dvino.snap = le64_to_cpu(rinfo->diri.in->snapid);
+
+ BUG_ON(ceph_ino(dir) != dvino.ino);
+ BUG_ON(ceph_snap(dir) != dvino.snap);
/* do we have a lease on the whole dir? */
have_dir_cap =
ceph_dir_clear_ordered(dir);
dout("d_delete %p\n", dn);
d_delete(dn);
- } else {
- if (have_lease && d_unhashed(dn))
+ } else if (have_lease) {
+ if (d_unhashed(dn))
d_add(dn, NULL);
update_dentry_lease(dn, rinfo->dlease,
session,
- req->r_request_started);
+ req->r_request_started,
+ NULL, &dvino);
}
goto done;
}
have_lease = false;
}
- if (have_lease)
+ if (have_lease) {
+ tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+ tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
update_dentry_lease(dn, rinfo->dlease, session,
- req->r_request_started);
+ req->r_request_started,
+ &tvino, &dvino);
+ }
dout(" final dn %p\n", dn);
- } else if (!req->r_aborted &&
- (req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
- req->r_op == CEPH_MDS_OP_MKSNAP)) {
+ } else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
+ req->r_op == CEPH_MDS_OP_MKSNAP) &&
+ !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
struct dentry *dn = req->r_dentry;
- struct inode *dir = req->r_locked_dir;
+ struct inode *dir = req->r_parent;
/* fill out a snapdir LOOKUPSNAP dentry */
BUG_ON(!dn);
goto done;
}
req->r_dentry = dn; /* may have spliced */
+ } else if (rinfo->head->is_dentry) {
+ struct ceph_vino *ptvino = NULL;
+
+ if ((le32_to_cpu(rinfo->diri.in->cap.caps) & CEPH_CAP_FILE_SHARED) ||
+ le32_to_cpu(rinfo->dlease->duration_ms)) {
+ dvino.ino = le64_to_cpu(rinfo->diri.in->ino);
+ dvino.snap = le64_to_cpu(rinfo->diri.in->snapid);
+
+ if (rinfo->head->is_target) {
+ tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+ tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+ ptvino = &tvino;
+ }
+
+ update_dentry_lease(req->r_dentry, rinfo->dlease,
+ session, req->r_request_started, ptvino,
+ &dvino);
+ } else {
+ dout("%s: no dentry lease or dir cap\n", __func__);
+ }
}
done:
dout("fill_trace done err=%d\n", err);
u32 fpos_offset;
struct ceph_readdir_cache_control cache_ctl = {};
- if (req->r_aborted)
+ if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))
return readdir_prepopulate_inodes_only(req, session);
if (rinfo->hash_order && req->r_path2) {
/* FIXME: release caps/leases if error occurs */
for (i = 0; i < rinfo->dir_nr; i++) {
struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
- struct ceph_vino vino;
+ struct ceph_vino tvino, dvino;
dname.name = rde->name;
dname.len = rde->name_len;
dname.hash = full_name_hash(parent, dname.name, dname.len);
- vino.ino = le64_to_cpu(rde->inode.in->ino);
- vino.snap = le64_to_cpu(rde->inode.in->snapid);
+ tvino.ino = le64_to_cpu(rde->inode.in->ino);
+ tvino.snap = le64_to_cpu(rde->inode.in->snapid);
if (rinfo->hash_order) {
u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
goto out;
}
} else if (d_really_is_positive(dn) &&
- (ceph_ino(d_inode(dn)) != vino.ino ||
- ceph_snap(d_inode(dn)) != vino.snap)) {
+ (ceph_ino(d_inode(dn)) != tvino.ino ||
+ ceph_snap(d_inode(dn)) != tvino.snap)) {
dout(" dn %p points to wrong inode %p\n",
dn, d_inode(dn));
d_delete(dn);
if (d_really_is_positive(dn)) {
in = d_inode(dn);
} else {
- in = ceph_get_inode(parent->d_sb, vino);
+ in = ceph_get_inode(parent->d_sb, tvino);
if (IS_ERR(in)) {
dout("new_inode badness\n");
d_drop(dn);
ceph_dentry(dn)->offset = rde->offset;
+ dvino = ceph_vino(d_inode(parent));
update_dentry_lease(dn, rde->lease, req->r_session,
- req->r_request_started);
+ req->r_request_started, &tvino, &dvino);
if (err == 0 && skipped == 0 && cache_ctl.index >= 0) {
ret = fill_readdir_cache(d_inode(parent), dn,
}
out:
if (err == 0 && skipped == 0) {
- req->r_did_prepopulate = true;
+ set_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags);
req->r_readdir_cache_idx = cache_ctl.index;
}
ceph_readdir_cache_release(&cache_ctl);
mutex_lock(&ci->i_truncate_mutex);
- if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+ if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n",
inode, ceph_ino(inode));
mapping_set_error(inode->i_mapping, -EIO);
l.stripe_count = ci->i_layout.stripe_count;
l.object_size = ci->i_layout.object_size;
l.data_pool = ci->i_layout.pool_id;
- l.preferred_osd = (s32)-1;
+ l.preferred_osd = -1;
if (copy_to_user(arg, &l, sizeof(l)))
return -EFAULT;
}
nl.data_pool = ci->i_layout.pool_id;
/* this is obsolete, and always -1 */
- nl.preferred_osd = le64_to_cpu(-1);
+ nl.preferred_osd = -1;
err = __validate_layout(mdsc, &nl);
if (err)
ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
iput(req->r_inode);
}
- if (req->r_locked_dir)
- ceph_put_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
+ if (req->r_parent)
+ ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
iput(req->r_target_inode);
if (req->r_dentry)
dput(req->r_dentry);
{
dout("__unregister_request %p tid %lld\n", req, req->r_tid);
+ /* Never leave an unregistered request on an unsafe list! */
+ list_del_init(&req->r_unsafe_item);
+
if (req->r_tid == mdsc->oldest_tid) {
struct rb_node *p = rb_next(&req->r_node);
mdsc->oldest_tid = 0;
erase_request(&mdsc->request_tree, req);
- if (req->r_unsafe_dir && req->r_got_unsafe) {
+ if (req->r_unsafe_dir &&
+ test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
spin_lock(&ci->i_unsafe_lock);
list_del_init(&req->r_unsafe_dir_item);
spin_unlock(&ci->i_unsafe_lock);
}
- if (req->r_target_inode && req->r_got_unsafe) {
+ if (req->r_target_inode &&
+ test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
spin_lock(&ci->i_unsafe_lock);
list_del_init(&req->r_unsafe_target_item);
ceph_mdsc_put_request(req);
}
+/*
+ * Walk back up the dentry tree until we hit a dentry representing a
+ * non-snapshot inode. We do this using the rcu_read_lock (which must be held
+ * when calling this) to ensure that the objects won't disappear while we're
+ * working with them. Once we hit a candidate dentry, we attempt to take a
+ * reference to it, and return that as the result.
+ */
+static struct inode *get_nonsnap_parent(struct dentry *dentry)
+{
+ struct inode *inode = NULL;
+
+ while (dentry && !IS_ROOT(dentry)) {
+ inode = d_inode_rcu(dentry);
+ if (!inode || ceph_snap(inode) == CEPH_NOSNAP)
+ break;
+ dentry = dentry->d_parent;
+ }
+ if (inode)
+ inode = igrab(inode);
+ return inode;
+}
+
/*
* Choose mds to send request to next. If there is a hint set in the
* request (e.g., due to a prior forward hint from the mds), use that.
*
* Called under mdsc->mutex.
*/
-static struct dentry *get_nonsnap_parent(struct dentry *dentry)
-{
- /*
- * we don't need to worry about protecting the d_parent access
- * here because we never renaming inside the snapped namespace
- * except to resplice to another snapdir, and either the old or new
- * result is a valid result.
- */
- while (!IS_ROOT(dentry) && ceph_snap(d_inode(dentry)) != CEPH_NOSNAP)
- dentry = dentry->d_parent;
- return dentry;
-}
-
static int __choose_mds(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req)
{
int mode = req->r_direct_mode;
int mds = -1;
u32 hash = req->r_direct_hash;
- bool is_hash = req->r_direct_is_hash;
+ bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
/*
* is there a specific mds we should try? ignore hint if we have
inode = NULL;
if (req->r_inode) {
inode = req->r_inode;
+ ihold(inode);
} else if (req->r_dentry) {
/* ignore race with rename; old or new d_parent is okay */
- struct dentry *parent = req->r_dentry->d_parent;
- struct inode *dir = d_inode(parent);
+ struct dentry *parent;
+ struct inode *dir;
+
+ rcu_read_lock();
+ parent = req->r_dentry->d_parent;
+ dir = req->r_parent ? : d_inode_rcu(parent);
- if (dir->i_sb != mdsc->fsc->sb) {
- /* not this fs! */
+ if (!dir || dir->i_sb != mdsc->fsc->sb) {
+ /* not this fs or parent went negative */
inode = d_inode(req->r_dentry);
+ if (inode)
+ ihold(inode);
} else if (ceph_snap(dir) != CEPH_NOSNAP) {
/* direct snapped/virtual snapdir requests
* based on parent dir inode */
- struct dentry *dn = get_nonsnap_parent(parent);
- inode = d_inode(dn);
+ inode = get_nonsnap_parent(parent);
dout("__choose_mds using nonsnap parent %p\n", inode);
} else {
/* dentry target */
inode = d_inode(req->r_dentry);
if (!inode || mode == USE_AUTH_MDS) {
/* dir + name */
- inode = dir;
+ inode = igrab(dir);
hash = ceph_dentry_hash(dir, req->r_dentry);
is_hash = true;
+ } else {
+ ihold(inode);
}
}
+ rcu_read_unlock();
}
dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
(int)r, frag.ndist);
if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
CEPH_MDS_STATE_ACTIVE)
- return mds;
+ goto out;
}
/* since this file/dir wasn't known to be
inode, ceph_vinop(inode), frag.frag, mds);
if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
CEPH_MDS_STATE_ACTIVE)
- return mds;
+ goto out;
}
}
}
cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
if (!cap) {
spin_unlock(&ci->i_ceph_lock);
+ iput(inode);
goto random;
}
mds = cap->session->s_mds;
inode, ceph_vinop(inode), mds,
cap == ci->i_auth_cap ? "auth " : "", cap);
spin_unlock(&ci->i_ceph_lock);
+out:
+ iput(inode);
return mds;
random:
while (!list_empty(&session->s_unsafe)) {
req = list_first_entry(&session->s_unsafe,
struct ceph_mds_request, r_unsafe_item);
- list_del_init(&req->r_unsafe_item);
pr_warn_ratelimited(" dropping unsafe request %llu\n",
req->r_tid);
__unregister_request(mdsc, req);
ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
if (ci->i_wrbuffer_ref > 0 &&
- ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
+ READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
invalidate = true;
while (!list_empty(&ci->i_cap_flush_list)) {
return path;
}
-static int build_dentry_path(struct dentry *dentry,
+static int build_dentry_path(struct dentry *dentry, struct inode *dir,
const char **ppath, int *ppathlen, u64 *pino,
int *pfreepath)
{
char *path;
- if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_NOSNAP) {
- *pino = ceph_ino(d_inode(dentry->d_parent));
+ rcu_read_lock();
+ if (!dir)
+ dir = d_inode_rcu(dentry->d_parent);
+ if (dir && ceph_snap(dir) == CEPH_NOSNAP) {
+ *pino = ceph_ino(dir);
+ rcu_read_unlock();
*ppath = dentry->d_name.name;
*ppathlen = dentry->d_name.len;
return 0;
}
+ rcu_read_unlock();
path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
if (IS_ERR(path))
return PTR_ERR(path);
* an explicit ino+path.
*/
static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
- const char *rpath, u64 rino,
- const char **ppath, int *pathlen,
+ struct inode *rdiri, const char *rpath,
+ u64 rino, const char **ppath, int *pathlen,
u64 *ino, int *freepath)
{
int r = 0;
dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
ceph_snap(rinode));
} else if (rdentry) {
- r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath);
+ r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino,
+ freepath);
dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
*ppath);
} else if (rpath || rino) {
int ret;
ret = set_request_path_attr(req->r_inode, req->r_dentry,
- req->r_path1, req->r_ino1.ino,
+ req->r_parent, req->r_path1, req->r_ino1.ino,
&path1, &pathlen1, &ino1, &freepath1);
if (ret < 0) {
msg = ERR_PTR(ret);
}
ret = set_request_path_attr(NULL, req->r_old_dentry,
+ req->r_old_dentry_dir,
req->r_path2, req->r_ino2.ino,
&path2, &pathlen2, &ino2, &freepath2);
if (ret < 0) {
mds, req->r_inode_drop, req->r_inode_unless, 0);
if (req->r_dentry_drop)
releases += ceph_encode_dentry_release(&p, req->r_dentry,
- mds, req->r_dentry_drop, req->r_dentry_unless);
+ req->r_parent, mds, req->r_dentry_drop,
+ req->r_dentry_unless);
if (req->r_old_dentry_drop)
releases += ceph_encode_dentry_release(&p, req->r_old_dentry,
- mds, req->r_old_dentry_drop, req->r_old_dentry_unless);
+ req->r_old_dentry_dir, mds,
+ req->r_old_dentry_drop,
+ req->r_old_dentry_unless);
if (req->r_old_inode_drop)
releases += ceph_encode_inode_release(&p,
d_inode(req->r_old_dentry),
dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
- if (req->r_got_unsafe) {
+ if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
void *p;
/*
* Replay. Do not regenerate message (and rebuild
rhead = msg->front.iov_base;
rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
- if (req->r_got_unsafe)
+ if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
flags |= CEPH_MDS_FLAG_REPLAY;
- if (req->r_locked_dir)
+ if (req->r_parent)
flags |= CEPH_MDS_FLAG_WANT_DENTRY;
rhead->flags = cpu_to_le32(flags);
rhead->num_fwd = req->r_num_fwd;
rhead->num_retry = req->r_attempts - 1;
rhead->ino = 0;
- dout(" r_locked_dir = %p\n", req->r_locked_dir);
+ dout(" r_parent = %p\n", req->r_parent);
return 0;
}
int mds = -1;
int err = 0;
- if (req->r_err || req->r_got_result) {
- if (req->r_aborted)
+ if (req->r_err || test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
+ if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))
__unregister_request(mdsc, req);
goto out;
}
err = -EIO;
goto finish;
}
- if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+ if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
dout("do_request forced umount\n");
err = -EIO;
goto finish;
}
- if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_MOUNTING) {
+ if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_MOUNTING) {
if (mdsc->mdsmap_err) {
err = mdsc->mdsmap_err;
dout("do_request mdsmap err %d\n", err);
while (p) {
req = rb_entry(p, struct ceph_mds_request, r_node);
p = rb_next(p);
- if (req->r_got_unsafe)
+ if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
continue;
if (req->r_attempts > 0)
continue; /* only new requests */
dout("do_request on %p\n", req);
- /* take CAP_PIN refs for r_inode, r_locked_dir, r_old_dentry */
+ /* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */
if (req->r_inode)
ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
- if (req->r_locked_dir)
- ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
+ if (req->r_parent)
+ ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
if (req->r_old_dentry_dir)
ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
CEPH_CAP_PIN);
mutex_lock(&mdsc->mutex);
/* only abort if we didn't race with a real reply */
- if (req->r_got_result) {
+ if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
err = le32_to_cpu(req->r_reply_info.head->result);
} else if (err < 0) {
dout("aborted request %lld with %d\n", req->r_tid, err);
*/
mutex_lock(&req->r_fill_mutex);
req->r_err = err;
- req->r_aborted = true;
+ set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
mutex_unlock(&req->r_fill_mutex);
- if (req->r_locked_dir &&
+ if (req->r_parent &&
(req->r_op & CEPH_MDS_OP_WRITE))
ceph_invalidate_dir_request(req);
} else {
*/
void ceph_invalidate_dir_request(struct ceph_mds_request *req)
{
- struct inode *inode = req->r_locked_dir;
+ struct inode *inode = req->r_parent;
dout("invalidate_dir_request %p (complete, lease(s))\n", inode);
}
/* dup? */
- if ((req->r_got_unsafe && !head->safe) ||
- (req->r_got_safe && head->safe)) {
+ if ((test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags) && !head->safe) ||
+ (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags) && head->safe)) {
pr_warn("got a dup %s reply on %llu from mds%d\n",
head->safe ? "safe" : "unsafe", tid, mds);
mutex_unlock(&mdsc->mutex);
goto out;
}
- if (req->r_got_safe) {
+ if (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags)) {
pr_warn("got unsafe after safe on %llu from mds%d\n",
tid, mds);
mutex_unlock(&mdsc->mutex);
if (head->safe) {
- req->r_got_safe = true;
+ set_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags);
__unregister_request(mdsc, req);
- if (req->r_got_unsafe) {
+ if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
/*
* We already handled the unsafe response, now do the
* cleanup. No need to examine the response; the MDS
* useful we could do with a revised return value.
*/
dout("got safe reply %llu, mds%d\n", tid, mds);
- list_del_init(&req->r_unsafe_item);
/* last unsafe request during umount? */
if (mdsc->stopping && !__get_oldest_req(mdsc))
goto out;
}
} else {
- req->r_got_unsafe = true;
+ set_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags);
list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
if (req->r_unsafe_dir) {
struct ceph_inode_info *ci =
/* insert trace into our cache */
mutex_lock(&req->r_fill_mutex);
current->journal_info = req;
- err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
+ err = ceph_fill_trace(mdsc->fsc->sb, req);
if (err == 0) {
if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
req->r_op == CEPH_MDS_OP_LSSNAP))
if (realm)
ceph_put_snap_realm(mdsc, realm);
- if (err == 0 && req->r_got_unsafe && req->r_target_inode) {
+ if (err == 0 && req->r_target_inode &&
+ test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
spin_lock(&ci->i_unsafe_lock);
list_add_tail(&req->r_unsafe_target_item, &ci->i_unsafe_iops);
}
out_err:
mutex_lock(&mdsc->mutex);
- if (!req->r_aborted) {
+ if (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
if (err) {
req->r_err = err;
} else {
req->r_reply = ceph_msg_get(msg);
- req->r_got_result = true;
+ set_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags);
}
} else {
dout("reply arrived after request %lld was aborted\n", tid);
goto out; /* dup reply? */
}
- if (req->r_aborted) {
+ if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
dout("forward tid %llu aborted, unregistering\n", tid);
__unregister_request(mdsc, req);
} else if (fwd_seq <= req->r_num_fwd) {
/* resend. forward race not possible; mds would drop */
dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
BUG_ON(req->r_err);
- BUG_ON(req->r_got_result);
+ BUG_ON(test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags));
req->r_attempts = 0;
req->r_num_fwd = fwd_seq;
req->r_resend_mds = next_mds;
while (p) {
req = rb_entry(p, struct ceph_mds_request, r_node);
p = rb_next(p);
- if (req->r_got_unsafe)
+ if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
continue;
if (req->r_attempts == 0)
continue; /* only old requests */
{
u64 want_tid, want_flush;
- if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
+ if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
return;
dout("sync\n");
*/
static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped)
{
- if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
+ if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
return true;
return atomic_read(&mdsc->num_sessions) <= skipped;
}
char *r_path1, *r_path2;
struct ceph_vino r_ino1, r_ino2;
- struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
+ struct inode *r_parent; /* parent dir inode */
struct inode *r_target_inode; /* resulting inode */
+#define CEPH_MDS_R_DIRECT_IS_HASH (1) /* r_direct_hash is valid */
+#define CEPH_MDS_R_ABORTED (2) /* call was aborted */
+#define CEPH_MDS_R_GOT_UNSAFE (3) /* got an unsafe reply */
+#define CEPH_MDS_R_GOT_SAFE (4) /* got a safe reply */
+#define CEPH_MDS_R_GOT_RESULT (5) /* got a result */
+#define CEPH_MDS_R_DID_PREPOPULATE (6) /* prepopulated readdir */
+#define CEPH_MDS_R_PARENT_LOCKED (7) /* is r_parent->i_rwsem wlocked? */
+ unsigned long r_req_flags;
+
struct mutex r_fill_mutex;
union ceph_mds_request_args r_args;
/* for choosing which mds to send this request to */
int r_direct_mode;
u32 r_direct_hash; /* choose dir frag based on this dentry hash */
- bool r_direct_is_hash; /* true if r_direct_hash is valid */
/* data payload is used for xattr ops */
struct ceph_pagelist *r_pagelist;
struct ceph_mds_reply_info_parsed r_reply_info;
struct page *r_locked_page;
int r_err;
- bool r_aborted;
unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */
unsigned long r_started; /* start time to measure timeout against */
ceph_mds_request_callback_t r_callback;
ceph_mds_request_wait_callback_t r_wait_for_completion;
struct list_head r_unsafe_item; /* per-session unsafe list item */
- bool r_got_unsafe, r_got_safe, r_got_result;
- bool r_did_prepopulate;
long long r_dir_release_cnt;
long long r_dir_ordered_cnt;
int r_readdir_cache_idx;
.destroy_inode = ceph_destroy_inode,
.write_inode = ceph_write_inode,
.drop_inode = ceph_drop_inode,
- .evict_inode = ceph_evict_inode,
.sync_fs = ceph_sync_fs,
.put_super = ceph_put_super,
.show_options = ceph_show_options,
fsc->backing_dev_info.ra_pages =
VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+ if (fsc->mount_options->rsize > fsc->mount_options->rasize &&
+ fsc->mount_options->rsize >= PAGE_SIZE)
+ fsc->backing_dev_info.io_pages =
+ (fsc->mount_options->rsize + PAGE_SIZE - 1)
+ >> PAGE_SHIFT;
+ else if (fsc->mount_options->rsize == 0)
+ fsc->backing_dev_info.io_pages = ULONG_MAX;
+
err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld",
atomic_long_inc_return(&bdi_seq));
if (!err)
#define ceph_test_mount_opt(fsc, opt) \
(!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
-#define CEPH_RSIZE_DEFAULT 0 /* max read size */
-#define CEPH_RASIZE_DEFAULT (8192*1024) /* readahead */
+#define CEPH_RSIZE_DEFAULT (64*1024*1024) /* max read size */
+#define CEPH_RASIZE_DEFAULT (8192*1024) /* max readahead */
#define CEPH_MAX_READDIR_DEFAULT 1024
#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. */
u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
- struct list_head i_unsafe_writes; /* uncommitted sync writes */
struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */
struct list_head i_unsafe_iops; /* uncommitted mds inode ops */
spinlock_t i_unsafe_lock;
}
/* what the mds thinks we want */
-extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
+extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check);
extern void ceph_caps_init(struct ceph_mds_client *mdsc);
extern void ceph_caps_finalize(struct ceph_mds_client *mdsc);
extern struct inode *ceph_alloc_inode(struct super_block *sb);
extern void ceph_destroy_inode(struct inode *inode);
extern int ceph_drop_inode(struct inode *inode);
-extern void ceph_evict_inode(struct inode *inode);
extern struct inode *ceph_get_inode(struct super_block *sb,
struct ceph_vino vino);
u64 time_warp_seq, struct timespec *ctime,
struct timespec *mtime, struct timespec *atime);
extern int ceph_fill_trace(struct super_block *sb,
- struct ceph_mds_request *req,
- struct ceph_mds_session *session);
+ struct ceph_mds_request *req);
extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct ceph_mds_session *session);
extern int ceph_encode_inode_release(void **p, struct inode *inode,
int mds, int drop, int unless, int force);
extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
+ struct inode *dir,
int mds, int drop, int unless);
extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
extern int ceph_release(struct inode *inode, struct file *filp);
extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
char *data, size_t len);
-extern void ceph_sync_write_wait(struct inode *inode);
+
/* dir.c */
extern const struct file_operations ceph_dir_fops;
extern const struct file_operations ceph_snapdir_fops;
dprintk("lockd_inet6addr_event: removed %pI6\n", &ifa->addr);
sin6.sin6_family = AF_INET6;
sin6.sin6_addr = ifa->addr;
+ if (ipv6_addr_type(&sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL)
+ sin6.sin6_scope_id = ifa->idev->dev->ifindex;
svc_age_temp_xprts_now(nlmsvc_rqst->rq_server,
(struct sockaddr *)&sin6);
}
.vs_proc = nfs4_callback_procedures1,
.vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
.vs_dispatch = NULL,
- .vs_hidden = 1,
+ .vs_hidden = true,
+ .vs_need_cong_ctrl = true,
};
struct svc_version nfs4_callback_version4 = {
.vs_proc = nfs4_callback_procedures1,
.vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
.vs_dispatch = NULL,
- .vs_hidden = 1,
+ .vs_hidden = true,
+ .vs_need_cong_ctrl = true,
};
{ NFSEXP_NOAUTHNLM, {"insecure_locks", ""}},
{ NFSEXP_V4ROOT, {"v4root", ""}},
{ NFSEXP_PNFS, {"pnfs", ""}},
+ { NFSEXP_SECURITY_LABEL, {"security_label", ""}},
{ 0, {"", ""}}
};
.vs_proc = nfsd_acl_procedures2,
.vs_dispatch = nfsd_dispatch,
.vs_xdrsize = NFS3_SVC_XDRSIZE,
- .vs_hidden = 0,
};
.vs_proc = nfsd_acl_procedures3,
.vs_dispatch = nfsd_dispatch,
.vs_xdrsize = NFS3_SVC_XDRSIZE,
- .vs_hidden = 0,
};
fh_copy(&resp->fh, &argp->fh);
resp->committed = argp->stable;
- nfserr = nfsd_write(rqstp, &resp->fh, NULL,
- argp->offset,
- rqstp->rq_vec, argp->vlen,
- &cnt,
- &resp->committed);
+ nfserr = nfsd_write(rqstp, &resp->fh, argp->offset,
+ rqstp->rq_vec, argp->vlen,
+ &cnt, resp->committed);
resp->count = cnt;
RETURN_STATUS(nfserr);
}
p = xdr_inline_decode(xdr, length + 4);
if (unlikely(p == NULL))
goto out_overflow;
+ p += XDR_QUADLEN(length);
hdr->nops = be32_to_cpup(p);
return 0;
out_overflow:
struct nfsd4_callback *cb)
{
struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
- struct nfs4_sessionid id;
- int status;
+ int status = -ESERVERFAULT;
__be32 *p;
u32 dummy;
- status = -ESERVERFAULT;
-
/*
* If the server returns different values for sessionID, slotID or
* sequence number, the server is looney tunes.
p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4 + 4 + 4);
if (unlikely(p == NULL))
goto out_overflow;
- memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN);
- if (memcmp(id.data, session->se_sessionid.data,
- NFS4_MAX_SESSIONID_LEN) != 0) {
+
+ if (memcmp(p, session->se_sessionid.data, NFS4_MAX_SESSIONID_LEN)) {
dprintk("NFS: %s Invalid session id\n", __func__);
goto out;
}
return 0;
}
+void cleanup_callback_cred(void)
+{
+ if (callback_cred) {
+ put_rpccred(callback_cred);
+ callback_cred = NULL;
+ }
+}
+
static struct rpc_cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc_clnt *client, struct nfsd4_session *ses)
{
if (clp->cl_minorversion == 0) {
{
__be32 status;
u32 id = -1;
+
+ if (name == NULL || namelen == 0)
+ return nfserr_inval;
+
status = do_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, &id);
*uid = make_kuid(&init_user_ns, id);
if (!uid_valid(*uid))
{
__be32 status;
u32 id = -1;
+
+ if (name == NULL || namelen == 0)
+ return nfserr_inval;
+
status = do_name_to_id(rqstp, IDMAP_TYPE_GROUP, name, namelen, &id);
*gid = make_kgid(&init_user_ns, id);
if (!gid_valid(*gid))
u32 *bmval, u32 *writable)
{
struct dentry *dentry = cstate->current_fh.fh_dentry;
+ struct svc_export *exp = cstate->current_fh.fh_export;
if (!nfsd_attrs_supported(cstate->minorversion, bmval))
return nfserr_attrnotsupp;
if ((bmval[0] & FATTR4_WORD0_ACL) && !IS_POSIXACL(d_inode(dentry)))
return nfserr_attrnotsupp;
+ if ((bmval[2] & FATTR4_WORD2_SECURITY_LABEL) &&
+ !(exp->ex_flags & NFSEXP_SECURITY_LABEL))
+ return nfserr_attrnotsupp;
if (writable && !bmval_is_subset(bmval, writable))
return nfserr_inval;
if (writable && (bmval[2] & FATTR4_WORD2_MODE_UMASK) &&
status = nfsd_vfs_write(rqstp, &cstate->current_fh, filp,
write->wr_offset, rqstp->rq_vec, nvecs, &cnt,
- &write->wr_how_written);
+ write->wr_how_written);
fput(filp);
write->wr_bytes_written = cnt;
return (op_encode_hdr_size + op_encode_stateid_maxsz)* sizeof(__be32);
}
+static inline u32 nfsd4_access_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ /* ac_supported, ac_resp_access */
+ return (op_encode_hdr_size + 2)* sizeof(__be32);
+}
+
static inline u32 nfsd4_commit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
{
return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32);
return ret;
}
+static inline u32 nfsd4_getfh_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + 1) * sizeof(__be32) + NFS4_FHSIZE;
+}
+
static inline u32 nfsd4_link_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
{
return (op_encode_hdr_size + op_encode_change_info_maxsz)
XDR_QUADLEN(rlen)) * sizeof(__be32);
}
+static inline u32 nfsd4_readlink_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + 1) * sizeof(__be32) + PAGE_SIZE;
+}
+
static inline u32 nfsd4_remove_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
{
return (op_encode_hdr_size + op_encode_change_info_maxsz)
+ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) * sizeof(__be32);
}
+static inline u32 nfsd4_test_stateid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + 1 + op->u.test_stateid.ts_num_ids)
+ * sizeof(__be32);
+}
+
static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
{
return (op_encode_hdr_size + nfs4_fattr_bitmap_maxsz) * sizeof(__be32);
}
+static inline u32 nfsd4_secinfo_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + RPC_AUTH_MAXFLAVOR *
+ (4 + XDR_QUADLEN(GSS_OID_MAX_LEN))) * sizeof(__be32);
+}
+
static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
{
return (op_encode_hdr_size + 2 + XDR_QUADLEN(NFS4_VERIFIER_SIZE)) *
}
#ifdef CONFIG_NFSD_PNFS
+static inline u32 nfsd4_getdeviceinfo_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ u32 maxcount = 0, rlen = 0;
+
+ maxcount = svc_max_payload(rqstp);
+ rlen = min(op->u.getdeviceinfo.gd_maxcount, maxcount);
+
+ return (op_encode_hdr_size +
+ 1 /* gd_layout_type*/ +
+ XDR_QUADLEN(rlen) +
+ 2 /* gd_notify_types */) * sizeof(__be32);
+}
+
/*
* At this stage we don't really know what layout driver will handle the request,
* so we need to define an arbitrary upper bound here.
}
#endif /* CONFIG_NFSD_PNFS */
+
+static inline u32 nfsd4_seek_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + 3) * sizeof(__be32);
+}
+
static struct nfsd4_operation nfsd4_ops[] = {
[OP_ACCESS] = {
.op_func = (nfsd4op_func)nfsd4_access,
.op_name = "OP_ACCESS",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_access_rsize,
},
[OP_CLOSE] = {
.op_func = (nfsd4op_func)nfsd4_close,
[OP_GETFH] = {
.op_func = (nfsd4op_func)nfsd4_getfh,
.op_name = "OP_GETFH",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_getfh_rsize,
},
[OP_LINK] = {
.op_func = (nfsd4op_func)nfsd4_link,
[OP_LOCKT] = {
.op_func = (nfsd4op_func)nfsd4_lockt,
.op_name = "OP_LOCKT",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_lock_rsize,
},
[OP_LOCKU] = {
.op_func = (nfsd4op_func)nfsd4_locku,
.op_func = (nfsd4op_func)nfsd4_lookup,
.op_flags = OP_HANDLES_WRONGSEC | OP_CLEAR_STATEID,
.op_name = "OP_LOOKUP",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
[OP_LOOKUPP] = {
.op_func = (nfsd4op_func)nfsd4_lookupp,
.op_flags = OP_HANDLES_WRONGSEC | OP_CLEAR_STATEID,
.op_name = "OP_LOOKUPP",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
[OP_NVERIFY] = {
.op_func = (nfsd4op_func)nfsd4_nverify,
.op_name = "OP_NVERIFY",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
[OP_OPEN] = {
.op_func = (nfsd4op_func)nfsd4_open,
[OP_READLINK] = {
.op_func = (nfsd4op_func)nfsd4_readlink,
.op_name = "OP_READLINK",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_readlink_rsize,
},
[OP_REMOVE] = {
.op_func = (nfsd4op_func)nfsd4_remove,
.op_func = (nfsd4op_func)nfsd4_secinfo,
.op_flags = OP_HANDLES_WRONGSEC,
.op_name = "OP_SECINFO",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_secinfo_rsize,
},
[OP_SETATTR] = {
.op_func = (nfsd4op_func)nfsd4_setattr,
[OP_VERIFY] = {
.op_func = (nfsd4op_func)nfsd4_verify,
.op_name = "OP_VERIFY",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
[OP_WRITE] = {
.op_func = (nfsd4op_func)nfsd4_write,
.op_func = (nfsd4op_func)nfsd4_secinfo_no_name,
.op_flags = OP_HANDLES_WRONGSEC,
.op_name = "OP_SECINFO_NO_NAME",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_secinfo_rsize,
},
[OP_TEST_STATEID] = {
.op_func = (nfsd4op_func)nfsd4_test_stateid,
.op_flags = ALLOWED_WITHOUT_FH,
.op_name = "OP_TEST_STATEID",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_test_stateid_rsize,
},
[OP_FREE_STATEID] = {
.op_func = (nfsd4op_func)nfsd4_free_stateid,
.op_func = (nfsd4op_func)nfsd4_getdeviceinfo,
.op_flags = ALLOWED_WITHOUT_FH,
.op_name = "OP_GETDEVICEINFO",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_getdeviceinfo_rsize,
},
[OP_LAYOUTGET] = {
.op_func = (nfsd4op_func)nfsd4_layoutget,
[OP_SEEK] = {
.op_func = (nfsd4op_func)nfsd4_seek,
.op_name = "OP_SEEK",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_seek_rsize,
},
};
int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op)
{
- struct nfsd4_operation *opdesc;
- nfsd4op_rsize estimator;
-
if (op->opnum == OP_ILLEGAL)
return op_encode_hdr_size * sizeof(__be32);
- opdesc = OPDESC(op);
- estimator = opdesc->op_rsize_bop;
- return estimator ? estimator(rqstp, op) : PAGE_SIZE;
+
+ BUG_ON(OPDESC(op)->op_rsize_bop == NULL);
+ return OPDESC(op)->op_rsize_bop(rqstp, op);
}
void warn_on_nonidempotent_op(struct nfsd4_op *op)
};
struct svc_version nfsd_version4 = {
- .vs_vers = 4,
- .vs_nproc = 2,
- .vs_proc = nfsd_procedures4,
- .vs_dispatch = nfsd_dispatch,
- .vs_xdrsize = NFS4_SVC_XDRSIZE,
- .vs_rpcb_optnl = 1,
+ .vs_vers = 4,
+ .vs_nproc = 2,
+ .vs_proc = nfsd_procedures4,
+ .vs_dispatch = nfsd_dispatch,
+ .vs_xdrsize = NFS4_SVC_XDRSIZE,
+ .vs_rpcb_optnl = true,
+ .vs_need_cong_ctrl = true,
};
/*
out_err:
conn->cb_addr.ss_family = AF_UNSPEC;
conn->cb_addrlen = 0;
- dprintk(KERN_INFO "NFSD: this client (clientid %08x/%08x) "
+ dprintk("NFSD: this client (clientid %08x/%08x) "
"will not receive delegations\n",
clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id);
ret = set_callback_cred();
if (ret)
- return -ENOMEM;
+ return ret;
+
laundry_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, "nfsd4");
if (laundry_wq == NULL) {
ret = -ENOMEM;
- goto out_recovery;
+ goto out_cleanup_cred;
}
ret = nfsd4_create_callback_queue();
if (ret)
goto out_free_laundry;
set_max_delegations();
-
return 0;
out_free_laundry:
destroy_workqueue(laundry_wq);
-out_recovery:
+out_cleanup_cred:
+ cleanup_callback_cred();
return ret;
}
{
destroy_workqueue(laundry_wq);
nfsd4_destroy_callback_queue();
+ cleanup_callback_cred();
}
static void
#define NFSDDBG_FACILITY NFSDDBG_XDR
-u32 nfsd_suppattrs[3][3] = {
+const u32 nfsd_suppattrs[3][3] = {
{NFSD4_SUPPORTED_ATTRS_WORD0,
NFSD4_SUPPORTED_ATTRS_WORD1,
NFSD4_SUPPORTED_ATTRS_WORD2},
READ_BUF(16);
p = xdr_decode_hyper(p, &write->wr_offset);
write->wr_stable_how = be32_to_cpup(p++);
- if (write->wr_stable_how > 2)
+ if (write->wr_stable_how > NFS_FILE_SYNC)
goto xdr_error;
write->wr_buflen = be32_to_cpup(p++);
} else
max_reply += nfsd4_max_reply(argp->rqstp, op);
/*
- * OP_LOCK may return a conflicting lock. (Special case
- * because it will just skip encoding this if it runs
- * out of xdr buffer space, and it is the only operation
- * that behaves this way.)
+ * OP_LOCK and OP_LOCKT may return a conflicting lock.
+ * (Special case because they will just skip encoding this
+ * if they run out of xdr buffer space, and they are the
+ * only operations that behave this way.)
*/
- if (op->opnum == OP_LOCK)
+ if (op->opnum == OP_LOCK || op->opnum == OP_LOCKT)
max_reply += NFS4_OPAQUE_LIMIT;
if (op->status) {
DECODE_TAIL;
}
-static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode)
+static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode,
+ struct svc_export *exp)
{
- if (IS_I_VERSION(inode)) {
+ if (exp->ex_flags & NFSEXP_V4ROOT) {
+ *p++ = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time));
+ *p++ = 0;
+ } else if (IS_I_VERSION(inode)) {
p = xdr_encode_hyper(p, inode->i_version);
} else {
*p++ = cpu_to_be32(stat->ctime.tv_sec);
#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
if ((bmval2 & FATTR4_WORD2_SECURITY_LABEL) ||
bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) {
- err = security_inode_getsecctx(d_inode(dentry),
+ if (exp->ex_flags & NFSEXP_SECURITY_LABEL)
+ err = security_inode_getsecctx(d_inode(dentry),
&context, &contextlen);
+ else
+ err = -EOPNOTSUPP;
contextsupport = (err == 0);
if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
if (err == -EOPNOTSUPP)
p = xdr_reserve_space(xdr, 8);
if (!p)
goto out_resource;
- p = encode_change(p, &stat, d_inode(dentry));
+ p = encode_change(p, &stat, d_inode(dentry), exp);
}
if (bmval0 & FATTR4_WORD0_SIZE) {
p = xdr_reserve_space(xdr, 8);
return rv;
}
+static ssize_t
+nfsd_print_version_support(char *buf, int remaining, const char *sep,
+ unsigned vers, unsigned minor)
+{
+ const char *format = (minor == 0) ? "%s%c%u" : "%s%c%u.%u";
+ bool supported = !!nfsd_vers(vers, NFSD_TEST);
+
+ if (vers == 4 && !nfsd_minorversion(minor, NFSD_TEST))
+ supported = false;
+ return snprintf(buf, remaining, format, sep,
+ supported ? '+' : '-', vers, minor);
+}
+
static ssize_t __write_versions(struct file *file, char *buf, size_t size)
{
char *mesg = buf;
len = qword_get(&mesg, vers, size);
if (len <= 0) return -EINVAL;
do {
+ enum vers_op cmd;
sign = *vers;
if (sign == '+' || sign == '-')
num = simple_strtol((vers+1), &minorp, 0);
if (*minorp == '.') {
if (num != 4)
return -EINVAL;
- minor = simple_strtoul(minorp+1, NULL, 0);
- if (minor == 0)
- return -EINVAL;
- if (nfsd_minorversion(minor, sign == '-' ?
- NFSD_CLEAR : NFSD_SET) < 0)
+ if (kstrtouint(minorp+1, 0, &minor) < 0)
return -EINVAL;
- goto next;
- }
+ } else
+ minor = 0;
+ cmd = sign == '-' ? NFSD_CLEAR : NFSD_SET;
switch(num) {
case 2:
case 3:
- case 4:
- nfsd_vers(num, sign == '-' ? NFSD_CLEAR : NFSD_SET);
+ nfsd_vers(num, cmd);
break;
+ case 4:
+ if (nfsd_minorversion(minor, cmd) >= 0)
+ break;
default:
return -EINVAL;
}
- next:
vers += len + 1;
} while ((len = qword_get(&mesg, vers, size)) > 0);
/* If all get turned off, turn them back on, as
len = 0;
sep = "";
remaining = SIMPLE_TRANSACTION_LIMIT;
- for (num=2 ; num <= 4 ; num++)
- if (nfsd_vers(num, NFSD_AVAIL)) {
- len = snprintf(buf, remaining, "%s%c%d", sep,
- nfsd_vers(num, NFSD_TEST)?'+':'-',
- num);
- sep = " ";
-
- if (len >= remaining)
- break;
- remaining -= len;
- buf += len;
- tlen += len;
- }
- if (nfsd_vers(4, NFSD_AVAIL))
- for (minor = 1; minor <= NFSD_SUPPORTED_MINOR_VERSION;
- minor++) {
- len = snprintf(buf, remaining, " %c4.%u",
- (nfsd_vers(4, NFSD_TEST) &&
- nfsd_minorversion(minor, NFSD_TEST)) ?
- '+' : '-',
- minor);
-
+ for (num=2 ; num <= 4 ; num++) {
+ if (!nfsd_vers(num, NFSD_AVAIL))
+ continue;
+ minor = 0;
+ do {
+ len = nfsd_print_version_support(buf, remaining,
+ sep, num, minor);
if (len >= remaining)
- break;
+ goto out;
remaining -= len;
buf += len;
tlen += len;
- }
-
+ minor++;
+ sep = " ";
+ } while (num == 4 && minor <= NFSD_SUPPORTED_MINOR_VERSION);
+ }
+out:
len = snprintf(buf, remaining, "\n");
if (len >= remaining)
return -EINVAL;
FATTR4_WORD2_MODE_UMASK | \
NFSD4_2_SECURITY_ATTRS)
-extern u32 nfsd_suppattrs[3][3];
+extern const u32 nfsd_suppattrs[3][3];
-static inline bool bmval_is_subset(u32 *bm1, u32 *bm2)
+static inline bool bmval_is_subset(const u32 *bm1, const u32 *bm2)
{
return !((bm1[0] & ~bm2[0]) ||
(bm1[1] & ~bm2[1]) ||
(bm1[2] & ~bm2[2]));
}
-static inline bool nfsd_attrs_supported(u32 minorversion, u32 *bmval)
+static inline bool nfsd_attrs_supported(u32 minorversion, const u32 *bmval)
{
return bmval_is_subset(bmval, nfsd_suppattrs[minorversion]);
}
struct nfsd_attrstat *resp)
{
__be32 nfserr;
- int stable = 1;
unsigned long cnt = argp->len;
dprintk("nfsd: WRITE %s %d bytes at %d\n",
SVCFH_fmt(&argp->fh),
argp->len, argp->offset);
- nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), NULL,
- argp->offset,
- rqstp->rq_vec, argp->vlen,
- &cnt,
- &stable);
+ nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), argp->offset,
+ rqstp->rq_vec, argp->vlen, &cnt, NFS_DATA_SYNC);
return nfsd_return_attrs(nfserr, resp);
}
return 0;
}
+static void
+nfsd_adjust_nfsd_versions4(void)
+{
+ unsigned i;
+
+ for (i = 0; i <= NFSD_SUPPORTED_MINOR_VERSION; i++) {
+ if (nfsd_supported_minorversions[i])
+ return;
+ }
+ nfsd_vers(4, NFSD_CLEAR);
+}
+
int nfsd_minorversion(u32 minorversion, enum vers_op change)
{
if (minorversion > NFSD_SUPPORTED_MINOR_VERSION)
switch(change) {
case NFSD_SET:
nfsd_supported_minorversions[minorversion] = true;
+ nfsd_vers(4, NFSD_SET);
break;
case NFSD_CLEAR:
nfsd_supported_minorversions[minorversion] = false;
+ nfsd_adjust_nfsd_versions4();
break;
case NFSD_TEST:
return nfsd_supported_minorversions[minorversion];
dprintk("nfsd_inet6addr_event: removed %pI6\n", &ifa->addr);
sin6.sin6_family = AF_INET6;
sin6.sin6_addr = ifa->addr;
+ if (ipv6_addr_type(&sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL)
+ sin6.sin6_scope_id = ifa->idev->dev->ifindex;
svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin6);
}
extern __be32 nfs4_check_open_reclaim(clientid_t *clid,
struct nfsd4_compound_state *cstate, struct nfsd_net *nn);
extern int set_callback_cred(void);
+extern void cleanup_callback_cred(void);
extern void nfsd4_probe_callback(struct nfs4_client *clp);
extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
__be32 err;
int host_err;
bool get_write_count;
- int size_change = 0;
+ bool size_change = (iap->ia_valid & ATTR_SIZE);
if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE))
accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE;
/* Get inode */
err = fh_verify(rqstp, fhp, ftype, accmode);
if (err)
- goto out;
+ return err;
if (get_write_count) {
host_err = fh_want_write(fhp);
if (host_err)
- return nfserrno(host_err);
+ goto out;
}
dentry = fhp->fh_dentry;
iap->ia_valid &= ~ATTR_MODE;
if (!iap->ia_valid)
- goto out;
+ return 0;
nfsd_sanitize_attrs(inode, iap);
+ if (check_guard && guardtime != inode->i_ctime.tv_sec)
+ return nfserr_notsync;
+
/*
* The size case is special, it changes the file in addition to the
- * attributes.
+ * attributes, and file systems don't expect it to be mixed with
+ * "random" attribute changes. We thus split out the size change
+ * into a separate call to ->setattr, and do the rest as a separate
+ * setattr call.
*/
- if (iap->ia_valid & ATTR_SIZE) {
+ if (size_change) {
err = nfsd_get_write_access(rqstp, fhp, iap);
if (err)
- goto out;
- size_change = 1;
+ return err;
+ }
+ fh_lock(fhp);
+ if (size_change) {
/*
* RFC5661, Section 18.30.4:
* Changing the size of a file with SETATTR indirectly
*
* (and similar for the older RFCs)
*/
- if (iap->ia_size != i_size_read(inode))
- iap->ia_valid |= ATTR_MTIME;
- }
+ struct iattr size_attr = {
+ .ia_valid = ATTR_SIZE | ATTR_CTIME | ATTR_MTIME,
+ .ia_size = iap->ia_size,
+ };
- iap->ia_valid |= ATTR_CTIME;
+ host_err = notify_change(dentry, &size_attr, NULL);
+ if (host_err)
+ goto out_unlock;
+ iap->ia_valid &= ~ATTR_SIZE;
- if (check_guard && guardtime != inode->i_ctime.tv_sec) {
- err = nfserr_notsync;
- goto out_put_write_access;
+ /*
+ * Avoid the additional setattr call below if the only other
+ * attribute that the client sends is the mtime, as we update
+ * it as part of the size change above.
+ */
+ if ((iap->ia_valid & ~ATTR_MTIME) == 0)
+ goto out_unlock;
}
- fh_lock(fhp);
+ iap->ia_valid |= ATTR_CTIME;
host_err = notify_change(dentry, iap, NULL);
- fh_unlock(fhp);
- err = nfserrno(host_err);
-out_put_write_access:
+out_unlock:
+ fh_unlock(fhp);
if (size_change)
put_write_access(inode);
- if (!err)
- err = nfserrno(commit_metadata(fhp));
out:
- return err;
+ if (!host_err)
+ host_err = commit_metadata(fhp);
+ return nfserrno(host_err);
}
#if defined(CONFIG_NFSD_V4)
__be32
nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
loff_t offset, struct kvec *vec, int vlen,
- unsigned long *cnt, int *stablep)
+ unsigned long *cnt, int stable)
{
struct svc_export *exp;
- struct inode *inode;
mm_segment_t oldfs;
__be32 err = 0;
int host_err;
- int stable = *stablep;
int use_wgather;
loff_t pos = offset;
unsigned int pflags = current->flags;
*/
current->flags |= PF_LESS_THROTTLE;
- inode = file_inode(file);
- exp = fhp->fh_export;
-
+ exp = fhp->fh_export;
use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp);
if (!EX_ISSYNC(exp))
- stable = 0;
+ stable = NFS_UNSTABLE;
if (stable && !use_wgather)
flags |= RWF_SYNC;
* N.B. After this call fhp needs an fh_put
*/
__be32
-nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
- loff_t offset, struct kvec *vec, int vlen, unsigned long *cnt,
- int *stablep)
+nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
+ struct kvec *vec, int vlen, unsigned long *cnt, int stable)
{
- __be32 err = 0;
+ struct file *file = NULL;
+ __be32 err = 0;
trace_write_start(rqstp, fhp, offset, vlen);
- if (file) {
- err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
- NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE);
- if (err)
- goto out;
- trace_write_opened(rqstp, fhp, offset, vlen);
- err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt,
- stablep);
- trace_write_io_done(rqstp, fhp, offset, vlen);
- } else {
- err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file);
- if (err)
- goto out;
+ err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file);
+ if (err)
+ goto out;
- trace_write_opened(rqstp, fhp, offset, vlen);
- if (cnt)
- err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen,
- cnt, stablep);
- trace_write_io_done(rqstp, fhp, offset, vlen);
- fput(file);
- }
+ trace_write_opened(rqstp, fhp, offset, vlen);
+ err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, stable);
+ trace_write_io_done(rqstp, fhp, offset, vlen);
+ fput(file);
out:
trace_write_done(rqstp, fhp, offset, vlen);
return err;
unsigned long *);
__be32 nfsd_read(struct svc_rqst *, struct svc_fh *,
loff_t, struct kvec *, int, unsigned long *);
-__be32 nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *,
- loff_t, struct kvec *,int, unsigned long *, int *);
+__be32 nfsd_write(struct svc_rqst *, struct svc_fh *, loff_t,
+ struct kvec *, int, unsigned long *, int);
__be32 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct file *file, loff_t offset,
struct kvec *vec, int vlen, unsigned long *cnt,
- int *stablep);
+ int stable);
__be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *,
char *, int *);
__be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *,
* completion callback for async writepages
*/
typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *);
-typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool);
#define CEPH_HOMELESS_OSD -1
unsigned int r_num_ops;
int r_result;
- bool r_got_reply;
struct ceph_osd_client *r_osdc;
struct kref r_kref;
bool r_mempool;
- struct completion r_completion;
- struct completion r_done_completion; /* fsync waiter */
+ struct completion r_completion; /* private to osd_client.c */
ceph_osdc_callback_t r_callback;
- ceph_osdc_unsafe_callback_t r_unsafe_callback;
struct list_head r_unsafe_item;
struct inode *r_inode; /* for use by callbacks */
case CEPH_POOL_TYPE_EC:
return false;
default:
- BUG_ON(1);
+ BUG();
}
}
const struct ceph_object_locator *src);
void ceph_oloc_destroy(struct ceph_object_locator *oloc);
-/*
- * Maximum supported by kernel client object name length
- *
- * (probably outdated: must be >= RBD_MAX_MD_NAME_LEN -- currently 100)
- */
-#define CEPH_MAX_OID_NAME_LEN 100
-
/*
* 51-char inline_name is long enough for all cephfs and all but one
* rbd requests: <imgname> in "<imgname>.rbd"/"rbd_id.<imgname>" can be
* the list of osds that store+replicate them. */
struct crush_map *crush;
- struct mutex crush_scratch_mutex;
- int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3];
+ struct mutex crush_workspace_mutex;
+ void *crush_workspace;
};
static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd)
#define CEPH_PG_LAYOUT_LINEAR 2
#define CEPH_PG_LAYOUT_HYBRID 3
-#define CEPH_PG_MAX_SIZE 16 /* max # osds in a single pg */
+#define CEPH_PG_MAX_SIZE 32 /* max # osds in a single pg */
/*
* placement group.
#endif
#endif
+/*
+ * annotate_unreachable() - record the current code position as unreachable.
+ *
+ * With CONFIG_STACK_VALIDATION, this emits an asm statement that places a
+ * numeric local label (numbered after __LINE__) at the current position and
+ * stores a .long referring back to that label in the "__unreachable"
+ * section. Presumably the section is consumed by the stack-validation
+ * tooling -- confirm against the consumer.
+ *
+ * Without CONFIG_STACK_VALIDATION the macro expands to nothing.
+ */
+#ifdef CONFIG_STACK_VALIDATION
+#define annotate_unreachable() ({ \
+ asm("%c0:\t\n" \
+ ".pushsection __unreachable, \"a\"\t\n" \
+ ".long %c0b\t\n" \
+ ".popsection\t\n" : : "i" (__LINE__)); \
+})
+#else
+#define annotate_unreachable()
+#endif
+
/*
* Mark a position in code as unreachable. This can be used to
* suppress control flow warnings after asm blocks that transfer
* this in the preprocessor, but we can live with this because they're
* unreleased. Really, we need to have autoconf for the kernel.
*/
-#define unreachable() __builtin_unreachable()
+#define unreachable() \
+ do { annotate_unreachable(); __builtin_unreachable(); } while (0)
/* Mark a function definition as prohibited from being cloned. */
#define __noclone __attribute__((__noclone__, __optimize__("no-tracer")))
__u32 size; /* num items */
__s32 *items;
- /*
- * cached random permutation: used for uniform bucket and for
- * the linear search fallback for the other bucket types.
- */
- __u32 perm_x; /* @x for which *perm is defined */
- __u32 perm_n; /* num elements of *perm that are permuted/defined */
- __u32 *perm;
};
struct crush_bucket_uniform {
* device fails. */
__u8 chooseleaf_stable;
+ /*
+ * This value is calculated after decode or construction by
+ * the builder. It is exposed here (rather than having a
+ * 'build CRUSH working space' function) so that callers can
+ * reserve a static buffer, allocate space on the stack, or
+ * otherwise avoid calling into the heap allocator if they
+ * want to. The size of the working space depends on the map,
+ * while the size of the scratch vector passed to the mapper
+ * depends on the size of the desired result set.
+ *
+ * Nothing stops the caller from allocating both in one swell
+ * foop and passing in two pointers, though.
+ */
+ size_t working_size;
+
#ifndef __KERNEL__
/*
* version 0 (original) of straw_calc has various flaws. version 1
return ((i+1) << 1)-1;
}
+/*
+ * These data structures are private to the CRUSH implementation. They
+ * are exposed in this header file because builder needs their
+ * definitions to calculate the total working size.
+ *
+ * Moving this out of the crush map allows us to treat the CRUSH map as
+ * immutable within the mapper and removes the requirement for a CRUSH
+ * map lock.
+ */
+
+/*
+ * Per-bucket cached random permutation state. These are the perm_x,
+ * perm_n and perm fields formerly kept in struct crush_bucket.
+ */
+struct crush_work_bucket {
+ __u32 perm_x; /* @x for which *perm is defined */
+ __u32 perm_n; /* num elements of *perm that are permuted/defined */
+ __u32 *perm; /* Permutation of the bucket's items */
+};
+
+/*
+ * Top-level working space: an array of pointers to the per-bucket
+ * working state above.
+ */
+struct crush_work {
+ struct crush_work_bucket **work; /* Per-bucket working store */
+};
+
#endif
int ruleno,
int x, int *result, int result_max,
const __u32 *weights, int weight_max,
- int *scratch);
+ void *cwin);
+
+/*
+ * Returns the exact amount of workspace that will need to be used
+ * for a given combination of crush_map and result_max. The caller can
+ * then allocate this much on its own, either on the stack, in a
+ * per-thread long-lived buffer, or however it likes.
+ *
+ * @map: crush map whose working_size field supplies the map-dependent part
+ * @result_max: maximum number of results the caller will ask for
+ *
+ * The value is map->working_size plus room for 3 * @result_max __u32
+ * entries, in bytes.
+ */
+static inline size_t crush_work_size(const struct crush_map *map,
+ int result_max)
+{
+ return map->working_size + result_max * 3 * sizeof(__u32);
+}
+
+void crush_init_workspace(const struct crush_map *map, void *v);
#endif
#ifndef _LINUX_REFCOUNT_H
#define _LINUX_REFCOUNT_H
-/*
- * Variant of atomic_t specialized for reference counts.
- *
- * The interface matches the atomic_t interface (to aid in porting) but only
- * provides the few functions one should use for reference counting.
- *
- * It differs in that the counter saturates at UINT_MAX and will not move once
- * there. This avoids wrapping the counter and causing 'spurious'
- * use-after-free issues.
- *
- * Memory ordering rules are slightly relaxed wrt regular atomic_t functions
- * and provide only what is strictly required for refcounts.
- *
- * The increments are fully relaxed; these will not provide ordering. The
- * rationale is that whatever is used to obtain the object we're increasing the
- * reference count on will provide the ordering. For locked data structures,
- * its the lock acquire, for RCU/lockless data structures its the dependent
- * load.
- *
- * Do note that inc_not_zero() provides a control dependency which will order
- * future stores against the inc, this ensures we'll never modify the object
- * if we did not in fact acquire a reference.
- *
- * The decrements will provide release order, such that all the prior loads and
- * stores will be issued before, it also provides a control dependency, which
- * will order us against the subsequent free().
- *
- * The control dependency is against the load of the cmpxchg (ll/sc) that
- * succeeded. This means the stores aren't fully ordered, but this is fine
- * because the 1->0 transition indicates no concurrency.
- *
- * Note that the allocator is responsible for ordering things between free()
- * and alloc().
- *
- */
-
#include <linux/atomic.h>
-#include <linux/bug.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
-
-#ifdef CONFIG_DEBUG_REFCOUNT
-#define REFCOUNT_WARN(cond, str) WARN_ON(cond)
-#define __refcount_check __must_check
-#else
-#define REFCOUNT_WARN(cond, str) (void)(cond)
-#define __refcount_check
-#endif
+#include <linux/kernel.h>
typedef struct refcount_struct {
atomic_t refs;
return atomic_read(&r->refs);
}
-static inline __refcount_check
-bool refcount_add_not_zero(unsigned int i, refcount_t *r)
-{
- unsigned int old, new, val = atomic_read(&r->refs);
-
- for (;;) {
- if (!val)
- return false;
-
- if (unlikely(val == UINT_MAX))
- return true;
-
- new = val + i;
- if (new < val)
- new = UINT_MAX;
- old = atomic_cmpxchg_relaxed(&r->refs, val, new);
- if (old == val)
- break;
-
- val = old;
- }
-
- REFCOUNT_WARN(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n");
-
- return true;
-}
-
-static inline void refcount_add(unsigned int i, refcount_t *r)
-{
- REFCOUNT_WARN(!refcount_add_not_zero(i, r), "refcount_t: addition on 0; use-after-free.\n");
-}
-
-/*
- * Similar to atomic_inc_not_zero(), will saturate at UINT_MAX and WARN.
- *
- * Provides no memory ordering, it is assumed the caller has guaranteed the
- * object memory to be stable (RCU, etc.). It does provide a control dependency
- * and thereby orders future stores. See the comment on top.
- */
-static inline __refcount_check
-bool refcount_inc_not_zero(refcount_t *r)
-{
- unsigned int old, new, val = atomic_read(&r->refs);
-
- for (;;) {
- new = val + 1;
-
- if (!val)
- return false;
-
- if (unlikely(!new))
- return true;
-
- old = atomic_cmpxchg_relaxed(&r->refs, val, new);
- if (old == val)
- break;
-
- val = old;
- }
-
- REFCOUNT_WARN(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n");
-
- return true;
-}
-
-/*
- * Similar to atomic_inc(), will saturate at UINT_MAX and WARN.
- *
- * Provides no memory ordering, it is assumed the caller already has a
- * reference on the object, will WARN when this is not so.
- */
-static inline void refcount_inc(refcount_t *r)
-{
- REFCOUNT_WARN(!refcount_inc_not_zero(r), "refcount_t: increment on 0; use-after-free.\n");
-}
-
-/*
- * Similar to atomic_dec_and_test(), it will WARN on underflow and fail to
- * decrement when saturated at UINT_MAX.
- *
- * Provides release memory ordering, such that prior loads and stores are done
- * before, and provides a control dependency such that free() must come after.
- * See the comment on top.
- */
-static inline __refcount_check
-bool refcount_sub_and_test(unsigned int i, refcount_t *r)
-{
- unsigned int old, new, val = atomic_read(&r->refs);
-
- for (;;) {
- if (unlikely(val == UINT_MAX))
- return false;
-
- new = val - i;
- if (new > val) {
- REFCOUNT_WARN(new > val, "refcount_t: underflow; use-after-free.\n");
- return false;
- }
-
- old = atomic_cmpxchg_release(&r->refs, val, new);
- if (old == val)
- break;
-
- val = old;
- }
-
- return !new;
-}
-
-static inline __refcount_check
-bool refcount_dec_and_test(refcount_t *r)
-{
- return refcount_sub_and_test(1, r);
-}
+extern __must_check bool refcount_add_not_zero(unsigned int i, refcount_t *r);
+extern void refcount_add(unsigned int i, refcount_t *r);
-/*
- * Similar to atomic_dec(), it will WARN on underflow and fail to decrement
- * when saturated at UINT_MAX.
- *
- * Provides release memory ordering, such that prior loads and stores are done
- * before.
- */
-static inline
-void refcount_dec(refcount_t *r)
-{
- REFCOUNT_WARN(refcount_dec_and_test(r), "refcount_t: decrement hit 0; leaking memory.\n");
-}
-
-/*
- * No atomic_t counterpart, it attempts a 1 -> 0 transition and returns the
- * success thereof.
- *
- * Like all decrement operations, it provides release memory order and provides
- * a control dependency.
- *
- * It can be used like a try-delete operator; this explicit case is provided
- * and not cmpxchg in generic, because that would allow implementing unsafe
- * operations.
- */
-static inline __refcount_check
-bool refcount_dec_if_one(refcount_t *r)
-{
- return atomic_cmpxchg_release(&r->refs, 1, 0) == 1;
-}
-
-/*
- * No atomic_t counterpart, it decrements unless the value is 1, in which case
- * it will return false.
- *
- * Was often done like: atomic_add_unless(&var, -1, 1)
- */
-static inline __refcount_check
-bool refcount_dec_not_one(refcount_t *r)
-{
- unsigned int old, new, val = atomic_read(&r->refs);
+extern __must_check bool refcount_inc_not_zero(refcount_t *r);
+extern void refcount_inc(refcount_t *r);
- for (;;) {
- if (unlikely(val == UINT_MAX))
- return true;
+extern __must_check bool refcount_sub_and_test(unsigned int i, refcount_t *r);
+extern void refcount_sub(unsigned int i, refcount_t *r);
- if (val == 1)
- return false;
+extern __must_check bool refcount_dec_and_test(refcount_t *r);
+extern void refcount_dec(refcount_t *r);
- new = val - 1;
- if (new > val) {
- REFCOUNT_WARN(new > val, "refcount_t: underflow; use-after-free.\n");
- return true;
- }
-
- old = atomic_cmpxchg_release(&r->refs, val, new);
- if (old == val)
- break;
-
- val = old;
- }
-
- return true;
-}
-
-/*
- * Similar to atomic_dec_and_mutex_lock(), it will WARN on underflow and fail
- * to decrement when saturated at UINT_MAX.
- *
- * Provides release memory ordering, such that prior loads and stores are done
- * before, and provides a control dependency such that free() must come after.
- * See the comment on top.
- */
-static inline __refcount_check
-bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock)
-{
- if (refcount_dec_not_one(r))
- return false;
-
- mutex_lock(lock);
- if (!refcount_dec_and_test(r)) {
- mutex_unlock(lock);
- return false;
- }
-
- return true;
-}
-
-/*
- * Similar to atomic_dec_and_lock(), it will WARN on underflow and fail to
- * decrement when saturated at UINT_MAX.
- *
- * Provides release memory ordering, such that prior loads and stores are done
- * before, and provides a control dependency such that free() must come after.
- * See the comment on top.
- */
-static inline __refcount_check
-bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock)
-{
- if (refcount_dec_not_one(r))
- return false;
-
- spin_lock(lock);
- if (!refcount_dec_and_test(r)) {
- spin_unlock(lock);
- return false;
- }
-
- return true;
-}
+extern __must_check bool refcount_dec_if_one(refcount_t *r);
+extern __must_check bool refcount_dec_not_one(refcount_t *r);
+extern __must_check bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock);
+extern __must_check bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock);
#endif /* _LINUX_REFCOUNT_H */
kref_put(&h->ref, cd->cache_put);
}
-static inline int cache_is_expired(struct cache_detail *detail, struct cache_head *h)
+/*
+ * Report whether cache entry @h of cache @detail has expired.
+ *
+ * An entry without CACHE_VALID set in h->flags is never reported as
+ * expired. Otherwise the entry is expired when its expiry_time is earlier
+ * than seconds_since_boot(), or when detail->flush_time is not older than
+ * the entry's last_refresh.
+ */
+static inline bool cache_is_expired(struct cache_detail *detail, struct cache_head *h)
{
+ if (!test_bit(CACHE_VALID, &h->flags))
+ return false;
+
return (h->expiry_time < seconds_since_boot()) ||
(detail->flush_time >= h->last_refresh);
}
extern int sunrpc_cache_register_pipefs(struct dentry *parent, const char *,
umode_t, struct cache_detail *);
extern void sunrpc_cache_unregister_pipefs(struct cache_detail *);
+extern void sunrpc_cache_unhash(struct cache_detail *, struct cache_head *);
/* Must store cache_detail in seq_file->private if using next three functions */
extern void *cache_seq_start(struct seq_file *file, loff_t *pos);
} rm_body;
};
+/*
+ * XDR sizes, in quads
+ */
+enum {
+ rpcrdma_fixed_maxsz = 4,
+ rpcrdma_segment_maxsz = 4,
+ rpcrdma_readchunk_maxsz = 2 + rpcrdma_segment_maxsz,
+};
+
/*
* Smallest RPC/RDMA header: rm_xid through rm_type, then rm_nochunks
*/
struct svc_procedure * vs_proc; /* per-procedure info */
u32 vs_xdrsize; /* xdrsize needed for this version */
- unsigned int vs_hidden : 1, /* Don't register with portmapper.
- * Only used for nfsacl so far. */
- vs_rpcb_optnl:1;/* Don't care the result of register.
- * Only used for nfsv4. */
+ /* Don't register with rpcbind */
+ bool vs_hidden;
+
+ /* Don't care if the rpcbind registration fails */
+ bool vs_rpcb_optnl;
+
+ /* Need xprt with congestion control */
+ bool vs_need_cong_ctrl;
/* Override dispatch function (e.g. when caching replies).
* A return value of 0 means drop the request.
* completes.
*/
struct svc_rdma_op_ctxt {
- struct list_head free;
+ struct list_head list;
struct svc_rdma_op_ctxt *read_hdr;
struct svc_rdma_fastreg_mr *frmr;
int hdr_count;
struct ib_cqe cqe;
struct ib_cqe reg_cqe;
struct ib_cqe inv_cqe;
- struct list_head dto_q;
u32 byte_len;
u32 position;
struct svcxprt_rdma *xprt;
atomic_t sc_sq_avail; /* SQEs ready to be consumed */
unsigned int sc_sq_depth; /* Depth of SQ */
unsigned int sc_rq_depth; /* Depth of RQ */
- u32 sc_max_requests; /* Forward credits */
+ __be32 sc_fc_credits; /* Forward credits */
+ u32 sc_max_requests; /* Max requests */
u32 sc_max_bc_requests;/* Backward credits */
int sc_max_req_size; /* Size of each RQ WR buf */
wait_queue_head_t sc_send_wait; /* SQ exhaustion waitlist */
unsigned long sc_flags;
- struct list_head sc_dto_q; /* DTO tasklet I/O pending Q */
struct list_head sc_read_complete_q;
struct work_struct sc_work;
};
extern void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *, int);
extern void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *, int,
__be32, __be64, u32);
-extern void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *,
- struct rpcrdma_msg *,
- struct rpcrdma_msg *,
- enum rpcrdma_proc);
-extern int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *);
+extern unsigned int svc_rdma_xdr_get_reply_hdr_len(__be32 *rdma_resp);
/* svc_rdma_recvfrom.c */
extern int svc_rdma_recvfrom(struct svc_rqst *);
#define XPT_CACHE_AUTH 11 /* cache auth info */
#define XPT_LOCAL 12 /* connection from loopback interface */
#define XPT_KILL_TEMP 13 /* call xpo_kill_temp_xprt before closing */
+#define XPT_CONG_CTRL 14 /* has congestion control */
struct svc_serv *xpt_server; /* service for transport */
atomic_t xpt_reserved; /* space on outq that is rsvd */
#include <linux/types.h>
#include <linux/compiler.h>
-#include <linux/sysctl.h>
#include <linux/in.h>
#include <linux/in6.h>
#define _UAPI_XT_HASHLIMIT_H
#include <linux/types.h>
+#include <linux/limits.h>
#include <linux/if.h>
/* timings are in milliseconds. */
#define NFSEXP_ASYNC 0x0010
#define NFSEXP_GATHERED_WRITES 0x0020
#define NFSEXP_NOREADDIRPLUS 0x0040
-/* 80 100 currently unused */
+#define NFSEXP_SECURITY_LABEL 0x0080
+/* 0x100 currently unused */
#define NFSEXP_NOHIDE 0x0200
#define NFSEXP_NOSUBTREECHECK 0x0400
#define NFSEXP_NOAUTHNLM 0x0800 /* Don't authenticate NLM requests - just trust */
#define NFSEXP_PNFS 0x20000
/* All flags that we claim to support. (Note we don't support NOACL.) */
-#define NFSEXP_ALLFLAGS 0x3FE7F
+#define NFSEXP_ALLFLAGS 0x3FEFF
/* The flags that may vary depending on security flavor: */
#define NFSEXP_SECINFO_FLAGS (NFSEXP_READONLY | NFSEXP_ROOTSQUASH \
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
- int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret || !write)
return ret;
if (enabled) {
clone_ctx = unclone_ctx(ctx);
ctx_resched(cpuctx, ctx, event_type);
+ } else {
+ ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
}
perf_ctx_unlock(cpuctx, ctx);
* of swizzling perf_event::ctx.
*/
perf_remove_from_context(group_leader, 0);
+ put_ctx(gctx);
list_for_each_entry(sibling, &group_leader->sibling_list,
group_entry) {
perf_event__state_init(group_leader);
perf_install_in_context(ctx, group_leader, group_leader->cpu);
get_ctx(ctx);
-
- /*
- * Now that all events are installed in @ctx, nothing
- * references @gctx anymore, so drop the last reference we have
- * on it.
- */
- put_ctx(gctx);
}
/*
int ret = 0;
rq = task_rq_lock(p, &rf);
+ update_rq_clock(rq);
if (p->flags & PF_KTHREAD) {
/*
{
struct rq *rq = dead_rq;
struct task_struct *next, *stop = rq->stop;
- struct rq_flags rf, old_rf;
+ struct rq_flags rf;
int dest_cpu;
/*
* class method both need to have an up-to-date
* value of rq->clock[_task]
*/
+ rq_pin_lock(rq, &rf);
update_rq_clock(rq);
+ rq_unpin_lock(rq, &rf);
for (;;) {
/*
/*
* pick_next_task() assumes pinned rq->lock:
*/
- rq_pin_lock(rq, &rf);
+ rq_repin_lock(rq, &rf);
next = pick_next_task(rq, &fake_task, &rf);
BUG_ON(!next);
next->sched_class->put_prev_task(rq, next);
continue;
}
- /*
- * __migrate_task() may return with a different
- * rq->lock held and a new cookie in 'rf', but we need
- * to preserve rf::clock_update_flags for 'dead_rq'.
- */
- old_rf = rf;
-
/* Find suitable destination for @next, with force if needed. */
dest_cpu = select_fallback_rq(dead_rq->cpu, next);
raw_spin_unlock(&rq->lock);
rq = dead_rq;
raw_spin_lock(&rq->lock);
- rf = old_rf;
}
raw_spin_unlock(&next->pi_lock);
}
if (IS_ERR(tg))
return ERR_PTR(-ENOMEM);
- sched_online_group(tg, parent);
-
return &tg->css;
}
+/*
+ * Expose task group only after completing cgroup initialization.
+ *
+ * Looks up the task groups for @css and for css->parent, and calls
+ * sched_online_group() only when a parent task group exists. Always
+ * returns 0.
+ */
+static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
+{
+ struct task_group *tg = css_tg(css);
+ struct task_group *parent = css_tg(css->parent);
+
+ /* No parent task group: nothing to bring online here. */
+ if (parent)
+ sched_online_group(tg, parent);
+ return 0;
+}
+
static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
{
struct task_group *tg = css_tg(css);
struct cgroup_subsys cpu_cgrp_subsys = {
.css_alloc = cpu_cgroup_css_alloc,
+ .css_online = cpu_cgroup_css_online,
.css_released = cpu_cgroup_css_released,
.css_free = cpu_cgroup_css_free,
.fork = cpu_cgroup_fork,
bool
config PARMAN
- tristate
+ tristate "parman" if COMPILE_TEST
config PRIME_NUMBERS
tristate
source "lib/Kconfig.kasan"
-config DEBUG_REFCOUNT
- bool "Verbose refcount checks"
- help
- Say Y here if you want reference counters (refcount_t and kref) to
- generate WARNs on dubious usage. Without this refcount_t will still
- be a saturating counter and avoid Use-After-Free by turning it into
- a resource leak Denial-Of-Service.
-
- Use of this option will increase kernel text size but will alert the
- admin of potential abuse.
-
- If in doubt, say "N".
-
endmenu # "Memory Debugging"
config ARCH_HAS_KCOV
gcd.o lcm.o list_sort.o uuid.o flex_array.o iov_iter.o clz_ctz.o \
bsearch.o find_bit.o llist.o memweight.o kfifo.o \
percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o \
- once.o
+ once.o refcount.o
obj-y += string_helpers.o
obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
obj-y += hexdump.o
--- /dev/null
+/*
+ * Variant of atomic_t specialized for reference counts.
+ *
+ * The interface matches the atomic_t interface (to aid in porting) but only
+ * provides the few functions one should use for reference counting.
+ *
+ * It differs in that the counter saturates at UINT_MAX and will not move once
+ * there. This avoids wrapping the counter and causing 'spurious'
+ * use-after-free issues.
+ *
+ * Memory ordering rules are slightly relaxed wrt regular atomic_t functions
+ * and provide only what is strictly required for refcounts.
+ *
+ * The increments are fully relaxed; these will not provide ordering. The
+ * rationale is that whatever is used to obtain the object we're increasing the
+ * reference count on will provide the ordering. For locked data structures,
+ * it's the lock acquire, for RCU/lockless data structures it's the dependent
+ * load.
+ *
+ * Do note that inc_not_zero() provides a control dependency which will order
+ * future stores against the inc, this ensures we'll never modify the object
+ * if we did not in fact acquire a reference.
+ *
+ * The decrements will provide release order, such that all the prior loads and
+ * stores will be issued before, it also provides a control dependency, which
+ * will order us against the subsequent free().
+ *
+ * The control dependency is against the load of the cmpxchg (ll/sc) that
+ * succeeded. This means the stores aren't fully ordered, but this is fine
+ * because the 1->0 transition indicates no concurrency.
+ *
+ * Note that the allocator is responsible for ordering things between free()
+ * and alloc().
+ *
+ */
+
+#include <linux/refcount.h>
+#include <linux/bug.h>
+
+/*
+ * Add @i to the refcount @r unless it is currently 0.
+ *
+ * Returns false, leaving the counter untouched, when the counter is 0.
+ * Returns true otherwise. A counter already at UINT_MAX is left unchanged;
+ * a sum that would wrap is clamped to UINT_MAX, and a WARN is issued when
+ * the stored value is UINT_MAX.
+ *
+ * The update is a relaxed cmpxchg loop (atomic_cmpxchg_relaxed()).
+ */
+bool refcount_add_not_zero(unsigned int i, refcount_t *r)
+{
+ unsigned int old, new, val = atomic_read(&r->refs);
+
+ for (;;) {
+ if (!val)
+ return false;
+
+ /* Saturated: stay at UINT_MAX. */
+ if (unlikely(val == UINT_MAX))
+ return true;
+
+ new = val + i;
+ /* Unsigned wrap-around: clamp to the saturation value. */
+ if (new < val)
+ new = UINT_MAX;
+ old = atomic_cmpxchg_relaxed(&r->refs, val, new);
+ if (old == val)
+ break;
+
+ /* Lost the race; retry with the value that was observed. */
+ val = old;
+ }
+
+ WARN(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n");
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(refcount_add_not_zero);
+
+/*
+ * Add @i to the refcount @r via refcount_add_not_zero(), and WARN when that
+ * call reports that the counter was 0.
+ */
+void refcount_add(unsigned int i, refcount_t *r)
+{
+ WARN(!refcount_add_not_zero(i, r), "refcount_t: addition on 0; use-after-free.\n");
+}
+EXPORT_SYMBOL_GPL(refcount_add);
+
+/*
+ * Similar to atomic_inc_not_zero(), will saturate at UINT_MAX and WARN.
+ *
+ * Provides no memory ordering, it is assumed the caller has guaranteed the
+ * object memory to be stable (RCU, etc.). It does provide a control dependency
+ * and thereby orders future stores. See the comment on top.
+ *
+ * Returns false, leaving the counter untouched, when the counter is 0;
+ * returns true otherwise (including when the counter is already saturated
+ * and therefore left unchanged).
+ */
+bool refcount_inc_not_zero(refcount_t *r)
+{
+ unsigned int old, new, val = atomic_read(&r->refs);
+
+ for (;;) {
+ new = val + 1;
+
+ if (!val)
+ return false;
+
+ /* val + 1 wrapped to 0, i.e. val is UINT_MAX: stay saturated. */
+ if (unlikely(!new))
+ return true;
+
+ old = atomic_cmpxchg_relaxed(&r->refs, val, new);
+ if (old == val)
+ break;
+
+ /* Lost the race; retry with the value that was observed. */
+ val = old;
+ }
+
+ WARN(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n");
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(refcount_inc_not_zero);
+
+/*
+ * Similar to atomic_inc(), will saturate at UINT_MAX and WARN.
+ *
+ * Provides no memory ordering, it is assumed the caller already has a
+ * reference on the object, will WARN when this is not so.
+ *
+ * Implemented on top of refcount_inc_not_zero(); the WARN fires when that
+ * call reports that the counter was 0.
+ */
+void refcount_inc(refcount_t *r)
+{
+ WARN(!refcount_inc_not_zero(r), "refcount_t: increment on 0; use-after-free.\n");
+}
+EXPORT_SYMBOL_GPL(refcount_inc);
+
+/*
+ * Subtract @i from the refcount @r and report whether it reached 0.
+ *
+ * Returns true only when the stored result is 0. Returns false without
+ * modifying the counter when it is saturated at UINT_MAX, or when the
+ * subtraction would underflow (in which case a WARN is issued as well).
+ *
+ * The update is a cmpxchg loop using atomic_cmpxchg_release().
+ *
+ * NOTE(review): the header also declares refcount_sub(), but no definition
+ * of it appears alongside the functions here -- confirm whether one is
+ * intended.
+ */
+bool refcount_sub_and_test(unsigned int i, refcount_t *r)
+{
+ unsigned int old, new, val = atomic_read(&r->refs);
+
+ for (;;) {
+ /* Saturated counters never move again. */
+ if (unlikely(val == UINT_MAX))
+ return false;
+
+ new = val - i;
+ /* Unsigned wrap-around means @i exceeded the current count. */
+ if (new > val) {
+ WARN(new > val, "refcount_t: underflow; use-after-free.\n");
+ return false;
+ }
+
+ old = atomic_cmpxchg_release(&r->refs, val, new);
+ if (old == val)
+ break;
+
+ /* Lost the race; retry with the value that was observed. */
+ val = old;
+ }
+
+ return !new;
+}
+EXPORT_SYMBOL_GPL(refcount_sub_and_test);
+
+/*
+ * Similar to atomic_dec_and_test(), it will WARN on underflow and fail to
+ * decrement when saturated at UINT_MAX.
+ *
+ * Provides release memory ordering, such that prior loads and stores are done
+ * before, and provides a control dependency such that free() must come after.
+ * See the comment on top.
+ *
+ * Implemented as refcount_sub_and_test(1, r); returns that call's result.
+ */
+bool refcount_dec_and_test(refcount_t *r)
+{
+ return refcount_sub_and_test(1, r);
+}
+EXPORT_SYMBOL_GPL(refcount_dec_and_test);
+
+/*
+ * Similar to atomic_dec(), it will WARN on underflow and fail to decrement
+ * when saturated at UINT_MAX.
+ *
+ * Provides release memory ordering, such that prior loads and stores are done
+ * before.
+ *
+ * Implemented on top of refcount_dec_and_test(); a WARN is issued when that
+ * call reports that the counter reached 0.
+ */
+
+void refcount_dec(refcount_t *r)
+{
+ WARN(refcount_dec_and_test(r), "refcount_t: decrement hit 0; leaking memory.\n");
+}
+EXPORT_SYMBOL_GPL(refcount_dec);
+
+/*
+ * No atomic_t counterpart, it attempts a 1 -> 0 transition and returns the
+ * success thereof.
+ *
+ * Like all decrement operations, it provides release memory order and provides
+ * a control dependency.
+ *
+ * It can be used like a try-delete operator; this explicit case is provided
+ * and not cmpxchg in generic, because that would allow implementing unsafe
+ * operations.
+ *
+ * Returns true when the counter was 1 and has been set to 0; returns false,
+ * leaving the counter untouched, for any other value.
+ */
+bool refcount_dec_if_one(refcount_t *r)
+{
+ return atomic_cmpxchg_release(&r->refs, 1, 0) == 1;
+}
+EXPORT_SYMBOL_GPL(refcount_dec_if_one);
+
+/*
+ * No atomic_t counterpart, it decrements unless the value is 1, in which case
+ * it will return false.
+ *
+ * Was often done like: atomic_add_unless(&var, -1, 1)
+ *
+ * Returns false only when the counter is 1 (nothing is changed). Returns
+ * true in every other case: after a successful decrement, when the counter
+ * is saturated at UINT_MAX (unchanged), and when the decrement would
+ * underflow (unchanged, with a WARN).
+ */
+bool refcount_dec_not_one(refcount_t *r)
+{
+ unsigned int old, new, val = atomic_read(&r->refs);
+
+ for (;;) {
+ /* Saturated counters never move again. */
+ if (unlikely(val == UINT_MAX))
+ return true;
+
+ if (val == 1)
+ return false;
+
+ new = val - 1;
+ /* Unsigned wrap-around: the counter was already 0. */
+ if (new > val) {
+ WARN(new > val, "refcount_t: underflow; use-after-free.\n");
+ return true;
+ }
+
+ old = atomic_cmpxchg_release(&r->refs, val, new);
+ if (old == val)
+ break;
+
+ /* Lost the race; retry with the value that was observed. */
+ val = old;
+ }
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(refcount_dec_not_one);
+
+/*
+ * Similar to atomic_dec_and_mutex_lock(), it will WARN on underflow and fail
+ * to decrement when saturated at UINT_MAX.
+ *
+ * Provides release memory ordering, such that prior loads and stores are done
+ * before, and provides a control dependency such that free() must come after.
+ * See the comment on top.
+ *
+ * Returns true with @lock held when the counter was decremented to 0.
+ * Returns false with @lock not held otherwise.
+ */
+bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock)
+{
+ /* Fast path: the counter was not 1, so the lock is not needed. */
+ if (refcount_dec_not_one(r))
+ return false;
+
+ mutex_lock(lock);
+ /* Re-check under the lock; the counter may have changed meanwhile. */
+ if (!refcount_dec_and_test(r)) {
+ mutex_unlock(lock);
+ return false;
+ }
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(refcount_dec_and_mutex_lock);
+
+/*
+ * Similar to atomic_dec_and_lock(), it will WARN on underflow and fail to
+ * decrement when saturated at UINT_MAX.
+ *
+ * Provides release memory ordering, such that prior loads and stores are done
+ * before, and provides a control dependency such that free() must come after.
+ * See the comment on top.
+ *
+ * Returns true with @lock held when the counter was decremented to 0.
+ * Returns false with @lock not held otherwise.
+ */
+bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock)
+{
+ /* Fast path: the counter was not 1, so the lock is not needed. */
+ if (refcount_dec_not_one(r))
+ return false;
+
+ spin_lock(lock);
+ /* Re-check under the lock; the counter may have changed meanwhile. */
+ if (!refcount_dec_and_test(r)) {
+ spin_unlock(lock);
+ return false;
+ }
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(refcount_dec_and_lock);
+
if (tbl->nest)
nested_bucket_table_free(tbl);
- if (tbl)
- kvfree(tbl->locks);
-
+ kvfree(tbl->locks);
kvfree(tbl);
}
union nested_table *ntbl;
ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]);
- ntbl = rht_dereference_bucket(ntbl[index].table, tbl, hash);
+ ntbl = rht_dereference_bucket_rcu(ntbl[index].table, tbl, hash);
subhash >>= tbl->nest;
while (ntbl && size > (1 << shift)) {
index = subhash & ((1 << shift) - 1);
- ntbl = rht_dereference_bucket(ntbl[index].table, tbl, hash);
+ ntbl = rht_dereference_bucket_rcu(ntbl[index].table,
+ tbl, hash);
size >>= shift;
subhash >>= shift;
}
last_priority = item->prio->priority;
if (item->parman_item.index != i) {
- pr_err("Item has different index in compare to where it actualy is (%lu != %d)\n",
+ pr_err("Item has different index in compare to where it actually is (%lu != %d)\n",
item->parman_item.index, i);
return -EINVAL;
}
dout("%s lock_name %s type %d cookie %s tag %s desc %s flags 0x%x\n",
__func__, lock_name, type, cookie, tag, desc, flags);
ret = ceph_osdc_call(osdc, oid, oloc, "lock", "lock",
- CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
- lock_op_page, lock_op_buf_size, NULL, NULL);
+ CEPH_OSD_FLAG_WRITE, lock_op_page,
+ lock_op_buf_size, NULL, NULL);
dout("%s: status %d\n", __func__, ret);
__free_page(lock_op_page);
dout("%s lock_name %s cookie %s\n", __func__, lock_name, cookie);
ret = ceph_osdc_call(osdc, oid, oloc, "lock", "unlock",
- CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
- unlock_op_page, unlock_op_buf_size, NULL, NULL);
+ CEPH_OSD_FLAG_WRITE, unlock_op_page,
+ unlock_op_buf_size, NULL, NULL);
dout("%s: status %d\n", __func__, ret);
__free_page(unlock_op_page);
dout("%s lock_name %s cookie %s locker %s%llu\n", __func__, lock_name,
cookie, ENTITY_NAME(*locker));
ret = ceph_osdc_call(osdc, oid, oloc, "lock", "break_lock",
- CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
- break_op_page, break_op_buf_size, NULL, NULL);
+ CEPH_OSD_FLAG_WRITE, break_op_page,
+ break_op_buf_size, NULL, NULL);
dout("%s: status %d\n", __func__, ret);
__free_page(break_op_page);
int get_info_op_buf_size;
int name_len = strlen(lock_name);
struct page *get_info_op_page, *reply_page;
- size_t reply_len;
+ size_t reply_len = PAGE_SIZE;
void *p, *end;
int ret;
void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
{
- kfree(b->h.perm);
kfree(b->h.items);
kfree(b);
}
{
kfree(b->item_weights);
kfree(b->sum_weights);
- kfree(b->h.perm);
kfree(b->h.items);
kfree(b);
}
void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
{
- kfree(b->h.perm);
kfree(b->h.items);
kfree(b->node_weights);
kfree(b);
{
kfree(b->straws);
kfree(b->item_weights);
- kfree(b->h.perm);
kfree(b->h.items);
kfree(b);
}
void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b)
{
kfree(b->item_weights);
- kfree(b->h.perm);
kfree(b->h.items);
kfree(b);
}
return -1;
}
-
/*
* bucket choose methods
*
* Since this is expensive, we optimize for the r=0 case, which
* captures the vast majority of calls.
*/
-static int bucket_perm_choose(struct crush_bucket *bucket,
+static int bucket_perm_choose(const struct crush_bucket *bucket,
+ struct crush_work_bucket *work,
int x, int r)
{
unsigned int pr = r % bucket->size;
unsigned int i, s;
/* start a new permutation if @x has changed */
- if (bucket->perm_x != (__u32)x || bucket->perm_n == 0) {
+ if (work->perm_x != (__u32)x || work->perm_n == 0) {
dprintk("bucket %d new x=%d\n", bucket->id, x);
- bucket->perm_x = x;
+ work->perm_x = x;
/* optimize common r=0 case */
if (pr == 0) {
s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
bucket->size;
- bucket->perm[0] = s;
- bucket->perm_n = 0xffff; /* magic value, see below */
+ work->perm[0] = s;
+ work->perm_n = 0xffff; /* magic value, see below */
goto out;
}
for (i = 0; i < bucket->size; i++)
- bucket->perm[i] = i;
- bucket->perm_n = 0;
- } else if (bucket->perm_n == 0xffff) {
+ work->perm[i] = i;
+ work->perm_n = 0;
+ } else if (work->perm_n == 0xffff) {
/* clean up after the r=0 case above */
for (i = 1; i < bucket->size; i++)
- bucket->perm[i] = i;
- bucket->perm[bucket->perm[0]] = 0;
- bucket->perm_n = 1;
+ work->perm[i] = i;
+ work->perm[work->perm[0]] = 0;
+ work->perm_n = 1;
}
/* calculate permutation up to pr */
- for (i = 0; i < bucket->perm_n; i++)
- dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
- while (bucket->perm_n <= pr) {
- unsigned int p = bucket->perm_n;
+ for (i = 0; i < work->perm_n; i++)
+ dprintk(" perm_choose have %d: %d\n", i, work->perm[i]);
+ while (work->perm_n <= pr) {
+ unsigned int p = work->perm_n;
/* no point in swapping the final entry */
if (p < bucket->size - 1) {
i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
(bucket->size - p);
if (i) {
- unsigned int t = bucket->perm[p + i];
- bucket->perm[p + i] = bucket->perm[p];
- bucket->perm[p] = t;
+ unsigned int t = work->perm[p + i];
+ work->perm[p + i] = work->perm[p];
+ work->perm[p] = t;
}
dprintk(" perm_choose swap %d with %d\n", p, p+i);
}
- bucket->perm_n++;
+ work->perm_n++;
}
for (i = 0; i < bucket->size; i++)
- dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]);
+ dprintk(" perm_choose %d: %d\n", i, work->perm[i]);
- s = bucket->perm[pr];
+ s = work->perm[pr];
out:
dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
bucket->size, x, r, pr, s);
}
/* uniform */
-static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
- int x, int r)
+static int bucket_uniform_choose(const struct crush_bucket_uniform *bucket,
+ struct crush_work_bucket *work, int x, int r)
{
- return bucket_perm_choose(&bucket->h, x, r);
+ return bucket_perm_choose(&bucket->h, work, x, r);
}
/* list */
-static int bucket_list_choose(struct crush_bucket_list *bucket,
+static int bucket_list_choose(const struct crush_bucket_list *bucket,
int x, int r)
{
int i;
w *= bucket->sum_weights[i];
w = w >> 16;
/*dprintk(" scaled %llx\n", w);*/
- if (w < bucket->item_weights[i])
+ if (w < bucket->item_weights[i]) {
return bucket->h.items[i];
+ }
}
dprintk("bad list sums for bucket %d\n", bucket->h.id);
return x & 1;
}
-static int bucket_tree_choose(struct crush_bucket_tree *bucket,
+static int bucket_tree_choose(const struct crush_bucket_tree *bucket,
int x, int r)
{
int n;
/* straw */
-static int bucket_straw_choose(struct crush_bucket_straw *bucket,
+static int bucket_straw_choose(const struct crush_bucket_straw *bucket,
int x, int r)
{
__u32 i;
*
*/
-static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket,
+static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket,
int x, int r)
{
unsigned int i, high = 0;
high_draw = draw;
}
}
+
return bucket->h.items[high];
}
-static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
+static int crush_bucket_choose(const struct crush_bucket *in,
+ struct crush_work_bucket *work,
+ int x, int r)
{
dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
BUG_ON(in->size == 0);
switch (in->alg) {
case CRUSH_BUCKET_UNIFORM:
- return bucket_uniform_choose((struct crush_bucket_uniform *)in,
- x, r);
+ return bucket_uniform_choose(
+ (const struct crush_bucket_uniform *)in,
+ work, x, r);
case CRUSH_BUCKET_LIST:
- return bucket_list_choose((struct crush_bucket_list *)in,
+ return bucket_list_choose((const struct crush_bucket_list *)in,
x, r);
case CRUSH_BUCKET_TREE:
- return bucket_tree_choose((struct crush_bucket_tree *)in,
+ return bucket_tree_choose((const struct crush_bucket_tree *)in,
x, r);
case CRUSH_BUCKET_STRAW:
- return bucket_straw_choose((struct crush_bucket_straw *)in,
- x, r);
+ return bucket_straw_choose(
+ (const struct crush_bucket_straw *)in,
+ x, r);
case CRUSH_BUCKET_STRAW2:
- return bucket_straw2_choose((struct crush_bucket_straw2 *)in,
- x, r);
+ return bucket_straw2_choose(
+ (const struct crush_bucket_straw2 *)in,
+ x, r);
default:
dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
return in->items[0];
}
}
-
/*
* true if device is marked "out" (failed, fully offloaded)
* of the cluster
* @parent_r: r value passed from the parent
*/
static int crush_choose_firstn(const struct crush_map *map,
- struct crush_bucket *bucket,
+ struct crush_work *work,
+ const struct crush_bucket *bucket,
const __u32 *weight, int weight_max,
int x, int numrep, int type,
int *out, int outpos,
int rep;
unsigned int ftotal, flocal;
int retry_descent, retry_bucket, skip_rep;
- struct crush_bucket *in = bucket;
+ const struct crush_bucket *in = bucket;
int r;
int i;
int item = 0;
if (local_fallback_retries > 0 &&
flocal >= (in->size>>1) &&
flocal > local_fallback_retries)
- item = bucket_perm_choose(in, x, r);
+ item = bucket_perm_choose(
+ in, work->work[-1-in->id],
+ x, r);
else
- item = crush_bucket_choose(in, x, r);
+ item = crush_bucket_choose(
+ in, work->work[-1-in->id],
+ x, r);
if (item >= map->max_devices) {
dprintk(" bad item %d\n", item);
skip_rep = 1;
sub_r = r >> (vary_r-1);
else
sub_r = 0;
- if (crush_choose_firstn(map,
- map->buckets[-1-item],
- weight, weight_max,
- x, stable ? 1 : outpos+1, 0,
- out2, outpos, count,
- recurse_tries, 0,
- local_retries,
- local_fallback_retries,
- 0,
- vary_r,
- stable,
- NULL,
- sub_r) <= outpos)
+ if (crush_choose_firstn(
+ map,
+ work,
+ map->buckets[-1-item],
+ weight, weight_max,
+ x, stable ? 1 : outpos+1, 0,
+ out2, outpos, count,
+ recurse_tries, 0,
+ local_retries,
+ local_fallback_retries,
+ 0,
+ vary_r,
+ stable,
+ NULL,
+ sub_r) <= outpos)
/* didn't get leaf */
reject = 1;
} else {
}
}
- if (!reject) {
+ if (!reject && !collide) {
/* out? */
if (itemtype == 0)
reject = is_out(map, weight,
weight_max,
item, x);
- else
- reject = 0;
}
reject:
*
*/
static void crush_choose_indep(const struct crush_map *map,
- struct crush_bucket *bucket,
+ struct crush_work *work,
+ const struct crush_bucket *bucket,
const __u32 *weight, int weight_max,
int x, int left, int numrep, int type,
int *out, int outpos,
int *out2,
int parent_r)
{
- struct crush_bucket *in = bucket;
+ const struct crush_bucket *in = bucket;
int endpos = outpos + left;
int rep;
unsigned int ftotal;
break;
}
- item = crush_bucket_choose(in, x, r);
+ item = crush_bucket_choose(
+ in, work->work[-1-in->id],
+ x, r);
if (item >= map->max_devices) {
dprintk(" bad item %d\n", item);
out[rep] = CRUSH_ITEM_NONE;
if (recurse_to_leaf) {
if (item < 0) {
- crush_choose_indep(map,
- map->buckets[-1-item],
- weight, weight_max,
- x, 1, numrep, 0,
- out2, rep,
- recurse_tries, 0,
- 0, NULL, r);
+ crush_choose_indep(
+ map,
+ work,
+ map->buckets[-1-item],
+ weight, weight_max,
+ x, 1, numrep, 0,
+ out2, rep,
+ recurse_tries, 0,
+ 0, NULL, r);
if (out2[rep] == CRUSH_ITEM_NONE) {
/* placed nothing; no leaf */
break;
#endif
}
+
+/*
+ * crush_init_workspace - lay out a CRUSH working area in caller memory
+ * @map: crush map the working area is built for
+ * @v: start of the memory to initialize
+ *
+ * This takes a chunk of memory and sets it up to be a shiny new
+ * working area for a CRUSH placement computation.  It must be called
+ * on any newly allocated memory before passing it in to
+ * crush_do_rule.  It may be used repeatedly after that, so long as the
+ * map has not changed.  If the map /has/ changed, you must make sure
+ * the memory is at least as large as the new map's working size and
+ * re-run crush_init_workspace.
+ *
+ * Exactly map->working_size bytes starting at @v are laid out; the
+ * function BUG()s if the layout does not add up to that size.
+ *
+ * If you do retain the working space between calls to crush, make it
+ * thread-local.
+ */
+void crush_init_workspace(const struct crush_map *map, void *v)
+{
+	struct crush_work *w = v;
+	__s32 b;
+
+	/*
+	 * We work by moving through the available space and setting
+	 * values and pointers as we go.
+	 *
+	 * It's a bit like Forth's use of the 'allot' word since we
+	 * set the pointer first and then reserve the space for it to
+	 * point to by incrementing the pointer.
+	 */
+
+	/*
+	 * Step over the header.  This must be the size of the
+	 * structure itself, not of a pointer to it, so that the layout
+	 * matches what crush_finalize() accounted for in
+	 * ->working_size.
+	 */
+	v += sizeof(*w);
+	w->work = v;
+	v += map->max_buckets * sizeof(struct crush_work_bucket *);
+	for (b = 0; b < map->max_buckets; ++b) {
+		/* Empty bucket slots get no workspace; w->work[b] is left unset. */
+		if (!map->buckets[b])
+			continue;
+
+		w->work[b] = v;
+		switch (map->buckets[b]->alg) {
+		default:
+			v += sizeof(struct crush_work_bucket);
+			break;
+		}
+		/* perm_n == 0 makes bucket_perm_choose() start a new permutation. */
+		w->work[b]->perm_x = 0;
+		w->work[b]->perm_n = 0;
+		/* The permutation array sits right behind its bucket header. */
+		w->work[b]->perm = v;
+		v += map->buckets[b]->size * sizeof(__u32);
+	}
+	/* The layout must consume exactly the size computed for this map. */
+	BUG_ON(v - (void *)w != map->working_size);
+}
+
/**
* crush_do_rule - calculate a mapping with the given input and rule
* @map: the crush_map
* @result_max: maximum result size
* @weight: weight vector (for map leaves)
* @weight_max: size of weight vector
- * @scratch: scratch vector for private use; must be >= 3 * result_max
+ * @cwin: pointer to at least crush_work_size() bytes of memory
*/
int crush_do_rule(const struct crush_map *map,
int ruleno, int x, int *result, int result_max,
const __u32 *weight, int weight_max,
- int *scratch)
+ void *cwin)
{
int result_len;
- int *a = scratch;
- int *b = scratch + result_max;
- int *c = scratch + result_max*2;
+ struct crush_work *cw = cwin;
+ int *a = cwin + map->working_size;
+ int *b = a + result_max;
+ int *c = b + result_max;
+ int *w = a;
+ int *o = b;
int recurse_to_leaf;
- int *w;
int wsize = 0;
- int *o;
int osize;
int *tmp;
- struct crush_rule *rule;
+ const struct crush_rule *rule;
__u32 step;
int i, j;
int numrep;
rule = map->rules[ruleno];
result_len = 0;
- w = a;
- o = b;
for (step = 0; step < rule->len; step++) {
int firstn = 0;
- struct crush_rule_step *curstep = &rule->steps[step];
+ const struct crush_rule_step *curstep = &rule->steps[step];
switch (curstep->op) {
case CRUSH_RULE_TAKE:
recurse_tries = choose_tries;
osize += crush_choose_firstn(
map,
+ cw,
map->buckets[bno],
weight, weight_max,
x, numrep,
numrep : (result_max-osize));
crush_choose_indep(
map,
+ cw,
map->buckets[bno],
weight, weight_max,
x, out_size, numrep,
break;
}
}
+
return result_len;
}
#include <linux/err.h>
#include <linux/scatterlist.h>
+#include <linux/sched.h>
#include <linux/slab.h>
#include <crypto/aes.h>
#include <crypto/skcipher.h>
kref_init(&req->r_kref);
init_completion(&req->r_completion);
- init_completion(&req->r_done_completion);
RB_CLEAR_NODE(&req->r_node);
RB_CLEAR_NODE(&req->r_mc_node);
INIT_LIST_HEAD(&req->r_unsafe_item);
BUG_ON(length > previous);
op->extent.length = length;
- op->indata_len -= previous - length;
+ if (op->op == CEPH_OSD_OP_WRITE || op->op == CEPH_OSD_OP_WRITEFULL)
+ op->indata_len -= previous - length;
}
EXPORT_SYMBOL(osd_req_op_extent_update);
bool need_send = false;
bool promoted = false;
- WARN_ON(req->r_tid || req->r_got_reply);
+ WARN_ON(req->r_tid);
dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
again:
static void account_request(struct ceph_osd_request *req)
{
- unsigned int mask = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
+ WARN_ON(req->r_flags & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK));
+ WARN_ON(!(req->r_flags & (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE)));
- if (req->r_flags & CEPH_OSD_FLAG_READ) {
- WARN_ON(req->r_flags & mask);
- req->r_flags |= CEPH_OSD_FLAG_ACK;
- } else if (req->r_flags & CEPH_OSD_FLAG_WRITE)
- WARN_ON(!(req->r_flags & mask));
- else
- WARN_ON(1);
-
- WARN_ON(req->r_unsafe_callback && (req->r_flags & mask) != mask);
+ req->r_flags |= CEPH_OSD_FLAG_ONDISK;
atomic_inc(&req->r_osdc->num_requests);
}
static void __complete_request(struct ceph_osd_request *req)
{
- if (req->r_callback)
+ if (req->r_callback) {
+ dout("%s req %p tid %llu cb %pf result %d\n", __func__, req,
+ req->r_tid, req->r_callback, req->r_result);
req->r_callback(req);
- else
- complete_all(&req->r_completion);
+ }
}
/*
- * Note that this is open-coded in handle_reply(), which has to deal
- * with ack vs commit, dup acks, etc.
+ * This is open-coded in handle_reply().
*/
static void complete_request(struct ceph_osd_request *req, int err)
{
req->r_result = err;
finish_request(req);
__complete_request(req);
- complete_all(&req->r_done_completion);
+ complete_all(&req->r_completion);
ceph_osdc_put_request(req);
}
cancel_map_check(req);
finish_request(req);
- complete_all(&req->r_done_completion);
+ complete_all(&req->r_completion);
ceph_osdc_put_request(req);
}
mutex_lock(&lreq->lock);
dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq,
lreq->linger_id, req->r_result);
- WARN_ON(!__linger_registered(lreq));
linger_reg_commit_complete(lreq, req->r_result);
lreq->committed = true;
}
/*
- * We are done with @req if
- * - @m is a safe reply, or
- * - @m is an unsafe reply and we didn't want a safe one
- */
-static bool done_request(const struct ceph_osd_request *req,
- const struct MOSDOpReply *m)
-{
- return (m->result < 0 ||
- (m->flags & CEPH_OSD_FLAG_ONDISK) ||
- !(req->r_flags & CEPH_OSD_FLAG_ONDISK));
-}
-
-/*
- * handle osd op reply. either call the callback if it is specified,
- * or do the completion to wake up the waiting thread.
- *
- * ->r_unsafe_callback is set? yes no
- *
- * first reply is OK (needed r_cb/r_completion, r_cb/r_completion,
- * any or needed/got safe) r_done_completion r_done_completion
- *
- * first reply is unsafe r_unsafe_cb(true) (nothing)
- *
- * when we get the safe reply r_unsafe_cb(false), r_cb/r_completion,
- * r_done_completion r_done_completion
+ * Handle MOSDOpReply. Set ->r_result and call the callback if it is
+ * specified.
*/
static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
{
struct MOSDOpReply m;
u64 tid = le64_to_cpu(msg->hdr.tid);
u32 data_len = 0;
- bool already_acked;
int ret;
int i;
le32_to_cpu(msg->hdr.data_len), req->r_tid);
goto fail_request;
}
- dout("%s req %p tid %llu acked %d result %d data_len %u\n", __func__,
- req, req->r_tid, req->r_got_reply, m.result, data_len);
-
- already_acked = req->r_got_reply;
- if (!already_acked) {
- req->r_result = m.result ?: data_len;
- req->r_replay_version = m.replay_version; /* struct */
- req->r_got_reply = true;
- } else if (!(m.flags & CEPH_OSD_FLAG_ONDISK)) {
- dout("req %p tid %llu dup ack\n", req, req->r_tid);
- goto out_unlock_session;
- }
-
- if (done_request(req, &m)) {
- finish_request(req);
- if (req->r_linger) {
- WARN_ON(req->r_unsafe_callback);
- dout("req %p tid %llu cb (locked)\n", req, req->r_tid);
- __complete_request(req);
- }
- }
+ dout("%s req %p tid %llu result %d data_len %u\n", __func__,
+ req, req->r_tid, m.result, data_len);
+ /*
+ * Since we only ever request ONDISK, we should only ever get
+ * one (type of) reply back.
+ */
+ WARN_ON(!(m.flags & CEPH_OSD_FLAG_ONDISK));
+ req->r_result = m.result ?: data_len;
+ finish_request(req);
mutex_unlock(&osd->lock);
up_read(&osdc->lock);
- if (done_request(req, &m)) {
- if (already_acked && req->r_unsafe_callback) {
- dout("req %p tid %llu safe-cb\n", req, req->r_tid);
- req->r_unsafe_callback(req, false);
- } else if (!req->r_linger) {
- dout("req %p tid %llu cb\n", req, req->r_tid);
- __complete_request(req);
- }
- complete_all(&req->r_done_completion);
- ceph_osdc_put_request(req);
- } else {
- if (req->r_unsafe_callback) {
- dout("req %p tid %llu unsafe-cb\n", req, req->r_tid);
- req->r_unsafe_callback(req, true);
- } else {
- WARN_ON(1);
- }
- }
-
+ __complete_request(req);
+ complete_all(&req->r_completion);
+ ceph_osdc_put_request(req);
return;
fail_request:
up_read(&osdc->lock);
dout("%s waiting on req %p tid %llu last_tid %llu\n",
__func__, req, req->r_tid, last_tid);
- wait_for_completion(&req->r_done_completion);
+ wait_for_completion(&req->r_completion);
ceph_osdc_put_request(req);
goto again;
}
ceph_oid_copy(&lreq->t.base_oid, oid);
ceph_oloc_copy(&lreq->t.base_oloc, oloc);
- lreq->t.flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
+ lreq->t.flags = CEPH_OSD_FLAG_WRITE;
lreq->mtime = CURRENT_TIME;
lreq->reg_req = alloc_linger_request(lreq);
ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
- req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
+ req->r_flags = CEPH_OSD_FLAG_WRITE;
req->r_mtime = CURRENT_TIME;
osd_req_op_watch_init(req, 0, lreq->linger_id,
CEPH_OSD_WATCH_OP_UNWATCH);
* Execute an OSD class method on an object.
*
* @flags: CEPH_OSD_FLAG_*
- * @resp_len: out param for reply length
+ * @resp_len: in/out param for reply length
*/
int ceph_osdc_call(struct ceph_osd_client *osdc,
struct ceph_object_id *oid,
struct ceph_osd_request *req;
int ret;
+ if (req_len > PAGE_SIZE || (resp_page && *resp_len > PAGE_SIZE))
+ return -E2BIG;
+
req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
if (!req)
return -ENOMEM;
0, false, false);
if (resp_page)
osd_req_op_cls_response_data_pages(req, 0, &resp_page,
- PAGE_SIZE, 0, false, false);
+ *resp_len, 0, false, false);
ceph_osdc_start_request(osdc, req, false);
ret = ceph_osdc_wait_request(osdc, req);
int page_align = off & ~PAGE_MASK;
req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
- CEPH_OSD_OP_WRITE,
- CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
+ CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
snapc, truncate_seq, truncate_size,
true);
if (IS_ERR(req))
return -EINVAL;
}
+static void crush_finalize(struct crush_map *c)
+{
+ __s32 b;
+
+ /*
+ * Compute c->working_size: the number of bytes of working area
+ * that crush_init_workspace() lays out for this map (it BUG()s
+ * if its own layout disagrees with the value stored here).
+ *
+ * Start with the crush_work header plus space for the array of
+ * pointers to per-bucket workspace.
+ */
+ c->working_size = sizeof(struct crush_work) +
+ c->max_buckets * sizeof(struct crush_work_bucket *);
+
+ for (b = 0; b < c->max_buckets; b++) {
+ if (!c->buckets[b]) /* empty slot: contributes nothing */
+ continue;
+
+ switch (c->buckets[b]->alg) {
+ default:
+ /*
+ * The base case, permutation variables and
+ * the pointer to the permutation array.
+ * Every bucket algorithm currently takes
+ * this branch.
+ */
+ c->working_size += sizeof(struct crush_work_bucket);
+ break;
+ }
+ /* Every bucket has a permutation array: one __u32 per item. */
+ c->working_size += c->buckets[b]->size * sizeof(__u32);
+ }
+}
+
static struct crush_map *crush_decode(void *pbyval, void *end)
{
struct crush_map *c;
b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
if (b->items == NULL)
goto badmem;
- b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
- if (b->perm == NULL)
- goto badmem;
- b->perm_n = 0;
ceph_decode_need(p, end, b->size*sizeof(u32), bad);
for (j = 0; j < b->size; j++)
dout("crush decode tunable chooseleaf_stable = %d\n",
c->chooseleaf_stable);
+ crush_finalize(c);
+
done:
dout("crush_decode success\n");
return c;
map->pool_max = -1;
map->pg_temp = RB_ROOT;
map->primary_temp = RB_ROOT;
- mutex_init(&map->crush_scratch_mutex);
+ mutex_init(&map->crush_workspace_mutex);
return map;
}
kfree(map->osd_weight);
kfree(map->osd_addr);
kfree(map->osd_primary_affinity);
+ kfree(map->crush_workspace);
kfree(map);
}
return 0;
}
+static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush)
+{
+ void *workspace;
+ size_t work_size;
+ /*
+ * Install @crush (a crush map or an ERR_PTR, as handed over by
+ * the callers straight from crush_decode()) into @map together
+ * with a freshly initialized workspace.
+ *
+ * Returns 0 on success, the error encoded in @crush if it is an
+ * ERR_PTR, or -ENOMEM if the workspace cannot be allocated.
+ * @map is left untouched on either failure.
+ */
+ if (IS_ERR(crush))
+ return PTR_ERR(crush);
+
+ work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE);
+ dout("%s work_size %zu bytes\n", __func__, work_size);
+ workspace = kmalloc(work_size, GFP_NOIO);
+ if (!workspace) {
+ crush_destroy(crush); /* @crush is consumed on this failure path too */
+ return -ENOMEM;
+ }
+ crush_init_workspace(crush, workspace);
+
+ if (map->crush) /* drop the previous crush map, if there was one */
+ crush_destroy(map->crush);
+ kfree(map->crush_workspace); /* the old workspace belonged to the old crush map */
+ map->crush = crush;
+ map->crush_workspace = workspace;
+ return 0;
+}
+
#define OSDMAP_WRAPPER_COMPAT_VER 7
#define OSDMAP_CLIENT_DATA_COMPAT_VER 1
/* crush */
ceph_decode_32_safe(p, end, len, e_inval);
- map->crush = crush_decode(*p, min(*p + len, end));
- if (IS_ERR(map->crush)) {
- err = PTR_ERR(map->crush);
- map->crush = NULL;
+ err = osdmap_set_crush(map, crush_decode(*p, min(*p + len, end)));
+ if (err)
goto bad;
- }
- *p += len;
/* ignore the rest */
*p = end;
struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
struct ceph_osdmap *map)
{
- struct crush_map *newcrush = NULL;
struct ceph_fsid fsid;
u32 epoch = 0;
struct ceph_timespec modified;
/* new crush? */
ceph_decode_32_safe(p, end, len, e_inval);
if (len > 0) {
- newcrush = crush_decode(*p, min(*p+len, end));
- if (IS_ERR(newcrush)) {
- err = PTR_ERR(newcrush);
- newcrush = NULL;
+ err = osdmap_set_crush(map,
+ crush_decode(*p, min(*p + len, end)));
+ if (err)
goto bad;
- }
*p += len;
}
map->epoch++;
map->modified = modified;
- if (newcrush) {
- if (map->crush)
- crush_destroy(map->crush);
- map->crush = newcrush;
- newcrush = NULL;
- }
/* new_pools */
err = decode_new_pools(p, end, map);
print_hex_dump(KERN_DEBUG, "osdmap: ",
DUMP_PREFIX_OFFSET, 16, 1,
start, end - start, true);
- if (newcrush)
- crush_destroy(newcrush);
return ERR_PTR(err);
}
BUG_ON(result_max > CEPH_PG_MAX_SIZE);
- mutex_lock(&map->crush_scratch_mutex);
+ mutex_lock(&map->crush_workspace_mutex);
r = crush_do_rule(map->crush, ruleno, x, result, result_max,
- weight, weight_max, map->crush_scratch_ary);
- mutex_unlock(&map->crush_scratch_mutex);
+ weight, weight_max, map->crush_workspace);
+ mutex_unlock(&map->crush_workspace_mutex);
return r;
}
return;
}
- len = do_crush(osdmap, ruleno, pps, raw->osds,
- min_t(int, pi->size, ARRAY_SIZE(raw->osds)),
+ if (pi->size > ARRAY_SIZE(raw->osds)) {
+ pr_err_ratelimited("pool %lld ruleset %d type %d too wide: size %d > %zu\n",
+ pi->id, pi->crush_ruleset, pi->type, pi->size,
+ ARRAY_SIZE(raw->osds));
+ return;
+ }
+
+ len = do_crush(osdmap, ruleno, pps, raw->osds, pi->size,
osdmap->osd_weight, osdmap->max_osd);
if (len < 0) {
pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
* 02110-1301, USA.
*/
-#include <stddef.h>
-
#include <linux/types.h>
#include <linux/export.h>
#include <linux/ceph/libceph.h>
int ret, no_addr;
struct fib_result res;
struct flowi4 fl4;
- struct net *net;
+ struct net *net = dev_net(dev);
bool dev_match;
fl4.flowi4_oif = 0;
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
fl4.flowi4_tun_key.tun_id = 0;
fl4.flowi4_flags = 0;
+ fl4.flowi4_uid = sock_net_uid(net, NULL);
no_addr = idev->ifa_list == NULL;
trace_fib_validate_source(dev, &fl4);
- net = dev_net(dev);
if (fib_lookup(net, &fl4, &res, 0))
goto last_resort;
if (res.type != RTN_UNICAST &&
(res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev)))
goto e_inval;
- if (!rpf && !fib_num_tclassid_users(dev_net(dev)) &&
+ if (!rpf && !fib_num_tclassid_users(net) &&
(dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev)))
goto last_resort;
fib_combine_itag(itag, &res);
fl4.flowi4_flags = 0;
fl4.daddr = daddr;
fl4.saddr = saddr;
+ fl4.flowi4_uid = sock_net_uid(net, NULL);
err = fib_lookup(net, &fl4, &res, 0);
if (err != 0) {
if (!IN_DEV_FORWARD(in_dev))
{
int res;
+ tos &= IPTOS_RT_MASK;
rcu_read_lock();
/* Multicast recognition logic is moved from route cache to here.
u->link = p->link;
u->i_key = p->i_key;
u->o_key = p->o_key;
+ if (u->i_key)
+ u->i_flags |= GRE_KEY;
+ if (u->o_key)
+ u->o_flags |= GRE_KEY;
u->proto = p->proto;
memcpy(u->name, p->name, sizeof(u->name));
struct net *net = sock_net(sk);
struct mr6_table *mrt;
+ if (sk->sk_type != SOCK_RAW ||
+ inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
+ return -EOPNOTSUPP;
+
mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
if (!mrt)
return -ENOENT;
switch (optname) {
case MRT6_INIT:
- if (sk->sk_type != SOCK_RAW ||
- inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
- return -EOPNOTSUPP;
if (optlen < sizeof(int))
return -EINVAL;
struct net *net = sock_net(sk);
struct mr6_table *mrt;
+ if (sk->sk_type != SOCK_RAW ||
+ inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
+ return -EOPNOTSUPP;
+
mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
if (!mrt)
return -ENOENT;
drop:
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_INDISCARDS);
kfree_skb(skb);
- return -1;
+ return 0;
}
/* Userspace will call sendmsg() on the tunnel socket to send L2TP
struct net *net = nf_ct_exp_net(expect);
struct hlist_node *next;
unsigned int h;
- int ret = 1;
+ int ret = 0;
if (!master_help) {
ret = -ESHUTDOWN;
spin_lock_bh(&nf_conntrack_expect_lock);
ret = __nf_ct_expect_check(expect);
- if (ret <= 0)
+ if (ret < 0)
goto out;
nf_ct_expect_insert(expect);
spin_unlock_bh(&nf_conntrack_expect_lock);
nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report);
- return ret;
+ return 0;
out:
spin_unlock_bh(&nf_conntrack_expect_lock);
return ret;
if (!nft_ct_tmpl_alloc_pcpu())
return -ENOMEM;
nft_ct_pcpu_template_refcnt++;
+ len = sizeof(u16);
break;
#endif
default:
{
struct nft_bitmap *priv = nft_set_priv(set);
- priv->bitmap_size = nft_bitmap_total_size(set->klen);
+ priv->bitmap_size = nft_bitmap_size(set->klen);
return 0;
}
kfree(i_ipaddr);
}
- if (rds_ibdev->vector_load)
- kfree(rds_ibdev->vector_load);
+ kfree(rds_ibdev->vector_load);
kfree(rds_ibdev);
}
ret = register_netdevice_notifier(&rds_tcp_dev_notifier);
if (ret) {
pr_warn("could not register rds_tcp_dev_notifier\n");
- goto out;
+ goto out_slab;
}
ret = register_pernet_subsys(&rds_tcp_net_ops);
if (ret)
- goto out_slab;
+ goto out_notifier;
ret = rds_tcp_recv_init();
if (ret)
rds_tcp_recv_exit();
out_pernet:
unregister_pernet_subsys(&rds_tcp_net_ops);
-out_slab:
+out_notifier:
if (unregister_netdevice_notifier(&rds_tcp_dev_notifier))
pr_warn("could not unregister rds_tcp_dev_notifier\n");
+out_slab:
kmem_cache_destroy(rds_tcp_conn_slab);
out:
return ret;
switch (token->security_index) {
case RXRPC_SECURITY_RXKAD:
- toksize += 8 * 4; /* viceid, kvno, key*2, begin,
+ toksize += 9 * 4; /* viceid, kvno, key*2 + len, begin,
* end, primary, tktlen */
toksize += RND(token->kad->ticket_len);
break;
/* Barriers against rxrpc_input_data(). */
hard_ack = call->rx_hard_ack;
- top = smp_load_acquire(&call->rx_top);
- for (seq = hard_ack + 1; before_eq(seq, top); seq++) {
+ seq = hard_ack + 1;
+ while (top = smp_load_acquire(&call->rx_top),
+ before_eq(seq, top)
+ ) {
ix = seq & RXRPC_RXTX_BUFF_MASK;
skb = call->rxtx_buffer[ix];
if (!skb) {
ret = 1;
goto out;
}
+
+ seq++;
}
out:
goto err_mod;
}
- err = nla_memdup_cookie(a, tb);
- if (err < 0) {
+ if (nla_memdup_cookie(a, tb) < 0) {
+ err = -ENOMEM;
tcf_hash_release(a, bind);
goto err_mod;
}
goto out_module_put;
err = ops->walk(net, skb, &dcb, RTM_DELACTION, ops);
- if (err < 0)
+ if (err <= 0)
goto out_module_put;
- if (err == 0)
- goto noflush_out;
nla_nest_end(skb, nest);
out_module_put:
module_put(ops->owner);
err_out:
-noflush_out:
kfree_skb(skb);
return err;
}
sctp_scope_t scope, gfp_t gfp, int copy_flags)
{
struct sctp_sockaddr_entry *addr;
+ union sctp_addr laddr;
int error = 0;
rcu_read_lock();
!(copy_flags & SCTP_ADDR6_PEERSUPP)))
continue;
- if (sctp_bind_addr_state(bp, &addr->a) != -1)
+ laddr = addr->a;
+ /* also works for setting ipv6 address port */
+ laddr.v4.sin_port = htons(bp->port);
+ if (sctp_bind_addr_state(bp, &laddr) != -1)
continue;
error = sctp_add_bind_addr(bp, &addr->a, sizeof(addr->a),
if (!asoc)
return -EINVAL;
+ /* If there is a thread waiting on more sndbuf space for
+ * sending on this asoc, it cannot be peeled.
+ */
+ if (waitqueue_active(&asoc->wait))
+ return -EBUSY;
+
/* An association cannot be branched off from an already peeled-off
* socket, nor is this supported for tcp style sockets.
*/
*/
release_sock(sk);
current_timeo = schedule_timeout(current_timeo);
- if (sk != asoc->base.sk)
- goto do_error;
lock_sock(sk);
*timeo_p = current_timeo;
case RPC_GSS_PROC_DESTROY:
if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq))
goto auth_err;
- rsci->h.expiry_time = seconds_since_boot();
- set_bit(CACHE_NEGATIVE, &rsci->h.flags);
+ /* Delete the entry from the cache_list and call cache_put */
+ sunrpc_cache_unhash(sn->rsc_cache, &rsci->h);
if (resv->iov_len + 4 > PAGE_SIZE)
goto drop;
svc_putnl(resv, RPC_SUCCESS);
cache_purge(cd);
spin_lock(&cache_list_lock);
write_lock(&cd->hash_lock);
- if (cd->entries) {
- write_unlock(&cd->hash_lock);
- spin_unlock(&cache_list_lock);
- goto out;
- }
if (current_detail == cd)
current_detail = NULL;
list_del_init(&cd->others);
/* module must be being unloaded so its safe to kill the worker */
cancel_delayed_work_sync(&cache_cleaner);
}
- return;
-out:
- printk(KERN_ERR "RPC: failed to unregister %s cache\n", cd->name);
}
EXPORT_SYMBOL_GPL(sunrpc_destroy_cache_detail);
void cache_purge(struct cache_detail *detail)
{
- time_t now = seconds_since_boot();
- if (detail->flush_time >= now)
- now = detail->flush_time + 1;
- /* 'now' is the maximum value any 'last_refresh' can have */
- detail->flush_time = now;
- detail->nextcheck = seconds_since_boot();
- cache_flush();
+ struct cache_head *ch = NULL;
+ struct hlist_head *head = NULL;
+ struct hlist_node *tmp = NULL;
+ int i = 0;
+ /*
+ * Walk every hash chain of @detail and unlink each entry found:
+ * the entry is taken off its chain, detail->entries is
+ * decremented and CACHE_CLEANED is set, all under hash_lock.
+ */
+ write_lock(&detail->hash_lock);
+ if (!detail->entries) { /* nothing is hashed: no need to scan the table */
+ write_unlock(&detail->hash_lock);
+ return;
+ }
+
+ dprintk("RPC: %d entries in %s cache\n", detail->entries, detail->name);
+ for (i = 0; i < detail->hash_size; i++) {
+ head = &detail->hash_table[i];
+ hlist_for_each_entry_safe(ch, tmp, head, cache_list) {
+ hlist_del_init(&ch->cache_list); /* unlink while hash_lock is held */
+ detail->entries--;
+
+ set_bit(CACHE_CLEANED, &ch->flags);
+ write_unlock(&detail->hash_lock); /* the next two calls run without hash_lock */
+ cache_fresh_unlocked(ch, detail);
+ cache_put(ch, detail);
+ write_lock(&detail->hash_lock); /* NOTE(review): tmp was sampled before the lock was dropped; confirm nothing can unlink it meanwhile */
+ }
+ }
+ write_unlock(&detail->hash_lock);
}
EXPORT_SYMBOL_GPL(cache_purge);
}
EXPORT_SYMBOL_GPL(sunrpc_cache_unregister_pipefs);
+void sunrpc_cache_unhash(struct cache_detail *cd, struct cache_head *h)
+{ /*
+ * Remove @h from the hash table of @cd if it is still hashed,
+ * and do nothing otherwise.  The check and the unlink are done
+ * under cd->hash_lock.
+ */
+ write_lock(&cd->hash_lock);
+ if (!hlist_unhashed(&h->cache_list)){
+ hlist_del_init(&h->cache_list);
+ cd->entries--;
+ write_unlock(&cd->hash_lock);
+ cache_put(h, cd); /* called after unlocking; presumably drops the hash table's reference - TODO confirm */
+ } else
+ write_unlock(&cd->hash_lock); /* already unhashed: nothing to undo */
+}
+EXPORT_SYMBOL_GPL(sunrpc_cache_unhash);
for (i = 0; i < progp->pg_nvers; i++) {
if (progp->pg_vers[i] == NULL)
continue;
- if (progp->pg_vers[i]->vs_hidden == 0)
+ if (!progp->pg_vers[i]->vs_hidden)
return 1;
}
}
if (vers->vs_hidden)
continue;
+ /*
+ * Don't register a UDP port if we need congestion
+ * control.
+ */
+ if (vers->vs_need_cong_ctrl && proto == IPPROTO_UDP)
+ continue;
+
error = __svc_register(net, progp->pg_name, progp->pg_prog,
i, family, proto, port);
!(versp = progp->pg_vers[vers]))
goto err_bad_vers;
+ /*
+ * Some protocol versions (namely NFSv4) require some form of
+ * congestion control. (See RFC 7530 section 3.1 paragraph 2)
+ * In other words, UDP is not allowed. We mark those when setting
+ * up the svc_xprt, and verify that here.
+ *
+ * The spec is not very clear about what error should be returned
+ * when someone tries to access a server that is listening on UDP
+ * for lower versions. RPC_PROG_MISMATCH seems to be the closest
+ * fit.
+ */
+ if (versp->vs_need_cong_ctrl &&
+ !test_bit(XPT_CONG_CTRL, &rqstp->rq_xprt->xpt_flags))
+ goto err_bad_vers;
+
procp = versp->vs_proc + proc;
if (proc >= versp->vs_nproc || !procp->pc_func)
goto err_bad_proc;
svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_tcp_class,
&svsk->sk_xprt, serv);
set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
+ set_bit(XPT_CONG_CTRL, &svsk->sk_xprt.xpt_flags);
if (sk->sk_state == TCP_LISTEN) {
dprintk("setting up TCP socket for listening\n");
set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags);
{
struct rpc_xprt *xprt = rqst->rq_xprt;
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
- struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)rqst->rq_buffer;
+ __be32 *p;
int rc;
/* Space in the send buffer for an RPC/RDMA header is reserved
* via xprt->tsh_size.
*/
- headerp->rm_xid = rqst->rq_xid;
- headerp->rm_vers = rpcrdma_version;
- headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests);
- headerp->rm_type = rdma_msg;
- headerp->rm_body.rm_chunks[0] = xdr_zero;
- headerp->rm_body.rm_chunks[1] = xdr_zero;
- headerp->rm_body.rm_chunks[2] = xdr_zero;
+ p = rqst->rq_buffer;
+ *p++ = rqst->rq_xid;
+ *p++ = rpcrdma_version;
+ *p++ = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests);
+ *p++ = rdma_msg;
+ *p++ = xdr_zero;
+ *p++ = xdr_zero;
+ *p = xdr_zero;
#ifdef SVCRDMA_BACKCHANNEL_DEBUG
pr_info("%s: %*ph\n", __func__, 64, rqst->rq_buffer);
/*
+ * Copyright (c) 2016 Oracle. All rights reserved.
* Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
*
* This software is available to you under a choice of one of two
#define RPCDBG_FACILITY RPCDBG_SVCXPRT
-/*
- * Decodes a read chunk list. The expected format is as follows:
- * descrim : xdr_one
- * position : __be32 offset into XDR stream
- * handle : __be32 RKEY
- * . . .
- * end-of-list: xdr_zero
- */
-static __be32 *decode_read_list(__be32 *va, __be32 *vaend)
+static __be32 *xdr_check_read_list(__be32 *p, __be32 *end)
{
- struct rpcrdma_read_chunk *ch = (struct rpcrdma_read_chunk *)va;
+ __be32 *next;
- while (ch->rc_discrim != xdr_zero) {
- if (((unsigned long)ch + sizeof(struct rpcrdma_read_chunk)) >
- (unsigned long)vaend) {
- dprintk("svcrdma: vaend=%p, ch=%p\n", vaend, ch);
+ while (*p++ != xdr_zero) {
+ next = p + rpcrdma_readchunk_maxsz - 1;
+ if (next > end)
return NULL;
- }
- ch++;
+ p = next;
}
- return &ch->rc_position;
+ return p;
}
-/*
- * Decodes a write chunk list. The expected format is as follows:
- * descrim : xdr_one
- * nchunks : <count>
- * handle : __be32 RKEY ---+
- * length : __be32 <len of segment> |
- * offset : remove va + <count>
- * . . . |
- * ---+
- */
-static __be32 *decode_write_list(__be32 *va, __be32 *vaend)
+static __be32 *xdr_check_write_list(__be32 *p, __be32 *end)
{
- unsigned long start, end;
- int nchunks;
-
- struct rpcrdma_write_array *ary =
- (struct rpcrdma_write_array *)va;
+ __be32 *next;
- /* Check for not write-array */
- if (ary->wc_discrim == xdr_zero)
- return &ary->wc_nchunks;
-
- if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) >
- (unsigned long)vaend) {
- dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
- return NULL;
- }
- nchunks = be32_to_cpu(ary->wc_nchunks);
-
- start = (unsigned long)&ary->wc_array[0];
- end = (unsigned long)vaend;
- if (nchunks < 0 ||
- nchunks > (SIZE_MAX - start) / sizeof(struct rpcrdma_write_chunk) ||
- (start + (sizeof(struct rpcrdma_write_chunk) * nchunks)) > end) {
- dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n",
- ary, nchunks, vaend);
- return NULL;
+ while (*p++ != xdr_zero) {
+ next = p + 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz;
+ if (next > end)
+ return NULL;
+ p = next;
}
- /*
- * rs_length is the 2nd 4B field in wc_target and taking its
- * address skips the list terminator
- */
- return &ary->wc_array[nchunks].wc_target.rs_length;
+ return p;
}
-static __be32 *decode_reply_array(__be32 *va, __be32 *vaend)
+static __be32 *xdr_check_reply_chunk(__be32 *p, __be32 *end)
{
- unsigned long start, end;
- int nchunks;
- struct rpcrdma_write_array *ary =
- (struct rpcrdma_write_array *)va;
-
- /* Check for no reply-array */
- if (ary->wc_discrim == xdr_zero)
- return &ary->wc_nchunks;
-
- if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) >
- (unsigned long)vaend) {
- dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
- return NULL;
- }
- nchunks = be32_to_cpu(ary->wc_nchunks);
-
- start = (unsigned long)&ary->wc_array[0];
- end = (unsigned long)vaend;
- if (nchunks < 0 ||
- nchunks > (SIZE_MAX - start) / sizeof(struct rpcrdma_write_chunk) ||
- (start + (sizeof(struct rpcrdma_write_chunk) * nchunks)) > end) {
- dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n",
- ary, nchunks, vaend);
- return NULL;
+ __be32 *next;
+
+ if (*p++ != xdr_zero) {
+ next = p + 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz;
+ if (next > end)
+ return NULL;
+ p = next;
}
- return (__be32 *)&ary->wc_array[nchunks];
+ return p;
}
/**
*/
int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg)
{
- struct rpcrdma_msg *rmsgp;
- __be32 *va, *vaend;
- unsigned int len;
- u32 hdr_len;
+ __be32 *p, *end, *rdma_argp;
+ unsigned int hdr_len;
/* Verify that there's enough bytes for header + something */
- if (rq_arg->len <= RPCRDMA_HDRLEN_ERR) {
- dprintk("svcrdma: header too short = %d\n",
- rq_arg->len);
- return -EINVAL;
- }
+ if (rq_arg->len <= RPCRDMA_HDRLEN_ERR)
+ goto out_short;
- rmsgp = (struct rpcrdma_msg *)rq_arg->head[0].iov_base;
- if (rmsgp->rm_vers != rpcrdma_version) {
- dprintk("%s: bad version %u\n", __func__,
- be32_to_cpu(rmsgp->rm_vers));
- return -EPROTONOSUPPORT;
- }
+ rdma_argp = rq_arg->head[0].iov_base;
+ if (*(rdma_argp + 1) != rpcrdma_version)
+ goto out_version;
- switch (be32_to_cpu(rmsgp->rm_type)) {
- case RDMA_MSG:
- case RDMA_NOMSG:
+ switch (*(rdma_argp + 3)) {
+ case rdma_msg:
+ case rdma_nomsg:
break;
- case RDMA_DONE:
- /* Just drop it */
- dprintk("svcrdma: dropping RDMA_DONE message\n");
- return 0;
-
- case RDMA_ERROR:
- /* Possible if this is a backchannel reply.
- * XXX: We should cancel this XID, though.
- */
- dprintk("svcrdma: dropping RDMA_ERROR message\n");
- return 0;
-
- case RDMA_MSGP:
- /* Pull in the extra for the padded case, bump our pointer */
- rmsgp->rm_body.rm_padded.rm_align =
- be32_to_cpu(rmsgp->rm_body.rm_padded.rm_align);
- rmsgp->rm_body.rm_padded.rm_thresh =
- be32_to_cpu(rmsgp->rm_body.rm_padded.rm_thresh);
-
- va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
- rq_arg->head[0].iov_base = va;
- len = (u32)((unsigned long)va - (unsigned long)rmsgp);
- rq_arg->head[0].iov_len -= len;
- if (len > rq_arg->len)
- return -EINVAL;
- return len;
- default:
- dprintk("svcrdma: bad rdma procedure (%u)\n",
- be32_to_cpu(rmsgp->rm_type));
- return -EINVAL;
- }
+ case rdma_done:
+ goto out_drop;
- /* The chunk list may contain either a read chunk list or a write
- * chunk list and a reply chunk list.
- */
- va = &rmsgp->rm_body.rm_chunks[0];
- vaend = (__be32 *)((unsigned long)rmsgp + rq_arg->len);
- va = decode_read_list(va, vaend);
- if (!va) {
- dprintk("svcrdma: failed to decode read list\n");
- return -EINVAL;
- }
- va = decode_write_list(va, vaend);
- if (!va) {
- dprintk("svcrdma: failed to decode write list\n");
- return -EINVAL;
- }
- va = decode_reply_array(va, vaend);
- if (!va) {
- dprintk("svcrdma: failed to decode reply chunk\n");
- return -EINVAL;
+ case rdma_error:
+ goto out_drop;
+
+ default:
+ goto out_proc;
}
- rq_arg->head[0].iov_base = va;
- hdr_len = (unsigned long)va - (unsigned long)rmsgp;
+ end = (__be32 *)((unsigned long)rdma_argp + rq_arg->len);
+ p = xdr_check_read_list(rdma_argp + 4, end);
+ if (!p)
+ goto out_inval;
+ p = xdr_check_write_list(p, end);
+ if (!p)
+ goto out_inval;
+ p = xdr_check_reply_chunk(p, end);
+ if (!p)
+ goto out_inval;
+ if (p > end)
+ goto out_inval;
+
+ rq_arg->head[0].iov_base = p;
+ hdr_len = (unsigned long)p - (unsigned long)rdma_argp;
rq_arg->head[0].iov_len -= hdr_len;
return hdr_len;
+
+out_short:
+ dprintk("svcrdma: header too short = %d\n", rq_arg->len);
+ return -EINVAL;
+
+out_version:
+ dprintk("svcrdma: bad xprt version: %u\n",
+ be32_to_cpup(rdma_argp + 1));
+ return -EPROTONOSUPPORT;
+
+out_drop:
+ dprintk("svcrdma: dropping RDMA_DONE/ERROR message\n");
+ return 0;
+
+out_proc:
+ dprintk("svcrdma: bad rdma procedure (%u)\n",
+ be32_to_cpup(rdma_argp + 3));
+ return -EINVAL;
+
+out_inval:
+ dprintk("svcrdma: failed to parse transport header\n");
+ return -EINVAL;
}
int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
*va++ = rmsgp->rm_xid;
*va++ = rmsgp->rm_vers;
- *va++ = cpu_to_be32(xprt->sc_max_requests);
+ *va++ = xprt->sc_fc_credits;
*va++ = rdma_error;
*va++ = cpu_to_be32(err);
if (err == ERR_VERS) {
return (int)((unsigned long)va - (unsigned long)startp);
}
-int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *rmsgp)
+/**
+ * svc_rdma_xdr_get_reply_hdr_len - Get length of Reply transport header
+ * @rdma_resp: buffer containing Reply transport header
+ *
+ * Returns length of transport header, in bytes.
+ */
+unsigned int svc_rdma_xdr_get_reply_hdr_len(__be32 *rdma_resp)
{
- struct rpcrdma_write_array *wr_ary;
+ unsigned int nsegs;
+ __be32 *p;
- /* There is no read-list in a reply */
+ p = rdma_resp;
- /* skip write list */
- wr_ary = (struct rpcrdma_write_array *)
- &rmsgp->rm_body.rm_chunks[1];
- if (wr_ary->wc_discrim)
- wr_ary = (struct rpcrdma_write_array *)
- &wr_ary->wc_array[be32_to_cpu(wr_ary->wc_nchunks)].
- wc_target.rs_length;
- else
- wr_ary = (struct rpcrdma_write_array *)
- &wr_ary->wc_nchunks;
-
- /* skip reply array */
- if (wr_ary->wc_discrim)
- wr_ary = (struct rpcrdma_write_array *)
- &wr_ary->wc_array[be32_to_cpu(wr_ary->wc_nchunks)];
- else
- wr_ary = (struct rpcrdma_write_array *)
- &wr_ary->wc_nchunks;
-
- return (unsigned long) wr_ary - (unsigned long) rmsgp;
+ /* RPC-over-RDMA V1 replies never have a Read list. */
+ p += rpcrdma_fixed_maxsz + 1;
+
+ /* Skip Write list. */
+ while (*p++ != xdr_zero) {
+ nsegs = be32_to_cpup(p++);
+ p += nsegs * rpcrdma_segment_maxsz;
+ }
+
+ /* Skip Reply chunk. */
+ if (*p++ != xdr_zero) {
+ nsegs = be32_to_cpup(p++);
+ p += nsegs * rpcrdma_segment_maxsz;
+ }
+
+ return (unsigned long)p - (unsigned long)rdma_resp;
}
void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks)
seg->rs_offset = rs_offset;
seg->rs_length = cpu_to_be32(write_len);
}
-
-void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *xprt,
- struct rpcrdma_msg *rdma_argp,
- struct rpcrdma_msg *rdma_resp,
- enum rpcrdma_proc rdma_type)
-{
- rdma_resp->rm_xid = rdma_argp->rm_xid;
- rdma_resp->rm_vers = rdma_argp->rm_vers;
- rdma_resp->rm_credit = cpu_to_be32(xprt->sc_max_requests);
- rdma_resp->rm_type = cpu_to_be32(rdma_type);
-
- /* Encode <nul> chunks lists */
- rdma_resp->rm_body.rm_chunks[0] = xdr_zero;
- rdma_resp->rm_body.rm_chunks[1] = xdr_zero;
- rdma_resp->rm_body.rm_chunks[2] = xdr_zero;
-}
dprintk("svcrdma: rqstp=%p\n", rqstp);
- spin_lock_bh(&rdma_xprt->sc_rq_dto_lock);
+ spin_lock(&rdma_xprt->sc_rq_dto_lock);
if (!list_empty(&rdma_xprt->sc_read_complete_q)) {
- ctxt = list_entry(rdma_xprt->sc_read_complete_q.next,
- struct svc_rdma_op_ctxt,
- dto_q);
- list_del_init(&ctxt->dto_q);
- spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
+ ctxt = list_first_entry(&rdma_xprt->sc_read_complete_q,
+ struct svc_rdma_op_ctxt, list);
+ list_del(&ctxt->list);
+ spin_unlock(&rdma_xprt->sc_rq_dto_lock);
rdma_read_complete(rqstp, ctxt);
goto complete;
} else if (!list_empty(&rdma_xprt->sc_rq_dto_q)) {
- ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next,
- struct svc_rdma_op_ctxt,
- dto_q);
- list_del_init(&ctxt->dto_q);
+ ctxt = list_first_entry(&rdma_xprt->sc_rq_dto_q,
+ struct svc_rdma_op_ctxt, list);
+ list_del(&ctxt->list);
} else {
atomic_inc(&rdma_stat_rq_starve);
clear_bit(XPT_DATA, &xprt->xpt_flags);
ctxt = NULL;
}
- spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
+ spin_unlock(&rdma_xprt->sc_rq_dto_lock);
if (!ctxt) {
/* This is the EAGAIN path. The svc_recv routine will
* return -EAGAIN, the nfsd thread will go to call into
/* Prepare the SGE for the RPCRDMA Header */
ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey;
- ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp);
+ ctxt->sge[0].length =
+ svc_rdma_xdr_get_reply_hdr_len((__be32 *)rdma_resp);
ctxt->sge[0].addr =
ib_dma_map_page(rdma->sc_cm_id->device, page, 0,
ctxt->sge[0].length, DMA_TO_DEVICE);
struct rpcrdma_msg *rdma_argp;
struct rpcrdma_msg *rdma_resp;
struct rpcrdma_write_array *wr_ary, *rp_ary;
- enum rpcrdma_proc reply_type;
int ret;
int inline_bytes;
struct page *res_page;
struct svc_rdma_req_map *vec;
u32 inv_rkey;
+ __be32 *p;
dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
if (!res_page)
goto err0;
rdma_resp = page_address(res_page);
- if (rp_ary)
- reply_type = RDMA_NOMSG;
- else
- reply_type = RDMA_MSG;
- svc_rdma_xdr_encode_reply_header(rdma, rdma_argp,
- rdma_resp, reply_type);
+
+ p = &rdma_resp->rm_xid;
+ *p++ = rdma_argp->rm_xid;
+ *p++ = rdma_argp->rm_vers;
+ *p++ = rdma->sc_fc_credits;
+ *p++ = rp_ary ? rdma_nomsg : rdma_msg;
+
+ /* Start with empty chunks */
+ *p++ = xdr_zero;
+ *p++ = xdr_zero;
+ *p = xdr_zero;
/* Send any write-chunk data and build resp write-list */
if (wr_ary) {
ctxt = kmalloc(sizeof(*ctxt), flags);
if (ctxt) {
ctxt->xprt = xprt;
- INIT_LIST_HEAD(&ctxt->free);
- INIT_LIST_HEAD(&ctxt->dto_q);
+ INIT_LIST_HEAD(&ctxt->list);
}
return ctxt;
}
dprintk("svcrdma: No memory for RDMA ctxt\n");
return false;
}
- list_add(&ctxt->free, &xprt->sc_ctxts);
+ list_add(&ctxt->list, &xprt->sc_ctxts);
}
return true;
}
{
struct svc_rdma_op_ctxt *ctxt = NULL;
- spin_lock_bh(&xprt->sc_ctxt_lock);
+ spin_lock(&xprt->sc_ctxt_lock);
xprt->sc_ctxt_used++;
if (list_empty(&xprt->sc_ctxts))
goto out_empty;
ctxt = list_first_entry(&xprt->sc_ctxts,
- struct svc_rdma_op_ctxt, free);
- list_del_init(&ctxt->free);
- spin_unlock_bh(&xprt->sc_ctxt_lock);
+ struct svc_rdma_op_ctxt, list);
+ list_del(&ctxt->list);
+ spin_unlock(&xprt->sc_ctxt_lock);
out:
ctxt->count = 0;
/* Either pre-allocation missed the mark, or send
* queue accounting is broken.
*/
- spin_unlock_bh(&xprt->sc_ctxt_lock);
+ spin_unlock(&xprt->sc_ctxt_lock);
ctxt = alloc_ctxt(xprt, GFP_NOIO);
if (ctxt)
goto out;
- spin_lock_bh(&xprt->sc_ctxt_lock);
+ spin_lock(&xprt->sc_ctxt_lock);
xprt->sc_ctxt_used--;
- spin_unlock_bh(&xprt->sc_ctxt_lock);
+ spin_unlock(&xprt->sc_ctxt_lock);
WARN_ONCE(1, "svcrdma: empty RDMA ctxt list?\n");
return NULL;
}
for (i = 0; i < ctxt->count; i++)
put_page(ctxt->pages[i]);
- spin_lock_bh(&xprt->sc_ctxt_lock);
+ spin_lock(&xprt->sc_ctxt_lock);
xprt->sc_ctxt_used--;
- list_add(&ctxt->free, &xprt->sc_ctxts);
- spin_unlock_bh(&xprt->sc_ctxt_lock);
+ list_add(&ctxt->list, &xprt->sc_ctxts);
+ spin_unlock(&xprt->sc_ctxt_lock);
}
static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt)
struct svc_rdma_op_ctxt *ctxt;
ctxt = list_first_entry(&xprt->sc_ctxts,
- struct svc_rdma_op_ctxt, free);
- list_del(&ctxt->free);
+ struct svc_rdma_op_ctxt, list);
+ list_del(&ctxt->list);
kfree(ctxt);
}
}
/* All wc fields are now known to be valid */
ctxt->byte_len = wc->byte_len;
spin_lock(&xprt->sc_rq_dto_lock);
- list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q);
+ list_add_tail(&ctxt->list, &xprt->sc_rq_dto_q);
spin_unlock(&xprt->sc_rq_dto_lock);
set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
read_hdr = ctxt->read_hdr;
spin_lock(&xprt->sc_rq_dto_lock);
- list_add_tail(&read_hdr->dto_q,
+ list_add_tail(&read_hdr->list,
&xprt->sc_read_complete_q);
spin_unlock(&xprt->sc_rq_dto_lock);
return NULL;
svc_xprt_init(&init_net, &svc_rdma_class, &cma_xprt->sc_xprt, serv);
INIT_LIST_HEAD(&cma_xprt->sc_accept_q);
- INIT_LIST_HEAD(&cma_xprt->sc_dto_q);
INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
INIT_LIST_HEAD(&cma_xprt->sc_frmr_q);
spin_lock_init(&cma_xprt->sc_ctxt_lock);
spin_lock_init(&cma_xprt->sc_map_lock);
+ /*
+ * Note that this implies that the underlying transport has
+ * some form of congestion control (see RFC 7530 section 3.1
+ * paragraph 2). For now, we assume that all supported RDMA
+ * transports are suitable here.
+ */
+ set_bit(XPT_CONG_CTRL, &cma_xprt->sc_xprt.xpt_flags);
+
if (listener)
set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
{
struct svc_rdma_fastreg_mr *frmr = NULL;
- spin_lock_bh(&rdma->sc_frmr_q_lock);
+ spin_lock(&rdma->sc_frmr_q_lock);
if (!list_empty(&rdma->sc_frmr_q)) {
frmr = list_entry(rdma->sc_frmr_q.next,
struct svc_rdma_fastreg_mr, frmr_list);
list_del_init(&frmr->frmr_list);
frmr->sg_nents = 0;
}
- spin_unlock_bh(&rdma->sc_frmr_q_lock);
+ spin_unlock(&rdma->sc_frmr_q_lock);
if (frmr)
return frmr;
if (frmr) {
ib_dma_unmap_sg(rdma->sc_cm_id->device,
frmr->sg, frmr->sg_nents, frmr->direction);
- spin_lock_bh(&rdma->sc_frmr_q_lock);
+ spin_lock(&rdma->sc_frmr_q_lock);
WARN_ON_ONCE(!list_empty(&frmr->frmr_list));
list_add(&frmr->frmr_list, &rdma->sc_frmr_q);
- spin_unlock_bh(&rdma->sc_frmr_q_lock);
+ spin_unlock(&rdma->sc_frmr_q_lock);
}
}
newxprt->sc_max_req_size = svcrdma_max_req_size;
newxprt->sc_max_requests = min_t(u32, dev->attrs.max_qp_wr,
svcrdma_max_requests);
+ newxprt->sc_fc_credits = cpu_to_be32(newxprt->sc_max_requests);
newxprt->sc_max_bc_requests = min_t(u32, dev->attrs.max_qp_wr,
svcrdma_max_bc_requests);
newxprt->sc_rq_depth = newxprt->sc_max_requests +
goto errout;
}
newxprt->sc_sq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_sq_depth,
- 0, IB_POLL_SOFTIRQ);
+ 0, IB_POLL_WORKQUEUE);
if (IS_ERR(newxprt->sc_sq_cq)) {
dprintk("svcrdma: error creating SQ CQ for connect request\n");
goto errout;
}
newxprt->sc_rq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_rq_depth,
- 0, IB_POLL_SOFTIRQ);
+ 0, IB_POLL_WORKQUEUE);
if (IS_ERR(newxprt->sc_rq_cq)) {
dprintk("svcrdma: error creating RQ CQ for connect request\n");
goto errout;
*/
while (!list_empty(&rdma->sc_read_complete_q)) {
struct svc_rdma_op_ctxt *ctxt;
- ctxt = list_entry(rdma->sc_read_complete_q.next,
- struct svc_rdma_op_ctxt,
- dto_q);
- list_del_init(&ctxt->dto_q);
+ ctxt = list_first_entry(&rdma->sc_read_complete_q,
+ struct svc_rdma_op_ctxt, list);
+ list_del(&ctxt->list);
svc_rdma_put_context(ctxt, 1);
}
/* Destroy queued, but not processed recv completions */
while (!list_empty(&rdma->sc_rq_dto_q)) {
struct svc_rdma_op_ctxt *ctxt;
- ctxt = list_entry(rdma->sc_rq_dto_q.next,
- struct svc_rdma_op_ctxt,
- dto_q);
- list_del_init(&ctxt->dto_q);
+ ctxt = list_first_entry(&rdma->sc_rq_dto_q,
+ struct svc_rdma_op_ctxt, list);
+ list_del(&ctxt->list);
svc_rdma_put_context(ctxt, 1);
}
{
struct sk_buff_head xmitq;
struct tipc_node *n;
- struct tipc_msg *hdr = buf_msg(skb);
- int usr = msg_user(hdr);
+ struct tipc_msg *hdr;
int bearer_id = b->identity;
struct tipc_link_entry *le;
- u16 bc_ack = msg_bcast_ack(hdr);
u32 self = tipc_own_addr(net);
- int rc = 0;
+ int usr, rc = 0;
+ u16 bc_ack;
__skb_queue_head_init(&xmitq);
- /* Ensure message is well-formed */
+ /* Ensure message is well-formed before touching the header */
if (unlikely(!tipc_msg_validate(skb)))
goto discard;
+ hdr = buf_msg(skb);
+ usr = msg_user(hdr);
+ bc_ack = msg_bcast_ack(hdr);
/* Handle arrival of discovery or broadcast packet */
if (unlikely(msg_non_seq(hdr))) {
return mtu ? : dst_mtu(dst->path);
}
-static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst,
- struct sk_buff *skb,
- const void *daddr)
-{
- return dst->path->ops->neigh_lookup(dst, skb, daddr);
-}
-
-static void xfrm_confirm_neigh(const struct dst_entry *dst, const void *daddr)
+static const void *xfrm_get_dst_nexthop(const struct dst_entry *dst,
+ const void *daddr)
{
const struct dst_entry *path = dst->path;
else if (!(xfrm->type->flags & XFRM_TYPE_LOCAL_COADDR))
daddr = &xfrm->id.daddr;
}
+ return daddr;
+}
+
+static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst,
+ struct sk_buff *skb,
+ const void *daddr)
+{
+ const struct dst_entry *path = dst->path;
+
+ if (!skb)
+ daddr = xfrm_get_dst_nexthop(dst, daddr);
+ return path->ops->neigh_lookup(path, skb, daddr);
+}
+
+static void xfrm_confirm_neigh(const struct dst_entry *dst, const void *daddr)
+{
+ const struct dst_entry *path = dst->path;
+
+ daddr = xfrm_get_dst_nexthop(dst, daddr);
path->ops->confirm_neigh(path, daddr);
}
clean:
$(call QUIET_CLEAN, fixdep)
- $(Q)find . -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete
- $(Q)rm -f fixdep
+ $(Q)find $(if $(OUTPUT),$(OUTPUT),.) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete
+ $(Q)rm -f $(OUTPUT)fixdep
$(OUTPUT)fixdep-in.o: FORCE
$(Q)$(MAKE) $(build)=fixdep
fixdep:
$(Q)$(MAKE) -C $(srctree)/tools/build CFLAGS= LDFLAGS= $(OUTPUT)fixdep
+fixdep-clean:
+ $(Q)$(MAKE) -C $(srctree)/tools/build clean
+
.PHONY: fixdep
}
/**
- * pevent_data_prempt_count - parse the preempt count from the record
+ * pevent_data_preempt_count - parse the preempt count from the record
* @pevent: a handle to the pevent
* @rec: the record to parse
*
* This returns the preempt count from a record.
*/
-int pevent_data_prempt_count(struct pevent *pevent, struct pevent_record *rec)
+int pevent_data_preempt_count(struct pevent *pevent, struct pevent_record *rec)
{
return parse_common_pc(pevent, rec->data);
}
int pevent_data_type(struct pevent *pevent, struct pevent_record *rec);
struct event_format *pevent_data_event_from_type(struct pevent *pevent, int type);
int pevent_data_pid(struct pevent *pevent, struct pevent_record *rec);
-int pevent_data_prempt_count(struct pevent *pevent, struct pevent_record *rec);
+int pevent_data_preempt_count(struct pevent *pevent, struct pevent_record *rec);
int pevent_data_flags(struct pevent *pevent, struct pevent_record *rec);
const char *pevent_data_comm_from_pid(struct pevent *pevent, int pid);
struct cmdline;
#define INSN_CALL_DYNAMIC 8
#define INSN_RETURN 9
#define INSN_CONTEXT_SWITCH 10
-#define INSN_BUG 11
-#define INSN_NOP 12
-#define INSN_OTHER 13
+#define INSN_NOP 11
+#define INSN_OTHER 12
#define INSN_LAST INSN_OTHER
int arch_decode_instruction(struct elf *elf, struct section *sec,
op2 == 0x35)
/* sysenter, sysret */
*type = INSN_CONTEXT_SWITCH;
- else if (op2 == 0x0b || op2 == 0xb9)
- /* ud2 */
- *type = INSN_BUG;
else if (op2 == 0x0d || op2 == 0x1f)
/* nopl/nopw */
*type = INSN_NOP;
unsigned int len, state;
unsigned char type;
unsigned long immediate;
- bool alt_group, visited;
+ bool alt_group, visited, dead_end;
struct symbol *call_dest;
struct instruction *jump_dest;
struct list_head alts;
return 0;
}
+/*
+ * Find all uses of the unreachable() macro, which are code path dead ends.
+ *
+ * Walks every relocation in the .rela__unreachable section.  Each one names
+ * a section symbol plus an addend; the instruction flagged as dead_end is the
+ * one *before* that location: either the list predecessor of the instruction
+ * found at sec+addend or, when the addend equals the section length, the last
+ * entry in file->insn_list that belongs to that section.
+ *
+ * Returns 0 on success (including when .rela__unreachable does not exist),
+ * or -1 after a WARN() when a relocation symbol is not STT_SECTION or no
+ * matching instruction can be found.
+ *
+ * NOTE(review): list_prev_entry() is applied without checking that a
+ * predecessor exists in the same section; presumably an unreachable()
+ * annotation never points at the first instruction of a section -- confirm.
+ */
+static int add_dead_ends(struct objtool_file *file)
+{
+ struct section *sec;
+ struct rela *rela;
+ struct instruction *insn;
+ bool found;
+
+ sec = find_section_by_name(file->elf, ".rela__unreachable");
+ if (!sec)
+ return 0; /* no section, nothing to mark */
+
+ list_for_each_entry(rela, &sec->rela_list, list) {
+ if (rela->sym->type != STT_SECTION) {
+ WARN("unexpected relocation symbol type in .rela__unreachable");
+ return -1;
+ }
+ insn = find_insn(file, rela->sym->sec, rela->addend);
+ if (insn)
+ insn = list_prev_entry(insn, list); /* step back to the instruction preceding the relocation target */
+ else if (rela->addend == rela->sym->sec->len) {
+ /* target is the end of the section: search backwards for an insn in that section */
+ found = false;
+ list_for_each_entry_reverse(insn, &file->insn_list, list) {
+ if (insn->sec == rela->sym->sec) {
+ found = true;
+ break;
+ }
+ }
+
+ if (!found) {
+ WARN("can't find unreachable insn at %s+0x%x",
+ rela->sym->sec->name, rela->addend);
+ return -1;
+ }
+ } else {
+ /* addend is neither an instruction start nor the section end */
+ WARN("can't find unreachable insn at %s+0x%x",
+ rela->sym->sec->name, rela->addend);
+ return -1;
+ }
+
+ insn->dead_end = true;
+ }
+
+ return 0;
+}
+
/*
* Warnings shouldn't be reported for ignored functions.
*/
if (ret)
return ret;
+ ret = add_dead_ends(file);
+ if (ret)
+ return ret;
+
add_ignores(file);
ret = add_jump_destinations(file);
return 0;
- case INSN_BUG:
- return 0;
-
default:
break;
}
+ if (insn->dead_end)
+ return 0;
+
insn = next_insn_same_sec(file, insn);
if (!insn) {
WARN("%s: unexpected end of section", sec->name);
--verbose::
Be more verbose. (Show symbol address, etc)
+-q::
+--quiet::
+ Do not show any message. (Suppress -v)
+
-D::
--dump-raw-trace::
Dump raw trace in ASCII.
Be verbose, for instance, show the raw counts in addition to the
diff.
+-q::
+--quiet::
+ Do not show any message. (Suppress -v)
+
-f::
--force::
Don't do ownership validation.
-a::
--all-cpus::
- System-wide collection from all CPUs.
+ System-wide collection from all CPUs (default if no target is specified).
-p::
--pid=::
--verbose::
Be more verbose. (show symbol address, etc)
+-q::
+--quiet::
+ Do not show any message. (Suppress -v)
+
-n::
--show-nr-samples::
Show the number of samples for each symbol
-a::
--all-cpus::
- system-wide collection from all CPUs
+ system-wide collection from all CPUs (default if no target is specified)
-c::
--scale::
PYTHON_EMBED_LDOPTS := $(shell $(PYTHON_CONFIG_SQ) --ldflags 2>/dev/null)
PYTHON_EMBED_CCOPTS := $(shell $(PYTHON_CONFIG_SQ) --cflags 2>/dev/null)
+ifeq ($(CC), clang)
+ PYTHON_EMBED_CCOPTS := $(filter-out -specs=%,$(PYTHON_EMBED_CCOPTS))
+endif
+
FEATURE_CHECK_CFLAGS-libpython := $(PYTHON_EMBED_CCOPTS)
FEATURE_CHECK_LDFLAGS-libpython := $(PYTHON_EMBED_LDOPTS)
FEATURE_CHECK_CFLAGS-libpython-version := $(PYTHON_EMBED_CCOPTS)
PYTHON_EMBED_LDFLAGS := $(call strip-libs,$(PYTHON_EMBED_LDOPTS))
PYTHON_EMBED_LIBADD := $(call grep-libs,$(PYTHON_EMBED_LDOPTS)) -lutil
PYTHON_EMBED_CCOPTS := $(shell $(PYTHON_CONFIG_SQ) --cflags 2>/dev/null)
+ ifeq ($(CC), clang)
+ PYTHON_EMBED_CCOPTS := $(filter-out -specs=%,$(PYTHON_EMBED_CCOPTS))
+ endif
FLAGS_PYTHON_EMBED := $(PYTHON_EMBED_CCOPTS) $(PYTHON_EMBED_LDOPTS)
ifneq ($(feature-libpython), 1)
$(call QUIET_CLEAN, config)
$(Q)$(MAKE) -C $(srctree)/tools/build/feature/ $(if $(OUTPUT),OUTPUT=$(OUTPUT)feature/,) clean >/dev/null
-clean:: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean config-clean
+clean:: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean config-clean fixdep-clean
$(call QUIET_CLEAN, core-objs) $(RM) $(LIB_FILE) $(OUTPUT)perf-archive $(OUTPUT)perf-with-kcore $(LANG_BINDINGS)
$(Q)find $(if $(OUTPUT),$(OUTPUT),.) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete
$(Q)$(RM) $(OUTPUT).config-detected
$(call QUIET_CLEAN, core-progs) $(RM) $(ALL_PROGRAMS) perf perf-read-vdso32 perf-read-vdsox32 $(OUTPUT)pmu-events/jevents $(OUTPUT)$(LIBJVMTI).so
$(call QUIET_CLEAN, core-gen) $(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo $(OUTPUT)common-cmds.h TAGS tags cscope* $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)FEATURE-DUMP $(OUTPUT)util/*-bison* $(OUTPUT)util/*-flex* \
- $(OUTPUT)util/intel-pt-decoder/inat-tables.c $(OUTPUT)fixdep \
+ $(OUTPUT)util/intel-pt-decoder/inat-tables.c \
$(OUTPUT)tests/llvm-src-{base,kbuild,prologue,relocation}.c \
$(OUTPUT)pmu-events/pmu-events.c
$(QUIET_SUBDIR0)Documentation $(QUIET_SUBDIR1) clean
OPT_BOOLEAN('f', "force", &file.force, "don't complain, do it"),
OPT_INCR('v', "verbose", &verbose,
"be more verbose (show symbol address, etc)"),
+ OPT_BOOLEAN('q', "quiet", &quiet, "do now show any message"),
OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
"dump raw trace in ASCII"),
OPT_BOOLEAN(0, "gtk", &annotate.use_gtk, "Use the GTK interface"),
annotate.sym_hist_filter = argv[0];
}
+ if (quiet)
+ perf_quiet_option();
+
file.path = input_name;
annotate.session = perf_session__new(&file, false, &annotate.tool);
hists__precompute(hists);
hists__output_resort(hists, NULL);
- hists__fprintf(hists, true, 0, 0, 0, stdout,
+ hists__fprintf(hists, !quiet, 0, 0, 0, stdout,
symbol_conf.use_callchain);
}
hists__link(hists_base, hists);
}
- fprintf(stdout, "%s# Event '%s'\n#\n", first ? "" : "\n",
- perf_evsel__name(evsel_base));
+ if (!quiet) {
+ fprintf(stdout, "%s# Event '%s'\n#\n", first ? "" : "\n",
+ perf_evsel__name(evsel_base));
+ }
first = false;
- if (verbose || data__files_cnt > 2)
+ if (verbose > 0 || ((data__files_cnt > 2) && !quiet))
data__fprintf();
/* Don't sort callchain for perf diff */
static const struct option options[] = {
OPT_INCR('v', "verbose", &verbose,
"be more verbose (show symbol address, etc)"),
+ OPT_BOOLEAN('q', "quiet", &quiet, "Do not show any message"),
OPT_BOOLEAN('b', "baseline-only", &show_baseline_only,
"Show only items with match in baseline"),
OPT_CALLBACK('c', "compute", &compute,
argc = parse_options(argc, argv, options, diff_usage, 0);
+ if (quiet)
+ perf_quiet_option();
+
if (symbol__init(NULL) < 0)
return -1;
fprintf(stderr, "%-13s%-*s%s\n",
e->tag,
- verbose ? 25 : 0,
- verbose ? perf_mem_events__name(j) : "",
+ verbose > 0 ? 25 : 0,
+ verbose > 0 ? perf_mem_events__name(j) : "",
e->supported ? ": available" : "");
}
exit(0);
try_again:
if (perf_evsel__open(pos, pos->cpus, pos->threads) < 0) {
if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
- if (verbose)
+ if (verbose > 0)
ui__warning("%s\n", msg);
goto try_again;
}
argc = parse_options(argc, argv, record_options, record_usage,
PARSE_OPT_STOP_AT_NON_OPTION);
+ if (quiet)
+ perf_quiet_option();
+
+ /* Make system wide (-a) the default target. */
if (!argc && target__none(&rec->opts.target))
- usage_with_options(record_usage, record_options);
+ rec->opts.target.system_wide = true;
if (nr_cgroups && !rec->opts.target.system_wide) {
usage_with_options_msg(record_usage, record_options,
size_t size = sizeof(buf);
int socked_id = hists->socket_filter;
+ if (quiet)
+ return 0;
+
if (symbol_conf.filter_relative) {
nr_samples = hists->stats.nr_non_filtered_samples;
nr_events = hists->stats.total_non_filtered_period;
{
struct perf_evsel *pos;
- fprintf(stdout, "#\n# Total Lost Samples: %" PRIu64 "\n#\n", evlist->stats.total_lost_samples);
+ if (!quiet) {
+ fprintf(stdout, "#\n# Total Lost Samples: %" PRIu64 "\n#\n",
+ evlist->stats.total_lost_samples);
+ }
+
evlist__for_each_entry(evlist, pos) {
struct hists *hists = evsel__hists(pos);
const char *evname = perf_evsel__name(pos);
continue;
hists__fprintf_nr_sample_events(hists, rep, evname, stdout);
- hists__fprintf(hists, true, 0, 0, rep->min_percent, stdout,
+ hists__fprintf(hists, !quiet, 0, 0, rep->min_percent, stdout,
symbol_conf.use_callchain);
fprintf(stdout, "\n\n");
}
"input file name"),
OPT_INCR('v', "verbose", &verbose,
"be more verbose (show symbol address, etc)"),
+ OPT_BOOLEAN('q', "quiet", &quiet, "Do not show any message"),
OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
"dump raw trace in ASCII"),
OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
report.symbol_filter_str = argv[0];
}
+ if (quiet)
+ perf_quiet_option();
+
if (symbol_conf.vmlinux_name &&
access(symbol_conf.vmlinux_name, R_OK)) {
pr_err("Invalid file: %s\n", symbol_conf.vmlinux_name);
goto error;
}
- if (report.header || report.header_only) {
+ if ((report.header || report.header_only) && !quiet) {
perf_session__fprintf_info(session, stdout,
report.show_full_info);
if (report.header_only) {
ret = 0;
goto error;
}
- } else if (use_browser == 0) {
+ } else if (use_browser == 0 && !quiet) {
fputs("# To display the perf.data header info, please use --header/--header-only options.\n#\n",
stdout);
}
* providing it only in verbose mode not to bloat too
* much struct symbol.
*/
- if (verbose) {
+ if (verbose > 0) {
/*
* XXX: Need to provide a less kludgy way to ask for
* more space per symbol, the u32 is for the index on
BUG_ON(!sched->tasks);
sched->tasks[task->nr] = task;
- if (verbose)
+ if (verbose > 0)
printf("registered task #%ld, PID %ld (%s)\n", sched->nr_tasks, pid, comm);
return task;
const u32 pid = perf_evsel__intval(evsel, sample, "pid");
struct task_desc *waker, *wakee;
- if (verbose) {
+ if (verbose > 0) {
printf("sched_wakeup event %p\n", evsel);
printf(" ... pid %d woke up %s/%d\n", sample->tid, comm, pid);
int cpu = sample->cpu;
s64 delta;
- if (verbose)
+ if (verbose > 0)
printf("sched_switch event %p\n", evsel);
if (cpu >= MAX_CPUS || cpu < 0)
goto out_put;
}
- if (verbose) {
+ if (verbose > 0) {
printf("fork event\n");
printf("... parent: %s/%d\n", thread__comm_str(parent), parent->tid);
printf("... child: %s/%d\n", thread__comm_str(child), child->tid);
timestamp__scnprintf_usec(timestamp, stimestamp, sizeof(stimestamp));
color_fprintf(stdout, color, " %12s secs ", stimestamp);
- if (new_shortname || (verbose && sched_in->tid)) {
+ if (new_shortname || (verbose > 0 && sched_in->tid)) {
const char *pid_color = color;
if (thread__has_color(sched_in))
if (thread__resolve_callchain(thread, cursor, evsel, sample,
NULL, NULL, sched->max_stack + 2) != 0) {
- if (verbose)
+ if (verbose > 0)
error("Failed to resolve callchain. Skipping\n");
return;
if (errno == EINVAL || errno == ENOSYS ||
errno == ENOENT || errno == EOPNOTSUPP ||
errno == ENXIO) {
- if (verbose)
+ if (verbose > 0)
ui__warning("%s event is not supported by the kernel.\n",
perf_evsel__name(counter));
counter->supported = false;
!(counter->leader->nr_members > 1))
continue;
} else if (perf_evsel__fallback(counter, errno, msg, sizeof(msg))) {
- if (verbose)
+ if (verbose > 0)
ui__warning("%s\n", msg);
goto try_again;
}
cpu = map->map[idx];
- if (cpu >= env->nr_cpus_online)
+ if (cpu >= env->nr_cpus_avail)
return -1;
return cpu;
} else if (big_num_opt == 0) /* User passed --no-big-num */
big_num = false;
+ /* Make system wide (-a) the default target. */
if (!argc && target__none(&target))
- usage_with_options(stat_usage, stat_options);
+ target.system_wide = true;
if (run_count < 0) {
pr_err("Run count must be a positive number\n");
status = 0;
for (run_idx = 0; forever || run_idx < run_count; run_idx++) {
- if (run_count != 1 && verbose)
+ if (run_count != 1 && verbose > 0)
fprintf(output, "[ perf stat: executing run #%d ... ]\n",
run_idx + 1);
if (perf_evsel__open(counter, top->evlist->cpus,
top->evlist->threads) < 0) {
if (perf_evsel__fallback(counter, errno, msg, sizeof(msg))) {
- if (verbose)
+ if (verbose > 0)
ui__warning("%s\n", msg);
goto try_again;
}
return &trace->syscalls.table[id];
out_cant_read:
- if (verbose) {
+ if (verbose > 0) {
fprintf(trace->output, "Problems reading syscall %d", id);
if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
bool print_dso, bool print_sym)
{
- if ((verbose || print_dso) && al->map)
+ if ((verbose > 0 || print_dso) && al->map)
fprintf(f, "%s@", al->map->dso->long_name);
- if ((verbose || print_sym) && al->sym)
+ if ((verbose > 0 || print_sym) && al->sym)
fprintf(f, "%s+0x%" PRIx64, al->sym->name,
al->addr - al->sym->start);
else if (al->map)
int err;
int fd = open(fn, O_RDONLY);
- if (fd < 0 && verbose && fn) {
+ if (fd < 0 && verbose > 0 && fn) {
pr_err("Error opening events file '%s': %s\n", fn,
strerror(errno));
}
int vcnt = min(verbose, (int) sizeof(v) - 1);
char cmd[3*PATH_MAX];
- if (verbose)
+ if (verbose > 0)
vcnt++;
snprintf(cmd, 3*PATH_MAX, PYTHON " %s/attr.py -d %s/attr/ -p %s %.*s",
if (!dont_fork) {
pr_debug("test child forked, pid %d\n", getpid());
- if (!verbose) {
+ if (verbose <= 0) {
int nullfd = open("/dev/null", O_WRONLY);
if (nullfd >= 0) {
continue;
}
- if (verbose) {
+ if (verbose > 0) {
char errbuf[512];
perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
pr_debug("perf_evlist__open() failed!\n%s\n", errbuf);
{
int printed = 0;
- if (!verbose)
+ if (verbose <= 0)
return 0;
printed += fprintf(fp, "\n%s: ", prefix);
* Skip this test if user's .perfconfig doesn't set [llvm] section
* and clang is not found in $PATH, and this is not perf test -v
*/
- if (!force && (verbose == 0 &&
+ if (!force && (verbose <= 0 &&
!llvm_param.user_set_param &&
llvm__search_clang())) {
pr_debug("No clang and no verbosive, skip this test\n");
{
char msg[1024];
- if (!verbose)
+ if (verbose <= 0)
return;
vsnprintf(msg, sizeof(msg), warn, params);
err = perf_evlist__parse_sample(evlist, event, &sample);
if (err < 0) {
- if (verbose)
+ if (verbose > 0)
perf_event__fprintf(event, stderr);
pr_debug("Couldn't parse sample\n");
goto out_delete_evlist;
}
- if (verbose) {
+ if (verbose > 0) {
pr_info("%" PRIu64" %d ", sample.time, sample.cpu);
perf_event__fprintf(event, stderr);
}
int ret;
if (asprintf(&cmd, "echo \"import sys ; sys.path.append('%s'); import perf\" | %s %s",
- PYTHONPATH, PYTHON, verbose ? "" : "2> /dev/null") < 0)
+ PYTHONPATH, PYTHON, verbose > 0 ? "" : "2> /dev/null") < 0)
return -1;
ret = system(cmd) ? -1 : 0;
TEST_ASSERT_VAL("failed to allocate thread_map",
threads);
- if (verbose)
+ if (verbose > 0)
thread_map__fprintf(threads, stderr);
TEST_ASSERT_VAL("failed to remove thread",
TEST_ASSERT_VAL("thread_map count != 1", threads->nr == 1);
- if (verbose)
+ if (verbose > 0)
thread_map__fprintf(threads, stderr);
TEST_ASSERT_VAL("failed to remove thread",
TEST_ASSERT_VAL("thread_map count != 0", threads->nr == 0);
- if (verbose)
+ if (verbose > 0)
thread_map__fprintf(threads, stderr);
TEST_ASSERT_VAL("failed to not remove thread",
session = perf_session__new(&file, false, NULL);
TEST_ASSERT_VAL("can't get session", session);
- for (i = 0; i < session->header.env.nr_cpus_online; i++) {
+ for (i = 0; i < session->header.env.nr_cpus_avail; i++) {
+ if (!cpu_map__has(map, i))
+ continue;
pr_debug("CPU %d, core %d, socket %d\n", i,
session->header.env.cpu[i].core_id,
session->header.env.cpu[i].socket_id);
err = -1;
}
- if (!verbose)
+ if (verbose <= 0)
goto out;
header_printed = false;
if (ui_browser__show(&browser->b, browser->map->dso->long_name,
"Press ESC to exit, %s / to search",
- verbose ? "" : "restart with -v to use") < 0)
+ verbose > 0 ? "" : "restart with -v to use") < 0)
return -1;
while (1) {
switch (key) {
case '/':
- if (verbose)
+ if (verbose > 0)
map_browser__search(browser);
default:
break;
if (maxaddr < pos->end)
maxaddr = pos->end;
- if (verbose) {
+ if (verbose > 0) {
u32 *idx = symbol__browser_index(pos);
*idx = mb.b.nr_entries;
}
ret += fmt->width(fmt, &dummy_hpp, hists);
}
- if (verbose && hists__has(hists, sym)) /* Addr + origin */
+ if (verbose > 0 && hists__has(hists, sym)) /* Addr + origin */
ret += 3 + BITS_PER_LONG / 4;
return ret;
printf("%-*.*s----\n",
graph_dotted_len, graph_dotted_len, graph_dotted_line);
- if (verbose)
+ if (verbose > 0)
symbol__annotate_hits(sym, evsel);
list_for_each_entry(pos, ¬es->src->source, node) {
#include "asm/bug.h"
static int max_cpu_num;
+static int max_present_cpu_num;
static int max_node_num;
static int *cpunode_map;
/* set up default */
max_cpu_num = 4096;
+ max_present_cpu_num = 4096;
mnt = sysfs__mountpoint();
if (!mnt)
}
ret = get_max_num(path, &max_cpu_num);
+ if (ret)
+ goto out;
+
+ /* get the highest present cpu number for a sparse allocation */
+ ret = snprintf(path, PATH_MAX, "%s/devices/system/cpu/present", mnt);
+ if (ret == PATH_MAX) {
+ pr_err("sysfs path crossed PATH_MAX(%d) size\n", PATH_MAX);
+ goto out;
+ }
+
+ ret = get_max_num(path, &max_present_cpu_num);
out:
if (ret)
return max_cpu_num;
}
+/*
+ * Return max_present_cpu_num, calling set_max_cpu_num() first when the
+ * value is still zero.
+ */
+int cpu__max_present_cpu(void)
+{
+	/* Zero means the value has not been populated yet. */
+	if (unlikely(max_present_cpu_num == 0))
+		set_max_cpu_num();
+
+	return max_present_cpu_num;
+}
+
+
int cpu__get_node(int cpu)
{
if (unlikely(cpunode_map == NULL)) {
int cpu__max_node(void);
int cpu__max_cpu(void);
+int cpu__max_present_cpu(void);
int cpu__get_node(int cpu);
int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res,
v = (v < 0) || (v > 10) ? 0 : v;
}
+ if (quiet)
+ v = -1;
+
*var->ptr = v;
free(s);
return 0;
}
+/*
+ * Handle the quiet option: set every variable listed in debug_variables
+ * to -1 and raise the 'quiet' flag. Always returns 0.
+ */
+int perf_quiet_option(void)
+{
+	struct debug_variable *dv;
+
+	/*
+	 * disable all debug messages; the walk stops at the first entry
+	 * whose name is NULL.
+	 */
+	for (dv = &debug_variables[0]; dv->name; dv++)
+		*dv->ptr = -1;
+
+	quiet = true;
+	return 0;
+}
+
#define DEBUG_WRAPPER(__n, __l) \
static int pr_ ## __n ## _wrapper(const char *fmt, ...) \
{ \
int perf_debug_option(const char *str);
void perf_debug_setup(void);
+int perf_quiet_option(void);
#endif /* __PERF_DEBUG_H */
{
if (!dso)
return strlen("[unknown]");
- if (verbose)
+ if (verbose > 0)
return dso->long_name_len;
return dso->short_name_len;
return 0;
if (env->nr_cpus_avail == 0)
- env->nr_cpus_avail = sysconf(_SC_NPROCESSORS_CONF);
+ env->nr_cpus_avail = cpu__max_present_cpu();
nr_cpus = env->nr_cpus_avail;
if (nr_cpus == -1)
u32 nrc, nra;
int ret;
- nr = sysconf(_SC_NPROCESSORS_CONF);
- if (nr < 0)
- return -1;
-
- nrc = (u32)(nr & UINT_MAX);
+ nrc = cpu__max_present_cpu();
nr = sysconf(_SC_NPROCESSORS_ONLN);
if (nr < 0)
static struct cpu_topo *build_cpu_topology(void)
{
- struct cpu_topo *tp;
+ struct cpu_topo *tp = NULL;
void *addr;
u32 nr, i;
size_t sz;
long ncpus;
int ret = -1;
+ struct cpu_map *map;
- ncpus = sysconf(_SC_NPROCESSORS_CONF);
- if (ncpus < 0)
+ ncpus = cpu__max_present_cpu();
+
+ /* build online CPU map */
+ map = cpu_map__new(NULL);
+ if (map == NULL) {
+ pr_debug("failed to get system cpumap\n");
return NULL;
+ }
nr = (u32)(ncpus & UINT_MAX);
sz = nr * sizeof(char *);
-
addr = calloc(1, sizeof(*tp) + 2 * sz);
if (!addr)
- return NULL;
+ goto out_free;
tp = addr;
tp->cpu_nr = nr;
tp->thread_siblings = addr;
for (i = 0; i < nr; i++) {
+ if (!cpu_map__has(map, i))
+ continue;
+
ret = build_cpu_topo(tp, i);
if (ret < 0)
break;
}
+
+out_free:
+ cpu_map__put(map);
if (ret) {
free_cpu_topo(tp);
tp = NULL;
{
int nr, i;
char *str;
- int cpu_nr = ph->env.nr_cpus_online;
+ int cpu_nr = ph->env.nr_cpus_avail;
nr = ph->env.nr_sibling_cores;
str = ph->env.sibling_cores;
u32 nr, i;
char *str;
struct strbuf sb;
- int cpu_nr = ph->env.nr_cpus_online;
+ int cpu_nr = ph->env.nr_cpus_avail;
u64 size = 0;
ph->env.cpu = calloc(cpu_nr, sizeof(*ph->env.cpu));
if (ph->needs_swap)
nr = bswap_32(nr);
- if (nr > (u32)cpu_nr) {
+ if (nr != (u32)-1 && nr > (u32)cpu_nr) {
pr_debug("socket_id number is too big."
"You may need to upgrade the perf tool.\n");
goto free_cpu;
*/
if (h->ms.sym) {
symlen = h->ms.sym->namelen + 4;
- if (verbose)
+ if (verbose > 0)
symlen += BITS_PER_LONG / 4 + 2 + 3;
hists__new_col_len(hists, HISTC_SYMBOL, symlen);
} else {
if (h->branch_info) {
if (h->branch_info->from.sym) {
symlen = (int)h->branch_info->from.sym->namelen + 4;
- if (verbose)
+ if (verbose > 0)
symlen += BITS_PER_LONG / 4 + 2 + 3;
hists__new_col_len(hists, HISTC_SYMBOL_FROM, symlen);
if (h->branch_info->to.sym) {
symlen = (int)h->branch_info->to.sym->namelen + 4;
- if (verbose)
+ if (verbose > 0)
symlen += BITS_PER_LONG / 4 + 2 + 3;
hists__new_col_len(hists, HISTC_SYMBOL_TO, symlen);
return term->type_term != PARSE_EVENTS__TERM_TYPE_USER;
}
-static int new_term(struct parse_events_term **_term, int type_val,
- int type_term, char *config,
- char *str, u64 num, int err_term, int err_val)
+static int new_term(struct parse_events_term **_term,
+ struct parse_events_term *temp,
+ char *str, u64 num)
{
struct parse_events_term *term;
- term = zalloc(sizeof(*term));
+ term = malloc(sizeof(*term));
if (!term)
return -ENOMEM;
+ *term = *temp;
INIT_LIST_HEAD(&term->list);
- term->type_val = type_val;
- term->type_term = type_term;
- term->config = config;
- term->err_term = err_term;
- term->err_val = err_val;
- switch (type_val) {
+ switch (term->type_val) {
case PARSE_EVENTS__TERM_TYPE_NUM:
term->val.num = num;
break;
int parse_events_term__num(struct parse_events_term **term,
int type_term, char *config, u64 num,
+ bool no_value,
void *loc_term_, void *loc_val_)
{
YYLTYPE *loc_term = loc_term_;
YYLTYPE *loc_val = loc_val_;
- return new_term(term, PARSE_EVENTS__TERM_TYPE_NUM, type_term,
- config, NULL, num,
- loc_term ? loc_term->first_column : 0,
- loc_val ? loc_val->first_column : 0);
+ struct parse_events_term temp = {
+ .type_val = PARSE_EVENTS__TERM_TYPE_NUM,
+ .type_term = type_term,
+ .config = config,
+ .no_value = no_value,
+ .err_term = loc_term ? loc_term->first_column : 0,
+ .err_val = loc_val ? loc_val->first_column : 0,
+ };
+
+ return new_term(term, &temp, NULL, num);
}
int parse_events_term__str(struct parse_events_term **term,
YYLTYPE *loc_term = loc_term_;
YYLTYPE *loc_val = loc_val_;
- return new_term(term, PARSE_EVENTS__TERM_TYPE_STR, type_term,
- config, str, 0,
- loc_term ? loc_term->first_column : 0,
- loc_val ? loc_val->first_column : 0);
+ struct parse_events_term temp = {
+ .type_val = PARSE_EVENTS__TERM_TYPE_STR,
+ .type_term = type_term,
+ .config = config,
+ .err_term = loc_term ? loc_term->first_column : 0,
+ .err_val = loc_val ? loc_val->first_column : 0,
+ };
+
+ return new_term(term, &temp, str, 0);
}
int parse_events_term__sym_hw(struct parse_events_term **term,
char *config, unsigned idx)
{
struct event_symbol *sym;
+ struct parse_events_term temp = {
+ .type_val = PARSE_EVENTS__TERM_TYPE_STR,
+ .type_term = PARSE_EVENTS__TERM_TYPE_USER,
+ .config = config ?: (char *) "event",
+ };
BUG_ON(idx >= PERF_COUNT_HW_MAX);
sym = &event_symbols_hw[idx];
- if (config)
- return new_term(term, PARSE_EVENTS__TERM_TYPE_STR,
- PARSE_EVENTS__TERM_TYPE_USER, config,
- (char *) sym->symbol, 0, 0, 0);
- else
- return new_term(term, PARSE_EVENTS__TERM_TYPE_STR,
- PARSE_EVENTS__TERM_TYPE_USER,
- (char *) "event", (char *) sym->symbol,
- 0, 0, 0);
+ return new_term(term, &temp, (char *) sym->symbol, 0);
}
int parse_events_term__clone(struct parse_events_term **new,
struct parse_events_term *term)
{
- return new_term(new, term->type_val, term->type_term, term->config,
- term->val.str, term->val.num,
- term->err_term, term->err_val);
+ struct parse_events_term temp = {
+ .type_val = term->type_val,
+ .type_term = term->type_term,
+ .config = term->config,
+ .err_term = term->err_term,
+ .err_val = term->err_val,
+ };
+
+ return new_term(new, &temp, term->val.str, term->val.num);
}
void parse_events_terms__purge(struct list_head *terms)
int type_term;
struct list_head list;
bool used;
+ bool no_value;
/* error string indexes for within parsed string */
int err_term;
int parse_events__is_hardcoded_term(struct parse_events_term *term);
int parse_events_term__num(struct parse_events_term **term,
int type_term, char *config, u64 num,
+ bool novalue,
void *loc_term, void *loc_val);
int parse_events_term__str(struct parse_events_term **term,
int type_term, char *config, char *str,
if (!strcasecmp(alias->name, $1)) {
ALLOC_LIST(head);
ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER,
- $1, 1, &@1, NULL));
+ $1, 1, false, &@1, NULL));
list_add_tail(&term->list, head);
if (!parse_events_add_pmu(data, list,
ALLOC_LIST(head);
ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER,
- &pmu_name, 1, &@1, NULL));
+ &pmu_name, 1, false, &@1, NULL));
list_add_tail(&term->list, head);
ALLOC_LIST(list);
struct parse_events_term *term;
ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER,
- $1, $3, &@1, &@3));
+ $1, $3, false, &@1, &@3));
$$ = term;
}
|
struct parse_events_term *term;
ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER,
- $1, 1, &@1, NULL));
+ $1, 1, true, &@1, NULL));
$$ = term;
}
|
{
struct parse_events_term *term;
- ABORT_ON(parse_events_term__num(&term, (int)$1, NULL, $3, &@1, &@3));
+ ABORT_ON(parse_events_term__num(&term, (int)$1, NULL, $3, false, &@1, &@3));
$$ = term;
}
|
{
struct parse_events_term *term;
- ABORT_ON(parse_events_term__num(&term, (int)$1, NULL, 1, &@1, NULL));
+ ABORT_ON(parse_events_term__num(&term, (int)$1, NULL, 1, true, &@1, NULL));
$$ = term;
}
|
struct parse_events_term *term;
ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER,
- $1, $4, &@1, &@4));
+ $1, $4, false, &@1, &@4));
term->array = $2;
$$ = term;
}
}
}
- if (verbose)
+ if (verbose > 0)
printf("Required parameter '%s' not specified\n", term->config);
return -1;
format = pmu_find_format(formats, term->config);
if (!format) {
- if (verbose)
+ if (verbose > 0)
printf("Invalid event/parameter '%s'\n", term->config);
if (err) {
char *pmu_term = pmu_formats_string(formats);
* Either directly use a numeric term, or try to translate string terms
* using event parameters.
*/
- if (term->type_val == PARSE_EVENTS__TERM_TYPE_NUM)
+ if (term->type_val == PARSE_EVENTS__TERM_TYPE_NUM) {
+ if (term->no_value &&
+ bitmap_weight(format->bits, PERF_PMU_FORMAT_BITS) > 1) {
+ if (err) {
+ err->idx = term->err_val;
+ err->str = strdup("no value assigned for term");
+ }
+ return -EINVAL;
+ }
+
val = term->val.num;
- else if (term->type_val == PARSE_EVENTS__TERM_TYPE_STR) {
+ } else if (term->type_val == PARSE_EVENTS__TERM_TYPE_STR) {
if (strcmp(term->val.str, "?")) {
- if (verbose) {
+ if (verbose > 0) {
pr_info("Invalid sysfs entry %s=%s\n",
term->config, term->val.str);
}
printf("%*s", 8, "[");
wordwrap(aliases[j].desc, 8, columns, 0);
printf("]\n");
- if (verbose)
+ if (verbose > 0)
printf("%*s%s/%s/\n", 8, "", aliases[j].pmu, aliases[j].str);
} else
printf(" %-50s [Kernel PMU event]\n", aliases[j].name);
pr_debug("try to find information at %" PRIx64 " in %s\n", addr,
tp->module ? : "kernel");
- dinfo = debuginfo_cache__open(tp->module, verbose == 0);
+ dinfo = debuginfo_cache__open(tp->module, verbose <= 0);
if (dinfo)
ret = debuginfo__find_probe_point(dinfo,
(unsigned long)addr, pp);
if (node->map) {
struct map *map = node->map;
const char *dsoname = "[unknown]";
- if (map && map->dso && (map->dso->name || map->dso->long_name)) {
+ if (map && map->dso) {
if (symbol_conf.show_kernel_path && map->dso->long_name)
dsoname = map->dso->long_name;
- else if (map->dso->name)
+ else
dsoname = map->dso->name;
}
pydict_set_item_string_decref(pyelem, "dso",
printf("..... %2"PRIu64": %016" PRIx64 " -> %016" PRIx64 " %hu cycles %s%s%s%s %x\n",
i, e->from, e->to,
- e->flags.cycles,
+ (unsigned short)e->flags.cycles,
e->flags.mispred ? "M" : " ",
e->flags.predicted ? "P" : " ",
e->flags.abort ? "A" : " ",
#!/usr/bin/python2
-from distutils.core import setup, Extension
from os import getenv
+cc = getenv("CC")
+if cc == "clang":
+ from _sysconfigdata import build_time_vars
+ from re import sub
+ build_time_vars["CFLAGS"] = sub("-specs=[^ ]+", "", build_time_vars["CFLAGS"])
+
+from distutils.core import setup, Extension
+
from distutils.command.build_ext import build_ext as _build_ext
from distutils.command.install_lib import install_lib as _install_lib
if (!dso_l || !dso_r)
return cmp_null(dso_r, dso_l);
- if (verbose) {
+ if (verbose > 0) {
dso_name_l = dso_l->long_name;
dso_name_r = dso_r->long_name;
} else {
size_t size, unsigned int width)
{
if (map && map->dso) {
- const char *dso_name = !verbose ? map->dso->short_name :
- map->dso->long_name;
+ const char *dso_name = verbose > 0 ? map->dso->long_name :
+ map->dso->short_name;
return repsep_snprintf(bf, size, "%-*.*s", width, width, dso_name);
}
{
size_t ret = 0;
- if (verbose) {
+ if (verbose > 0) {
char o = map ? dso__symtab_origin(map->dso) : '!';
ret += repsep_snprintf(bf, size, "%-#*llx %c ",
BITS_PER_LONG / 4 + 2, ip, o);
for (i = 0; i < 3; i++)
update_stats(&ps->res_stats[i], count[i]);
- if (verbose) {
+ if (verbose > 0) {
fprintf(config->output, "%s: %" PRIu64 " %" PRIu64 " %" PRIu64 "\n",
perf_evsel__name(counter), count[0], count[1], count[2]);
}
static char *demangle_sym(struct dso *dso, int kmodule, const char *elf_name)
{
- int demangle_flags = verbose ? (DMGL_PARAMS | DMGL_ANSI) : DMGL_NO_OPTS;
+ int demangle_flags = verbose > 0 ? (DMGL_PARAMS | DMGL_ANSI) : DMGL_NO_OPTS;
char *demangled = NULL;
/*