--- /dev/null
-#define local_save_flags(flags) do { typecheck(unsigned long, flags); \
+ #ifndef __UM_SYSTEM_GENERIC_H
+ #define __UM_SYSTEM_GENERIC_H
+
+ #include "sysdep/system.h"
+
+ extern void *switch_to(void *prev, void *next, void *last);
+
+ extern int get_signals(void);
+ extern int set_signals(int enable);
+ extern int get_signals(void);
+ extern void block_signals(void);
+ extern void unblock_signals(void);
+
-#define local_irq_restore(flags) do { typecheck(unsigned long, flags); \
++#define raw_local_save_flags(flags) do { typecheck(unsigned long, flags); \
+ (flags) = get_signals(); } while(0)
-#define local_irq_save(flags) do { local_save_flags(flags); \
- local_irq_disable(); } while(0)
++#define raw_local_irq_restore(flags) do { typecheck(unsigned long, flags); \
+ set_signals(flags); } while(0)
+
-#define local_irq_enable() unblock_signals()
-#define local_irq_disable() block_signals()
++#define raw_local_irq_save(flags) do { raw_local_save_flags(flags); \
++ raw_local_irq_disable(); } while(0)
+
- local_save_flags(flags); \
++#define raw_local_irq_enable() unblock_signals()
++#define raw_local_irq_disable() block_signals()
+
+ #define irqs_disabled() \
+ ({ \
+ unsigned long flags; \
++ raw_local_save_flags(flags); \
+ (flags == 0); \
+ })
+
+ extern void *_switch_to(void *prev, void *next, void *last);
+ #define switch_to(prev, next, last) prev = _switch_to(prev, next, last)
+
+ #endif
--- /dev/null
+ #ifndef _ASM_X86_UACCESS_H
+ #define _ASM_X86_UACCESS_H
+ /*
+ * User space memory access functions
+ */
+ #include <linux/errno.h>
+ #include <linux/compiler.h>
+ #include <linux/thread_info.h>
+ #include <linux/prefetch.h>
+ #include <linux/string.h>
+ #include <asm/asm.h>
+ #include <asm/page.h>
+
+ #define VERIFY_READ 0
+ #define VERIFY_WRITE 1
+
+ /*
+ * The fs value determines whether argument validity checking should be
+ * performed or not. If get_fs() == USER_DS, checking is performed, with
+ * get_fs() == KERNEL_DS, checking is bypassed.
+ *
+ * For historical reasons, these macros are grossly misnamed.
+ */
+
+ #define MAKE_MM_SEG(s) ((mm_segment_t) { (s) })
+
+ #define KERNEL_DS MAKE_MM_SEG(-1UL)
+ #define USER_DS MAKE_MM_SEG(PAGE_OFFSET)
+
+ #define get_ds() (KERNEL_DS)
+ #define get_fs() (current_thread_info()->addr_limit)
+ #define set_fs(x) (current_thread_info()->addr_limit = (x))
+
+ #define segment_eq(a, b) ((a).seg == (b).seg)
+
+ #define __addr_ok(addr) \
+ ((unsigned long __force)(addr) < \
+ (current_thread_info()->addr_limit.seg))
+
+ /*
+ * Test whether a block of memory is a valid user space address.
+ * Returns 0 if the range is valid, nonzero otherwise.
+ *
+ * This is equivalent to the following test:
+ * (u33)addr + (u33)size >= (u33)current->addr_limit.seg (u65 for x86_64)
+ *
+ * This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry...
+ */
+
+ #define __range_not_ok(addr, size) \
+ ({ \
+ unsigned long flag, roksum; \
+ __chk_user_ptr(addr); \
+ asm("add %3,%1 ; sbb %0,%0 ; cmp %1,%4 ; sbb $0,%0" \
+ : "=&r" (flag), "=r" (roksum) \
+ : "1" (addr), "g" ((long)(size)), \
+ "rm" (current_thread_info()->addr_limit.seg)); \
+ flag; \
+ })
+
+ /**
+ * access_ok: - Checks if a user space pointer is valid
+ * @type: Type of access: %VERIFY_READ or %VERIFY_WRITE. Note that
+ * %VERIFY_WRITE is a superset of %VERIFY_READ - if it is safe
+ * to write to a block, it is always safe to read from it.
+ * @addr: User space pointer to start of block to check
+ * @size: Size of block to check
+ *
+ * Context: User context only. This function may sleep.
+ *
+ * Checks if a pointer to a block of memory in user space is valid.
+ *
+ * Returns true (nonzero) if the memory block may be valid, false (zero)
+ * if it is definitely invalid.
+ *
+ * Note that, depending on architecture, this function probably just
+ * checks that the pointer is in the user space range - after calling
+ * this function, memory access functions may still return -EFAULT.
+ */
+ #define access_ok(type, addr, size) (likely(__range_not_ok(addr, size) == 0))
+
+ /*
+ * The exception table consists of pairs of addresses: the first is the
+ * address of an instruction that is allowed to fault, and the second is
+ * the address at which the program should continue. No registers are
+ * modified, so it is entirely up to the continuation code to figure out
+ * what to do.
+ *
+ * All the routines below use bits of fixup code that are out of line
+ * with the main instruction path. This means when everything is well,
+ * we don't even have to jump over them. Further, they do not intrude
+ * on our cache or tlb entries.
+ */
+
+ struct exception_table_entry {
+ unsigned long insn, fixup;
+ };
+
+ extern int fixup_exception(struct pt_regs *regs);
+
+ /*
+ * These are the main single-value transfer routines. They automatically
+ * use the right size if we just have the right pointer type.
+ *
+ * This gets kind of ugly. We want to return _two_ values in "get_user()"
+ * and yet we don't want to do any pointers, because that is too much
+ * of a performance impact. Thus we have a few rather ugly macros here,
+ * and hide all the ugliness from the user.
+ *
+ * The "__xxx" versions of the user access functions are versions that
+ * do not verify the address space, that must have been done previously
+ * with a separate "access_ok()" call (this is used when we do multiple
+ * accesses to the same area of user memory).
+ */
+
+ extern int __get_user_1(void);
+ extern int __get_user_2(void);
+ extern int __get_user_4(void);
+ extern int __get_user_8(void);
+ extern int __get_user_bad(void);
+
+ #define __get_user_x(size, ret, x, ptr) \
+ asm volatile("call __get_user_" #size \
+ : "=a" (ret),"=d" (x) \
+ : "0" (ptr)) \
+
+ /* Careful: we have to cast the result to the type of the pointer
+ * for sign reasons */
+
+ /**
+ * get_user: - Get a simple variable from user space.
+ * @x: Variable to store result.
+ * @ptr: Source address, in user space.
+ *
+ * Context: User context only. This function may sleep.
+ *
+ * This macro copies a single simple variable from user space to kernel
+ * space. It supports simple types like char and int, but not larger
+ * data types like structures or arrays.
+ *
+ * @ptr must have pointer-to-simple-variable type, and the result of
+ * dereferencing @ptr must be assignable to @x without a cast.
+ *
+ * Returns zero on success, or -EFAULT on error.
+ * On error, the variable @x is set to zero.
+ */
+ #ifdef CONFIG_X86_32
+ #define __get_user_8(__ret_gu, __val_gu, ptr) \
+ __get_user_x(X, __ret_gu, __val_gu, ptr)
+ #else
+ #define __get_user_8(__ret_gu, __val_gu, ptr) \
+ __get_user_x(8, __ret_gu, __val_gu, ptr)
+ #endif
+
+ #define get_user(x, ptr) \
+ ({ \
+ int __ret_gu; \
+ unsigned long __val_gu; \
+ __chk_user_ptr(ptr); \
++ might_fault(); \
+ switch (sizeof(*(ptr))) { \
+ case 1: \
+ __get_user_x(1, __ret_gu, __val_gu, ptr); \
+ break; \
+ case 2: \
+ __get_user_x(2, __ret_gu, __val_gu, ptr); \
+ break; \
+ case 4: \
+ __get_user_x(4, __ret_gu, __val_gu, ptr); \
+ break; \
+ case 8: \
+ __get_user_8(__ret_gu, __val_gu, ptr); \
+ break; \
+ default: \
+ __get_user_x(X, __ret_gu, __val_gu, ptr); \
+ break; \
+ } \
+ (x) = (__typeof__(*(ptr)))__val_gu; \
+ __ret_gu; \
+ })
+
+ #define __put_user_x(size, x, ptr, __ret_pu) \
+ asm volatile("call __put_user_" #size : "=a" (__ret_pu) \
+ :"0" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx")
+
+
+
+ #ifdef CONFIG_X86_32
+ #define __put_user_u64(x, addr, err) \
+ asm volatile("1: movl %%eax,0(%2)\n" \
+ "2: movl %%edx,4(%2)\n" \
+ "3:\n" \
+ ".section .fixup,\"ax\"\n" \
+ "4: movl %3,%0\n" \
+ " jmp 3b\n" \
+ ".previous\n" \
+ _ASM_EXTABLE(1b, 4b) \
+ _ASM_EXTABLE(2b, 4b) \
+ : "=r" (err) \
+ : "A" (x), "r" (addr), "i" (-EFAULT), "0" (err))
+
+ #define __put_user_x8(x, ptr, __ret_pu) \
+ asm volatile("call __put_user_8" : "=a" (__ret_pu) \
+ : "A" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx")
+ #else
+ #define __put_user_u64(x, ptr, retval) \
+ __put_user_asm(x, ptr, retval, "q", "", "Zr", -EFAULT)
+ #define __put_user_x8(x, ptr, __ret_pu) __put_user_x(8, x, ptr, __ret_pu)
+ #endif
+
+ extern void __put_user_bad(void);
+
+ /*
+ * Strange magic calling convention: pointer in %ecx,
+ * value in %eax(:%edx), return value in %eax. clobbers %rbx
+ */
+ extern void __put_user_1(void);
+ extern void __put_user_2(void);
+ extern void __put_user_4(void);
+ extern void __put_user_8(void);
+
+ #ifdef CONFIG_X86_WP_WORKS_OK
+
+ /**
+ * put_user: - Write a simple value into user space.
+ * @x: Value to copy to user space.
+ * @ptr: Destination address, in user space.
+ *
+ * Context: User context only. This function may sleep.
+ *
+ * This macro copies a single simple value from kernel space to user
+ * space. It supports simple types like char and int, but not larger
+ * data types like structures or arrays.
+ *
+ * @ptr must have pointer-to-simple-variable type, and @x must be assignable
+ * to the result of dereferencing @ptr.
+ *
+ * Returns zero on success, or -EFAULT on error.
+ */
+ #define put_user(x, ptr) \
+ ({ \
+ int __ret_pu; \
+ __typeof__(*(ptr)) __pu_val; \
+ __chk_user_ptr(ptr); \
++ might_fault(); \
+ __pu_val = x; \
+ switch (sizeof(*(ptr))) { \
+ case 1: \
+ __put_user_x(1, __pu_val, ptr, __ret_pu); \
+ break; \
+ case 2: \
+ __put_user_x(2, __pu_val, ptr, __ret_pu); \
+ break; \
+ case 4: \
+ __put_user_x(4, __pu_val, ptr, __ret_pu); \
+ break; \
+ case 8: \
+ __put_user_x8(__pu_val, ptr, __ret_pu); \
+ break; \
+ default: \
+ __put_user_x(X, __pu_val, ptr, __ret_pu); \
+ break; \
+ } \
+ __ret_pu; \
+ })
+
+ #define __put_user_size(x, ptr, size, retval, errret) \
+ do { \
+ retval = 0; \
+ __chk_user_ptr(ptr); \
+ switch (size) { \
+ case 1: \
+ __put_user_asm(x, ptr, retval, "b", "b", "iq", errret); \
+ break; \
+ case 2: \
+ __put_user_asm(x, ptr, retval, "w", "w", "ir", errret); \
+ break; \
+ case 4: \
+ __put_user_asm(x, ptr, retval, "l", "k", "ir", errret);\
+ break; \
+ case 8: \
+ __put_user_u64((__typeof__(*ptr))(x), ptr, retval); \
+ break; \
+ default: \
+ __put_user_bad(); \
+ } \
+ } while (0)
+
+ #else
+
+ #define __put_user_size(x, ptr, size, retval, errret) \
+ do { \
+ __typeof__(*(ptr))__pus_tmp = x; \
+ retval = 0; \
+ \
+ if (unlikely(__copy_to_user_ll(ptr, &__pus_tmp, size) != 0)) \
+ retval = errret; \
+ } while (0)
+
+ #define put_user(x, ptr) \
+ ({ \
+ int __ret_pu; \
+ __typeof__(*(ptr))__pus_tmp = x; \
+ __ret_pu = 0; \
+ if (unlikely(__copy_to_user_ll(ptr, &__pus_tmp, \
+ sizeof(*(ptr))) != 0)) \
+ __ret_pu = -EFAULT; \
+ __ret_pu; \
+ })
+ #endif
+
+ #ifdef CONFIG_X86_32
+ #define __get_user_asm_u64(x, ptr, retval, errret) (x) = __get_user_bad()
+ #else
+ #define __get_user_asm_u64(x, ptr, retval, errret) \
+ __get_user_asm(x, ptr, retval, "q", "", "=r", errret)
+ #endif
+
+ #define __get_user_size(x, ptr, size, retval, errret) \
+ do { \
+ retval = 0; \
+ __chk_user_ptr(ptr); \
+ switch (size) { \
+ case 1: \
+ __get_user_asm(x, ptr, retval, "b", "b", "=q", errret); \
+ break; \
+ case 2: \
+ __get_user_asm(x, ptr, retval, "w", "w", "=r", errret); \
+ break; \
+ case 4: \
+ __get_user_asm(x, ptr, retval, "l", "k", "=r", errret); \
+ break; \
+ case 8: \
+ __get_user_asm_u64(x, ptr, retval, errret); \
+ break; \
+ default: \
+ (x) = __get_user_bad(); \
+ } \
+ } while (0)
+
+ #define __get_user_asm(x, addr, err, itype, rtype, ltype, errret) \
+ asm volatile("1: mov"itype" %2,%"rtype"1\n" \
+ "2:\n" \
+ ".section .fixup,\"ax\"\n" \
+ "3: mov %3,%0\n" \
+ " xor"itype" %"rtype"1,%"rtype"1\n" \
+ " jmp 2b\n" \
+ ".previous\n" \
+ _ASM_EXTABLE(1b, 3b) \
+ : "=r" (err), ltype(x) \
+ : "m" (__m(addr)), "i" (errret), "0" (err))
+
+ #define __put_user_nocheck(x, ptr, size) \
+ ({ \
+ long __pu_err; \
+ __put_user_size((x), (ptr), (size), __pu_err, -EFAULT); \
+ __pu_err; \
+ })
+
+ #define __get_user_nocheck(x, ptr, size) \
+ ({ \
+ long __gu_err; \
+ unsigned long __gu_val; \
+ __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \
+ (x) = (__force __typeof__(*(ptr)))__gu_val; \
+ __gu_err; \
+ })
+
+ /* FIXME: this hack is definitely wrong -AK */
+ struct __large_struct { unsigned long buf[100]; };
+ #define __m(x) (*(struct __large_struct __user *)(x))
+
+ /*
+ * Tell gcc we read from memory instead of writing: this is because
+ * we do not write to any memory gcc knows about, so there are no
+ * aliasing issues.
+ */
+ #define __put_user_asm(x, addr, err, itype, rtype, ltype, errret) \
+ asm volatile("1: mov"itype" %"rtype"1,%2\n" \
+ "2:\n" \
+ ".section .fixup,\"ax\"\n" \
+ "3: mov %3,%0\n" \
+ " jmp 2b\n" \
+ ".previous\n" \
+ _ASM_EXTABLE(1b, 3b) \
+ : "=r"(err) \
+ : ltype(x), "m" (__m(addr)), "i" (errret), "0" (err))
+ /**
+ * __get_user: - Get a simple variable from user space, with less checking.
+ * @x: Variable to store result.
+ * @ptr: Source address, in user space.
+ *
+ * Context: User context only. This function may sleep.
+ *
+ * This macro copies a single simple variable from user space to kernel
+ * space. It supports simple types like char and int, but not larger
+ * data types like structures or arrays.
+ *
+ * @ptr must have pointer-to-simple-variable type, and the result of
+ * dereferencing @ptr must be assignable to @x without a cast.
+ *
+ * Caller must check the pointer with access_ok() before calling this
+ * function.
+ *
+ * Returns zero on success, or -EFAULT on error.
+ * On error, the variable @x is set to zero.
+ */
+
+ #define __get_user(x, ptr) \
+ __get_user_nocheck((x), (ptr), sizeof(*(ptr)))
+ /**
+ * __put_user: - Write a simple value into user space, with less checking.
+ * @x: Value to copy to user space.
+ * @ptr: Destination address, in user space.
+ *
+ * Context: User context only. This function may sleep.
+ *
+ * This macro copies a single simple value from kernel space to user
+ * space. It supports simple types like char and int, but not larger
+ * data types like structures or arrays.
+ *
+ * @ptr must have pointer-to-simple-variable type, and @x must be assignable
+ * to the result of dereferencing @ptr.
+ *
+ * Caller must check the pointer with access_ok() before calling this
+ * function.
+ *
+ * Returns zero on success, or -EFAULT on error.
+ */
+
+ #define __put_user(x, ptr) \
+ __put_user_nocheck((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)))
+
+ #define __get_user_unaligned __get_user
+ #define __put_user_unaligned __put_user
+
+ /*
+ * movsl can be slow when source and dest are not both 8-byte aligned
+ */
+ #ifdef CONFIG_X86_INTEL_USERCOPY
+ extern struct movsl_mask {
+ int mask;
+ } ____cacheline_aligned_in_smp movsl_mask;
+ #endif
+
+ #define ARCH_HAS_NOCACHE_UACCESS 1
+
+ #ifdef CONFIG_X86_32
+ # include "uaccess_32.h"
+ #else
+ # define ARCH_HAS_SEARCH_EXTABLE
+ # include "uaccess_64.h"
+ #endif
+
+ #endif /* _ASM_X86_UACCESS_H */
+
--- /dev/null
- might_sleep();
- return __copy_to_user_inatomic(to, from, n);
+ #ifndef _ASM_X86_UACCESS_32_H
+ #define _ASM_X86_UACCESS_32_H
+
+ /*
+ * User space memory access functions
+ */
+ #include <linux/errno.h>
+ #include <linux/thread_info.h>
+ #include <linux/prefetch.h>
+ #include <linux/string.h>
+ #include <asm/asm.h>
+ #include <asm/page.h>
+
+ unsigned long __must_check __copy_to_user_ll
+ (void __user *to, const void *from, unsigned long n);
+ unsigned long __must_check __copy_from_user_ll
+ (void *to, const void __user *from, unsigned long n);
+ unsigned long __must_check __copy_from_user_ll_nozero
+ (void *to, const void __user *from, unsigned long n);
+ unsigned long __must_check __copy_from_user_ll_nocache
+ (void *to, const void __user *from, unsigned long n);
+ unsigned long __must_check __copy_from_user_ll_nocache_nozero
+ (void *to, const void __user *from, unsigned long n);
+
+ /**
+ * __copy_to_user_inatomic: - Copy a block of data into user space, with less checking.
+ * @to: Destination address, in user space.
+ * @from: Source address, in kernel space.
+ * @n: Number of bytes to copy.
+ *
+ * Context: User context only.
+ *
+ * Copy data from kernel space to user space. Caller must check
+ * the specified block with access_ok() before calling this function.
+ * The caller should also make sure he pins the user space address
+ * so that the we don't result in page fault and sleep.
+ *
+ * Here we special-case 1, 2 and 4-byte copy_*_user invocations. On a fault
+ * we return the initial request size (1, 2 or 4), as copy_*_user should do.
+ * If a store crosses a page boundary and gets a fault, the x86 will not write
+ * anything, so this is accurate.
+ */
+
+ static __always_inline unsigned long __must_check
+ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
+ {
+ if (__builtin_constant_p(n)) {
+ unsigned long ret;
+
+ switch (n) {
+ case 1:
+ __put_user_size(*(u8 *)from, (u8 __user *)to,
+ 1, ret, 1);
+ return ret;
+ case 2:
+ __put_user_size(*(u16 *)from, (u16 __user *)to,
+ 2, ret, 2);
+ return ret;
+ case 4:
+ __put_user_size(*(u32 *)from, (u32 __user *)to,
+ 4, ret, 4);
+ return ret;
+ }
+ }
+ return __copy_to_user_ll(to, from, n);
+ }
+
+ /**
+ * __copy_to_user: - Copy a block of data into user space, with less checking.
+ * @to: Destination address, in user space.
+ * @from: Source address, in kernel space.
+ * @n: Number of bytes to copy.
+ *
+ * Context: User context only. This function may sleep.
+ *
+ * Copy data from kernel space to user space. Caller must check
+ * the specified block with access_ok() before calling this function.
+ *
+ * Returns number of bytes that could not be copied.
+ * On success, this will be zero.
+ */
+ static __always_inline unsigned long __must_check
+ __copy_to_user(void __user *to, const void *from, unsigned long n)
+ {
- might_sleep();
++ might_fault();
++ return __copy_to_user_inatomic(to, from, n);
+ }
+
+ static __always_inline unsigned long
+ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
+ {
+ /* Avoid zeroing the tail if the copy fails..
+ * If 'n' is constant and 1, 2, or 4, we do still zero on a failure,
+ * but as the zeroing behaviour is only significant when n is not
+ * constant, that shouldn't be a problem.
+ */
+ if (__builtin_constant_p(n)) {
+ unsigned long ret;
+
+ switch (n) {
+ case 1:
+ __get_user_size(*(u8 *)to, from, 1, ret, 1);
+ return ret;
+ case 2:
+ __get_user_size(*(u16 *)to, from, 2, ret, 2);
+ return ret;
+ case 4:
+ __get_user_size(*(u32 *)to, from, 4, ret, 4);
+ return ret;
+ }
+ }
+ return __copy_from_user_ll_nozero(to, from, n);
+ }
+
+ /**
+ * __copy_from_user: - Copy a block of data from user space, with less checking.
+ * @to: Destination address, in kernel space.
+ * @from: Source address, in user space.
+ * @n: Number of bytes to copy.
+ *
+ * Context: User context only. This function may sleep.
+ *
+ * Copy data from user space to kernel space. Caller must check
+ * the specified block with access_ok() before calling this function.
+ *
+ * Returns number of bytes that could not be copied.
+ * On success, this will be zero.
+ *
+ * If some data could not be copied, this function will pad the copied
+ * data to the requested size using zero bytes.
+ *
+ * An alternate version - __copy_from_user_inatomic() - may be called from
+ * atomic context and will fail rather than sleep. In this case the
+ * uncopied bytes will *NOT* be padded with zeros. See fs/filemap.h
+ * for explanation of why this is needed.
+ */
+ static __always_inline unsigned long
+ __copy_from_user(void *to, const void __user *from, unsigned long n)
+ {
- might_sleep();
++ might_fault();
+ if (__builtin_constant_p(n)) {
+ unsigned long ret;
+
+ switch (n) {
+ case 1:
+ __get_user_size(*(u8 *)to, from, 1, ret, 1);
+ return ret;
+ case 2:
+ __get_user_size(*(u16 *)to, from, 2, ret, 2);
+ return ret;
+ case 4:
+ __get_user_size(*(u32 *)to, from, 4, ret, 4);
+ return ret;
+ }
+ }
+ return __copy_from_user_ll(to, from, n);
+ }
+
+ static __always_inline unsigned long __copy_from_user_nocache(void *to,
+ const void __user *from, unsigned long n)
+ {
++ might_fault();
+ if (__builtin_constant_p(n)) {
+ unsigned long ret;
+
+ switch (n) {
+ case 1:
+ __get_user_size(*(u8 *)to, from, 1, ret, 1);
+ return ret;
+ case 2:
+ __get_user_size(*(u16 *)to, from, 2, ret, 2);
+ return ret;
+ case 4:
+ __get_user_size(*(u32 *)to, from, 4, ret, 4);
+ return ret;
+ }
+ }
+ return __copy_from_user_ll_nocache(to, from, n);
+ }
+
+ static __always_inline unsigned long
+ __copy_from_user_inatomic_nocache(void *to, const void __user *from,
+ unsigned long n)
+ {
+ return __copy_from_user_ll_nocache_nozero(to, from, n);
+ }
+
+ unsigned long __must_check copy_to_user(void __user *to,
+ const void *from, unsigned long n);
+ unsigned long __must_check copy_from_user(void *to,
+ const void __user *from,
+ unsigned long n);
+ long __must_check strncpy_from_user(char *dst, const char __user *src,
+ long count);
+ long __must_check __strncpy_from_user(char *dst,
+ const char __user *src, long count);
+
+ /**
+ * strlen_user: - Get the size of a string in user space.
+ * @str: The string to measure.
+ *
+ * Context: User context only. This function may sleep.
+ *
+ * Get the size of a NUL-terminated string in user space.
+ *
+ * Returns the size of the string INCLUDING the terminating NUL.
+ * On exception, returns 0.
+ *
+ * If there is a limit on the length of a valid string, you may wish to
+ * consider using strnlen_user() instead.
+ */
+ #define strlen_user(str) strnlen_user(str, LONG_MAX)
+
+ long strnlen_user(const char __user *str, long n);
+ unsigned long __must_check clear_user(void __user *mem, unsigned long len);
+ unsigned long __must_check __clear_user(void __user *mem, unsigned long len);
+
+ #endif /* _ASM_X86_UACCESS_32_H */
--- /dev/null
+ #ifndef _ASM_X86_UACCESS_64_H
+ #define _ASM_X86_UACCESS_64_H
+
+ /*
+ * User space memory access functions
+ */
+ #include <linux/compiler.h>
+ #include <linux/errno.h>
+ #include <linux/prefetch.h>
+ #include <linux/lockdep.h>
+ #include <asm/page.h>
+
+ /*
+ * Copy To/From Userspace
+ */
+
+ /* Handles exceptions in both to and from, but doesn't do access_ok */
+ __must_check unsigned long
+ copy_user_generic(void *to, const void *from, unsigned len);
+
+ __must_check unsigned long
+ copy_to_user(void __user *to, const void *from, unsigned len);
+ __must_check unsigned long
+ copy_from_user(void *to, const void __user *from, unsigned len);
+ __must_check unsigned long
+ copy_in_user(void __user *to, const void __user *from, unsigned len);
+
+ static __always_inline __must_check
+ int __copy_from_user(void *dst, const void __user *src, unsigned size)
+ {
+ int ret = 0;
++
++ might_fault();
+ if (!__builtin_constant_p(size))
+ return copy_user_generic(dst, (__force void *)src, size);
+ switch (size) {
+ case 1:__get_user_asm(*(u8 *)dst, (u8 __user *)src,
+ ret, "b", "b", "=q", 1);
+ return ret;
+ case 2:__get_user_asm(*(u16 *)dst, (u16 __user *)src,
+ ret, "w", "w", "=r", 2);
+ return ret;
+ case 4:__get_user_asm(*(u32 *)dst, (u32 __user *)src,
+ ret, "l", "k", "=r", 4);
+ return ret;
+ case 8:__get_user_asm(*(u64 *)dst, (u64 __user *)src,
+ ret, "q", "", "=r", 8);
+ return ret;
+ case 10:
+ __get_user_asm(*(u64 *)dst, (u64 __user *)src,
+ ret, "q", "", "=r", 16);
+ if (unlikely(ret))
+ return ret;
+ __get_user_asm(*(u16 *)(8 + (char *)dst),
+ (u16 __user *)(8 + (char __user *)src),
+ ret, "w", "w", "=r", 2);
+ return ret;
+ case 16:
+ __get_user_asm(*(u64 *)dst, (u64 __user *)src,
+ ret, "q", "", "=r", 16);
+ if (unlikely(ret))
+ return ret;
+ __get_user_asm(*(u64 *)(8 + (char *)dst),
+ (u64 __user *)(8 + (char __user *)src),
+ ret, "q", "", "=r", 8);
+ return ret;
+ default:
+ return copy_user_generic(dst, (__force void *)src, size);
+ }
+ }
+
+ static __always_inline __must_check
+ int __copy_to_user(void __user *dst, const void *src, unsigned size)
+ {
+ int ret = 0;
++
++ might_fault();
+ if (!__builtin_constant_p(size))
+ return copy_user_generic((__force void *)dst, src, size);
+ switch (size) {
+ case 1:__put_user_asm(*(u8 *)src, (u8 __user *)dst,
+ ret, "b", "b", "iq", 1);
+ return ret;
+ case 2:__put_user_asm(*(u16 *)src, (u16 __user *)dst,
+ ret, "w", "w", "ir", 2);
+ return ret;
+ case 4:__put_user_asm(*(u32 *)src, (u32 __user *)dst,
+ ret, "l", "k", "ir", 4);
+ return ret;
+ case 8:__put_user_asm(*(u64 *)src, (u64 __user *)dst,
+ ret, "q", "", "ir", 8);
+ return ret;
+ case 10:
+ __put_user_asm(*(u64 *)src, (u64 __user *)dst,
+ ret, "q", "", "ir", 10);
+ if (unlikely(ret))
+ return ret;
+ asm("":::"memory");
+ __put_user_asm(4[(u16 *)src], 4 + (u16 __user *)dst,
+ ret, "w", "w", "ir", 2);
+ return ret;
+ case 16:
+ __put_user_asm(*(u64 *)src, (u64 __user *)dst,
+ ret, "q", "", "ir", 16);
+ if (unlikely(ret))
+ return ret;
+ asm("":::"memory");
+ __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst,
+ ret, "q", "", "ir", 8);
+ return ret;
+ default:
+ return copy_user_generic((__force void *)dst, src, size);
+ }
+ }
+
+ static __always_inline __must_check
+ int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
+ {
+ int ret = 0;
++
++ might_fault();
+ if (!__builtin_constant_p(size))
+ return copy_user_generic((__force void *)dst,
+ (__force void *)src, size);
+ switch (size) {
+ case 1: {
+ u8 tmp;
+ __get_user_asm(tmp, (u8 __user *)src,
+ ret, "b", "b", "=q", 1);
+ if (likely(!ret))
+ __put_user_asm(tmp, (u8 __user *)dst,
+ ret, "b", "b", "iq", 1);
+ return ret;
+ }
+ case 2: {
+ u16 tmp;
+ __get_user_asm(tmp, (u16 __user *)src,
+ ret, "w", "w", "=r", 2);
+ if (likely(!ret))
+ __put_user_asm(tmp, (u16 __user *)dst,
+ ret, "w", "w", "ir", 2);
+ return ret;
+ }
+
+ case 4: {
+ u32 tmp;
+ __get_user_asm(tmp, (u32 __user *)src,
+ ret, "l", "k", "=r", 4);
+ if (likely(!ret))
+ __put_user_asm(tmp, (u32 __user *)dst,
+ ret, "l", "k", "ir", 4);
+ return ret;
+ }
+ case 8: {
+ u64 tmp;
+ __get_user_asm(tmp, (u64 __user *)src,
+ ret, "q", "", "=r", 8);
+ if (likely(!ret))
+ __put_user_asm(tmp, (u64 __user *)dst,
+ ret, "q", "", "ir", 8);
+ return ret;
+ }
+ default:
+ return copy_user_generic((__force void *)dst,
+ (__force void *)src, size);
+ }
+ }
+
+ __must_check long
+ strncpy_from_user(char *dst, const char __user *src, long count);
+ __must_check long
+ __strncpy_from_user(char *dst, const char __user *src, long count);
+ __must_check long strnlen_user(const char __user *str, long n);
+ __must_check long __strnlen_user(const char __user *str, long n);
+ __must_check long strlen_user(const char __user *str);
+ __must_check unsigned long clear_user(void __user *mem, unsigned long len);
+ __must_check unsigned long __clear_user(void __user *mem, unsigned long len);
+
+ __must_check long __copy_from_user_inatomic(void *dst, const void __user *src,
+ unsigned size);
+
+ static __must_check __always_inline int
+ __copy_to_user_inatomic(void __user *dst, const void *src, unsigned size)
+ {
+ return copy_user_generic((__force void *)dst, src, size);
+ }
+
+ extern long __copy_user_nocache(void *dst, const void __user *src,
+ unsigned size, int zerorest);
+
+ static inline int __copy_from_user_nocache(void *dst, const void __user *src,
+ unsigned size)
+ {
+ might_sleep();
+ return __copy_user_nocache(dst, src, size, 1);
+ }
+
+ static inline int __copy_from_user_inatomic_nocache(void *dst,
+ const void __user *src,
+ unsigned size)
+ {
+ return __copy_user_nocache(dst, src, size, 0);
+ }
+
+ unsigned long
+ copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest);
+
+ #endif /* _ASM_X86_UACCESS_64_H */
#include <asm/uaccess.h>
#include <asm/mmx.h>
+ #ifdef CONFIG_X86_INTEL_USERCOPY
+ /*
+ * Alignment at which movsl is preferred for bulk memory copies.
+ */
+ struct movsl_mask movsl_mask __read_mostly;
+ #endif
+
static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned long n)
{
#ifdef CONFIG_X86_INTEL_USERCOPY
#define __do_strncpy_from_user(dst, src, count, res) \
do { \
int __d0, __d1, __d2; \
- might_sleep(); \
+ might_fault(); \
__asm__ __volatile__( \
" testl %1,%1\n" \
" jz 2f\n" \
#define __do_clear_user(addr,size) \
do { \
int __d0; \
- might_sleep(); \
+ might_fault(); \
__asm__ __volatile__( \
"0: rep; stosl\n" \
" movl %2,%0\n" \
unsigned long
clear_user(void __user *to, unsigned long n)
{
- might_sleep();
+ might_fault();
if (access_ok(VERIFY_WRITE, to, n))
__do_clear_user(to, n);
return n;
unsigned long mask = -__addr_ok(s);
unsigned long res, tmp;
- might_sleep();
+ might_fault();
__asm__ __volatile__(
" testl %0, %0\n"
#include <linux/log2.h>
#include <linux/typecheck.h>
#include <linux/ratelimit.h>
+ #include <linux/dynamic_printk.h>
#include <asm/byteorder.h>
#include <asm/bug.h>
(__x < 0) ? -__x : __x; \
})
+#ifdef CONFIG_PROVE_LOCKING
+void might_fault(void);
+#else
+static inline void might_fault(void)
+{
+ might_sleep();
+}
+#endif
+
extern struct atomic_notifier_head panic_notifier_list;
extern long (*panic_blink)(long time);
NORET_TYPE void panic(const char * fmt, ...)
extern int get_option(char **str, int *pint);
extern char *get_options(const char *str, int nints, int *ints);
- extern unsigned long long memparse(char *ptr, char **retptr);
+ extern unsigned long long memparse(const char *ptr, char **retptr);
extern int core_kernel_text(unsigned long addr);
extern int __kernel_text_address(unsigned long addr);
struct pid;
extern struct pid *session_of_pgrp(struct pid *pgrp);
+ /*
+ * FW_BUG
+ * Add this to a message where you are sure the firmware is buggy or behaves
+ * really stupid or out of spec. Be aware that the responsible BIOS developer
+ * should be able to fix this issue or at least get a concrete idea of the
+ * problem by reading your message without the need of looking at the kernel
+ * code.
+ *
+ * Use it for definite and high priority BIOS bugs.
+ *
+ * FW_WARN
+ * Use it for not that clear (e.g. could the kernel messed up things already?)
+ * and medium priority BIOS bugs.
+ *
+ * FW_INFO
+ * Use this one if you want to tell the user or vendor about something
+ * suspicious, but generally harmless related to the firmware.
+ *
+ * Use it for information or very low priority BIOS bugs.
+ */
+ #define FW_BUG "[Firmware Bug]: "
+ #define FW_WARN "[Firmware Warn]: "
+ #define FW_INFO "[Firmware Info]: "
+
#ifdef CONFIG_PRINTK
asmlinkage int vprintk(const char *fmt, va_list args)
__attribute__ ((format (printf, 1, 0)));
{ return false; }
#endif
+ extern int printk_needs_cpu(int cpu);
+ extern void printk_tick(void);
+
extern void asmlinkage __attribute__((format(printf, 1, 2)))
early_printk(const char *fmt, ...);
extern int panic_timeout;
extern int panic_on_oops;
extern int panic_on_unrecovered_nmi;
- extern int tainted;
extern const char *print_tainted(void);
- extern void add_taint(unsigned);
+ extern void add_taint(unsigned flag);
+ extern int test_taint(unsigned flag);
+ extern unsigned long get_taint(void);
extern int root_mountflags;
/* Values used for system_state */
SYSTEM_SUSPEND_DISK,
} system_state;
- #define TAINT_PROPRIETARY_MODULE (1<<0)
- #define TAINT_FORCED_MODULE (1<<1)
- #define TAINT_UNSAFE_SMP (1<<2)
- #define TAINT_FORCED_RMMOD (1<<3)
- #define TAINT_MACHINE_CHECK (1<<4)
- #define TAINT_BAD_PAGE (1<<5)
- #define TAINT_USER (1<<6)
- #define TAINT_DIE (1<<7)
- #define TAINT_OVERRIDDEN_ACPI_TABLE (1<<8)
- #define TAINT_WARN (1<<9)
+ #define TAINT_PROPRIETARY_MODULE 0
+ #define TAINT_FORCED_MODULE 1
+ #define TAINT_UNSAFE_SMP 2
+ #define TAINT_FORCED_RMMOD 3
+ #define TAINT_MACHINE_CHECK 4
+ #define TAINT_BAD_PAGE 5
+ #define TAINT_USER 6
+ #define TAINT_DIE 7
+ #define TAINT_OVERRIDDEN_ACPI_TABLE 8
+ #define TAINT_WARN 9
+ #define TAINT_CRAP 10
extern void dump_stack(void) __cold;
#define pr_info(fmt, arg...) \
printk(KERN_INFO fmt, ##arg)
- #ifdef DEBUG
/* If you are writing a driver, please use dev_dbg instead */
+ #if defined(CONFIG_DYNAMIC_PRINTK_DEBUG)
+ #define pr_debug(fmt, ...) do { \
+ dynamic_pr_debug(fmt, ##__VA_ARGS__); \
+ } while (0)
+ #elif defined(DEBUG)
#define pr_debug(fmt, arg...) \
printk(KERN_DEBUG fmt, ##arg)
#else
#define NUMA_BUILD 0
#endif
+ /* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */
+ #ifdef CONFIG_FTRACE_MCOUNT_RECORD
+ # define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD
+ #endif
+
#endif
#include <linux/cpuset.h>
#include <linux/percpu.h>
#include <linux/kthread.h>
+ #include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/syscalls.h>
#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>
+ #include <trace/sched.h>
#include <asm/tlb.h>
#include <asm/irq_regs.h>
hrtimer_init(&rt_b->rt_period_timer,
CLOCK_MONOTONIC, HRTIMER_MODE_REL);
rt_b->rt_period_timer.function = sched_rt_period_timer;
- rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+ rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
+ }
+
+ static inline int rt_bandwidth_enabled(void)
+ {
+ return sysctl_sched_rt_runtime >= 0;
}
static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
ktime_t now;
- if (rt_b->rt_runtime == RUNTIME_INF)
+ if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
return;
if (hrtimer_active(&rt_b->rt_period_timer))
now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
- hrtimer_start(&rt_b->rt_period_timer,
- rt_b->rt_period_timer.expires,
- HRTIMER_MODE_ABS);
+ hrtimer_start_expires(&rt_b->rt_period_timer,
+ HRTIMER_MODE_ABS);
}
spin_unlock(&rt_b->rt_runtime_lock);
}
static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
#endif /* CONFIG_RT_GROUP_SCHED */
- #else /* !CONFIG_FAIR_GROUP_SCHED */
+ #else /* !CONFIG_USER_SCHED */
#define root_task_group init_task_group
- #endif /* CONFIG_FAIR_GROUP_SCHED */
+ #endif /* CONFIG_USER_SCHED */
/* task_group_lock serializes add/remove of task groups and also changes to
* a task group's cpu shares.
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
- static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
+ static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
{
- rq->curr->sched_class->check_preempt_curr(rq, p);
+ rq->curr->sched_class->check_preempt_curr(rq, p, sync);
}
static inline int cpu_of(struct rq *rq)
*/
unsigned int sysctl_sched_shares_ratelimit = 250000;
+ /*
+ * Inject some fuzzyness into changing the per-cpu group shares
+ * this avoids remote rq-locks at the expense of fairness.
+ * default: 4
+ */
+ unsigned int sysctl_sched_shares_thresh = 4;
+
/*
* period over which we measure -rt task cpu usage in us.
* default: 1s
struct hrtimer *timer = &rq->hrtick_timer;
ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
- timer->expires = time;
+ hrtimer_set_expires(timer, time);
if (rq == this_rq()) {
hrtimer_restart(timer);
return NOTIFY_DONE;
}
- static void init_hrtick(void)
+ static __init void init_hrtick(void)
{
hotcpu_notifier(hotplug_hrtick, 0);
}
hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
}
- static void init_hrtick(void)
+ static inline void init_hrtick(void)
{
}
#endif /* CONFIG_SMP */
hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
rq->hrtick_timer.function = hrtick;
- rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+ rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
}
- #else
+ #else /* CONFIG_SCHED_HRTICK */
static inline void hrtick_clear(struct rq *rq)
{
}
static inline void init_hrtick(void)
{
}
- #endif
+ #endif /* CONFIG_SCHED_HRTICK */
/*
* resched_task - mark a task 'to be rescheduled now'.
update_load_sub(&rq->load, load);
}
- #ifdef CONFIG_SMP
- static unsigned long source_load(int cpu, int type);
- static unsigned long target_load(int cpu, int type);
- static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
-
- static unsigned long cpu_avg_load_per_task(int cpu)
- {
- struct rq *rq = cpu_rq(cpu);
-
- if (rq->nr_running)
- rq->avg_load_per_task = rq->load.weight / rq->nr_running;
-
- return rq->avg_load_per_task;
- }
-
- #ifdef CONFIG_FAIR_GROUP_SCHED
-
- typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
+ #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
+ typedef int (*tg_visitor)(struct task_group *, void *);
/*
* Iterate the full tree, calling @down when first entering a node and @up when
* leaving it for the final time.
*/
- static void
- walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
+ static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
{
struct task_group *parent, *child;
+ int ret;
rcu_read_lock();
parent = &root_task_group;
down:
- (*down)(parent, cpu, sd);
+ ret = (*down)(parent, data);
+ if (ret)
+ goto out_unlock;
list_for_each_entry_rcu(child, &parent->children, siblings) {
parent = child;
goto down;
up:
continue;
}
- (*up)(parent, cpu, sd);
+ ret = (*up)(parent, data);
+ if (ret)
+ goto out_unlock;
child = parent;
parent = parent->parent;
if (parent)
goto up;
+ out_unlock:
rcu_read_unlock();
+
+ return ret;
+ }
+
+ static int tg_nop(struct task_group *tg, void *data)
+ {
+ return 0;
+ }
+ #endif
+
+ #ifdef CONFIG_SMP
+ static unsigned long source_load(int cpu, int type);
+ static unsigned long target_load(int cpu, int type);
+ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
+
+ static unsigned long cpu_avg_load_per_task(int cpu)
+ {
+ struct rq *rq = cpu_rq(cpu);
+
+ if (rq->nr_running)
+ rq->avg_load_per_task = rq->load.weight / rq->nr_running;
+
+ return rq->avg_load_per_task;
}
+ #ifdef CONFIG_FAIR_GROUP_SCHED
+
static void __set_se_shares(struct sched_entity *se, unsigned long shares);
/*
* Calculate and set the cpu's group shares.
*/
static void
- __update_group_shares_cpu(struct task_group *tg, int cpu,
- unsigned long sd_shares, unsigned long sd_rq_weight)
+ update_group_shares_cpu(struct task_group *tg, int cpu,
+ unsigned long sd_shares, unsigned long sd_rq_weight)
{
int boost = 0;
unsigned long shares;
*
*/
shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+ shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
- /*
- * record the actual number of shares, not the boosted amount.
- */
- tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
- tg->cfs_rq[cpu]->rq_weight = rq_weight;
+ if (abs(shares - tg->se[cpu]->load.weight) >
+ sysctl_sched_shares_thresh) {
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
- if (shares < MIN_SHARES)
- shares = MIN_SHARES;
- else if (shares > MAX_SHARES)
- shares = MAX_SHARES;
+ spin_lock_irqsave(&rq->lock, flags);
+ /*
+ * record the actual number of shares, not the boosted amount.
+ */
+ tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
+ tg->cfs_rq[cpu]->rq_weight = rq_weight;
- __set_se_shares(tg->se[cpu], shares);
+ __set_se_shares(tg->se[cpu], shares);
+ spin_unlock_irqrestore(&rq->lock, flags);
+ }
}
/*
* This needs to be done in a bottom-up fashion because the rq weight of a
* parent group depends on the shares of its child groups.
*/
- static void
- tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
+ static int tg_shares_up(struct task_group *tg, void *data)
{
unsigned long rq_weight = 0;
unsigned long shares = 0;
+ struct sched_domain *sd = data;
int i;
for_each_cpu_mask(i, sd->span) {
if (!rq_weight)
rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
- for_each_cpu_mask(i, sd->span) {
- struct rq *rq = cpu_rq(i);
- unsigned long flags;
+ for_each_cpu_mask(i, sd->span)
+ update_group_shares_cpu(tg, i, shares, rq_weight);
- spin_lock_irqsave(&rq->lock, flags);
- __update_group_shares_cpu(tg, i, shares, rq_weight);
- spin_unlock_irqrestore(&rq->lock, flags);
- }
+ return 0;
}
/*
* This needs to be done in a top-down fashion because the load of a child
* group is a fraction of its parents load.
*/
- static void
- tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
+ static int tg_load_down(struct task_group *tg, void *data)
{
unsigned long load;
+ long cpu = (long)data;
if (!tg->parent) {
load = cpu_rq(cpu)->load.weight;
}
tg->cfs_rq[cpu]->h_load = load;
- }
- static void
- tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
- {
+ return 0;
}
static void update_shares(struct sched_domain *sd)
if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
sd->last_update = now;
- walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
+ walk_tg_tree(tg_nop, tg_shares_up, sd);
}
}
spin_lock(&rq->lock);
}
- static void update_h_load(int cpu)
+ static void update_h_load(long cpu)
{
- walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
+ walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
}
#else
* just go back and repeat.
*/
rq = task_rq_lock(p, &flags);
+ trace_sched_wait_task(rq, p);
running = task_running(rq, p);
on_rq = p->se.on_rq;
ncsw = 0;
- if (!match_state || p->state == match_state) {
- ncsw = p->nivcsw + p->nvcsw;
- if (unlikely(!ncsw))
- ncsw = 1;
- }
+ if (!match_state || p->state == match_state)
+ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
task_rq_unlock(rq, &flags);
/*
success = 1;
out_running:
- trace_mark(kernel_sched_wakeup,
- "pid %d state %ld ## rq %p task %p rq->curr %p",
- p->pid, p->state, rq, p, rq->curr);
- check_preempt_curr(rq, p);
+ trace_sched_wakeup(rq, p);
+ check_preempt_curr(rq, p, sync);
p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
p->sched_class->task_new(rq, p);
inc_nr_running(rq);
}
- trace_mark(kernel_sched_wakeup_new,
- "pid %d state %ld ## rq %p task %p rq->curr %p",
- p->pid, p->state, rq, p, rq->curr);
- check_preempt_curr(rq, p);
+ trace_sched_wakeup_new(rq, p);
+ check_preempt_curr(rq, p, 0);
#ifdef CONFIG_SMP
if (p->sched_class->task_wake_up)
p->sched_class->task_wake_up(rq, p);
struct mm_struct *mm, *oldmm;
prepare_task_switch(rq, prev, next);
- trace_mark(kernel_sched_schedule,
- "prev_pid %d next_pid %d prev_state %ld "
- "## rq %p prev %p next %p",
- prev->pid, next->pid, prev->state,
- rq, prev, next);
+ trace_sched_switch(rq, prev, next);
mm = next->mm;
oldmm = prev->active_mm;
/*
|| unlikely(!cpu_active(dest_cpu)))
goto out;
+ trace_sched_migrate_task(rq, p, dest_cpu);
/* force the process onto the specified CPU */
if (migrate_task(p, dest_cpu, &req)) {
/* Need to wait for migration thread (might exit: take ref). */
* Note that idle threads have a prio of MAX_PRIO, for this test
* to be always true for them.
*/
- check_preempt_curr(this_rq, p);
+ check_preempt_curr(this_rq, p, 0);
}
/*
EXPORT_PER_CPU_SYMBOL(kstat);
/*
- * Return p->sum_exec_runtime plus any more ns on the sched_clock
- * that have not yet been banked in case the task is currently running.
+ * Return any ns on the sched_clock that have not yet been banked in
+ * @p in case that task is currently running.
*/
- unsigned long long task_sched_runtime(struct task_struct *p)
+ unsigned long long task_delta_exec(struct task_struct *p)
{
unsigned long flags;
- u64 ns, delta_exec;
struct rq *rq;
+ u64 ns = 0;
rq = task_rq_lock(p, &flags);
- ns = p->se.sum_exec_runtime;
+
if (task_current(rq, p)) {
+ u64 delta_exec;
+
update_rq_clock(rq);
delta_exec = rq->clock - p->se.exec_start;
if ((s64)delta_exec > 0)
- ns += delta_exec;
+ ns = delta_exec;
}
+
task_rq_unlock(rq, &flags);
return ns;
cputime64_t tmp;
p->utime = cputime_add(p->utime, cputime);
+ account_group_user_time(p, cputime);
/* Add user time to cpustat. */
tmp = cputime_to_cputime64(cputime);
tmp = cputime_to_cputime64(cputime);
p->utime = cputime_add(p->utime, cputime);
+ account_group_user_time(p, cputime);
p->gtime = cputime_add(p->gtime, cputime);
cpustat->user = cputime64_add(cpustat->user, tmp);
}
p->stime = cputime_add(p->stime, cputime);
+ account_group_system_time(p, cputime);
/* Add system time to cpustat. */
tmp = cputime_to_cputime64(cputime);
if (p == rq->idle) {
p->stime = cputime_add(p->stime, steal);
+ account_group_system_time(p, steal);
if (atomic_read(&rq->nr_iowait) > 0)
cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
else
/*
* Underflow?
*/
- if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
+ if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
return;
/*
* Is the spinlock portion underflowing?
if (sched_feat(HRTICK))
hrtick_clear(rq);
- /*
- * Do the rq-clock update outside the rq lock:
- */
- local_irq_disable();
+ spin_lock_irq(&rq->lock);
update_rq_clock(rq);
- spin_lock(&rq->lock);
clear_tsk_need_resched(prev);
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
}
EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
+ /**
+ * complete: - signals a single thread waiting on this completion
+ * @x: holds the state of this particular completion
+ *
+ * This will wake up a single thread waiting on this completion. Threads will be
+ * awakened in the same order in which they were queued.
+ *
+ * See also complete_all(), wait_for_completion() and related routines.
+ */
void complete(struct completion *x)
{
unsigned long flags;
}
EXPORT_SYMBOL(complete);
+ /**
+ * complete_all: - signals all threads waiting on this completion
+ * @x: holds the state of this particular completion
+ *
+ * This will wake up all threads waiting on this particular completion event.
+ */
void complete_all(struct completion *x)
{
unsigned long flags;
wait.flags |= WQ_FLAG_EXCLUSIVE;
__add_wait_queue_tail(&x->wait, &wait);
do {
- if ((state == TASK_INTERRUPTIBLE &&
- signal_pending(current)) ||
- (state == TASK_KILLABLE &&
- fatal_signal_pending(current))) {
+ if (signal_pending_state(state, current)) {
timeout = -ERESTARTSYS;
break;
}
return timeout;
}
+ /**
+ * wait_for_completion: - waits for completion of a task
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout.
+ *
+ * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
+ * and interrupt capability. Also see complete().
+ */
void __sched wait_for_completion(struct completion *x)
{
wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion);
+ /**
+ * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. The timeout is in jiffies. It is not
+ * interruptible.
+ */
unsigned long __sched
wait_for_completion_timeout(struct completion *x, unsigned long timeout)
{
}
EXPORT_SYMBOL(wait_for_completion_timeout);
+ /**
+ * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
+ * @x: holds the state of this particular completion
+ *
+ * This waits for completion of a specific task to be signaled. It is
+ * interruptible.
+ */
int __sched wait_for_completion_interruptible(struct completion *x)
{
long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion_interruptible);
+ /**
+ * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. It is interruptible. The timeout is in jiffies.
+ */
unsigned long __sched
wait_for_completion_interruptible_timeout(struct completion *x,
unsigned long timeout)
}
EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
+ /**
+ * wait_for_completion_killable: - waits for completion of a task (killable)
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It can be
+ * interrupted by a kill signal.
+ */
int __sched wait_for_completion_killable(struct completion *x)
{
long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
* Do not allow realtime tasks into groups that have no runtime
* assigned.
*/
- if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
+ if (rt_bandwidth_enabled() && rt_policy(policy) &&
+ task_group(p)->rt_bandwidth.rt_runtime == 0)
return -EPERM;
#endif
set_task_cpu(p, dest_cpu);
if (on_rq) {
activate_task(rq_dest, p, 0);
- check_preempt_curr(rq_dest, p);
+ check_preempt_curr(rq_dest, p, 0);
}
done:
ret = 1;
static struct ctl_table *
sd_alloc_ctl_domain_table(struct sched_domain *sd)
{
- struct ctl_table *table = sd_alloc_ctl_entry(12);
+ struct ctl_table *table = sd_alloc_ctl_entry(13);
if (table == NULL)
return NULL;
sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[10], "flags", &sd->flags,
sizeof(int), 0644, proc_dointvec_minmax);
- /* &table[11] is terminator */
+ set_table_entry(&table[11], "name", sd->name,
+ CORENAME_MAX_SIZE, 0444, proc_dostring);
+ /* &table[12] is terminator */
return table;
}
* Non-inlined to reduce accumulated stack pressure in build_sched_domains()
*/
+ #ifdef CONFIG_SCHED_DEBUG
+ # define SD_INIT_NAME(sd, type) sd->name = #type
+ #else
+ # define SD_INIT_NAME(sd, type) do { } while (0)
+ #endif
+
#define SD_INIT(sd, type) sd_init_##type(sd)
+
#define SD_INIT_FUNC(type) \
static noinline void sd_init_##type(struct sched_domain *sd) \
{ \
memset(sd, 0, sizeof(*sd)); \
*sd = SD_##type##_INIT; \
sd->level = SD_LV_##type; \
+ SD_INIT_NAME(sd, type); \
}
SD_INIT_FUNC(CPU)
#ifdef in_atomic
static unsigned long prev_jiffy; /* ratelimiting */
- if ((in_atomic() || irqs_disabled()) &&
- system_state == SYSTEM_RUNNING && !oops_in_progress) {
- if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
- return;
- prev_jiffy = jiffies;
- printk(KERN_ERR "BUG: sleeping function called from invalid"
- " context at %s:%d\n", file, line);
- printk("in_atomic():%d, irqs_disabled():%d\n",
- in_atomic(), irqs_disabled());
- debug_show_held_locks(current);
- if (irqs_disabled())
- print_irqtrace_events(current);
- dump_stack();
- }
+ if ((!in_atomic() && !irqs_disabled()) ||
+ system_state != SYSTEM_RUNNING || oops_in_progress)
+ return;
+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+ return;
+ prev_jiffy = jiffies;
+
+ printk(KERN_ERR
+ "BUG: sleeping function called from invalid context at %s:%d\n",
+ file, line);
+ printk(KERN_ERR
+ "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
+ in_atomic(), irqs_disabled(),
+ current->pid, current->comm);
+
+ debug_show_held_locks(current);
+ if (irqs_disabled())
+ print_irqtrace_events(current);
+ dump_stack();
#endif
}
EXPORT_SYMBOL(__might_sleep);
static unsigned long to_ratio(u64 period, u64 runtime)
{
if (runtime == RUNTIME_INF)
- return 1ULL << 16;
+ return 1ULL << 20;
- return div64_u64(runtime << 16, period);
+ return div64_u64(runtime << 20, period);
}
- #ifdef CONFIG_CGROUP_SCHED
- static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
+ /* Must be called with tasklist_lock held */
+ static inline int tg_has_rt_tasks(struct task_group *tg)
{
- struct task_group *tgi, *parent = tg->parent;
- unsigned long total = 0;
+ struct task_struct *g, *p;
- if (!parent) {
- if (global_rt_period() < period)
- return 0;
+ do_each_thread(g, p) {
+ if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
+ return 1;
+ } while_each_thread(g, p);
- return to_ratio(period, runtime) <
- to_ratio(global_rt_period(), global_rt_runtime());
- }
+ return 0;
+ }
- if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period)
- return 0;
+ struct rt_schedulable_data {
+ struct task_group *tg;
+ u64 rt_period;
+ u64 rt_runtime;
+ };
- rcu_read_lock();
- list_for_each_entry_rcu(tgi, &parent->children, siblings) {
- if (tgi == tg)
- continue;
+ static int tg_schedulable(struct task_group *tg, void *data)
+ {
+ struct rt_schedulable_data *d = data;
+ struct task_group *child;
+ unsigned long total, sum = 0;
+ u64 period, runtime;
- total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
- tgi->rt_bandwidth.rt_runtime);
+ period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+ runtime = tg->rt_bandwidth.rt_runtime;
+
+ if (tg == d->tg) {
+ period = d->rt_period;
+ runtime = d->rt_runtime;
}
- rcu_read_unlock();
- return total + to_ratio(period, runtime) <=
- to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
- parent->rt_bandwidth.rt_runtime);
- }
- #elif defined CONFIG_USER_SCHED
- static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
- {
- struct task_group *tgi;
- unsigned long total = 0;
- unsigned long global_ratio =
- to_ratio(global_rt_period(), global_rt_runtime());
+ /*
+ * Cannot have more runtime than the period.
+ */
+ if (runtime > period && runtime != RUNTIME_INF)
+ return -EINVAL;
- rcu_read_lock();
- list_for_each_entry_rcu(tgi, &task_groups, list) {
- if (tgi == tg)
- continue;
+ /*
+ * Ensure we don't starve existing RT tasks.
+ */
+ if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
+ return -EBUSY;
- total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
- tgi->rt_bandwidth.rt_runtime);
+ total = to_ratio(period, runtime);
+
+ /*
+ * Nobody can have more than the global setting allows.
+ */
+ if (total > to_ratio(global_rt_period(), global_rt_runtime()))
+ return -EINVAL;
+
+ /*
+ * The sum of our children's runtime should not exceed our own.
+ */
+ list_for_each_entry_rcu(child, &tg->children, siblings) {
+ period = ktime_to_ns(child->rt_bandwidth.rt_period);
+ runtime = child->rt_bandwidth.rt_runtime;
+
+ if (child == d->tg) {
+ period = d->rt_period;
+ runtime = d->rt_runtime;
+ }
+
+ sum += to_ratio(period, runtime);
}
- rcu_read_unlock();
- return total + to_ratio(period, runtime) < global_ratio;
+ if (sum > total)
+ return -EINVAL;
+
+ return 0;
}
- #endif
- /* Must be called with tasklist_lock held */
- static inline int tg_has_rt_tasks(struct task_group *tg)
+ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
{
- struct task_struct *g, *p;
- do_each_thread(g, p) {
- if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
- return 1;
- } while_each_thread(g, p);
- return 0;
+ struct rt_schedulable_data data = {
+ .tg = tg,
+ .rt_period = period,
+ .rt_runtime = runtime,
+ };
+
+ return walk_tg_tree(tg_schedulable, tg_nop, &data);
}
static int tg_set_bandwidth(struct task_group *tg,
mutex_lock(&rt_constraints_mutex);
read_lock(&tasklist_lock);
- if (rt_runtime == 0 && tg_has_rt_tasks(tg)) {
- err = -EBUSY;
- goto unlock;
- }
- if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
- err = -EINVAL;
+ err = __rt_schedulable(tg, rt_period, rt_runtime);
+ if (err)
goto unlock;
- }
spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
static int sched_rt_global_constraints(void)
{
- struct task_group *tg = &root_task_group;
- u64 rt_runtime, rt_period;
+ u64 runtime, period;
int ret = 0;
if (sysctl_sched_rt_period <= 0)
return -EINVAL;
- rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
- rt_runtime = tg->rt_bandwidth.rt_runtime;
+ runtime = global_rt_runtime();
+ period = global_rt_period();
+
+ /*
+ * Sanity check on the sysctl variables.
+ */
+ if (runtime > period && runtime != RUNTIME_INF)
+ return -EINVAL;
mutex_lock(&rt_constraints_mutex);
- if (!__rt_schedulable(tg, rt_period, rt_runtime))
- ret = -EINVAL;
+ read_lock(&tasklist_lock);
+ ret = __rt_schedulable(NULL, 0, 0);
+ read_unlock(&tasklist_lock);
mutex_unlock(&rt_constraints_mutex);
return ret;
if (!cgrp->parent) {
/* This is early initialization for the top cgroup */
- init_task_group.css.cgroup = cgrp;
return &init_task_group.css;
}
if (IS_ERR(tg))
return ERR_PTR(-ENOMEM);
- /* Bind the cgroup to task_group object we just created */
- tg->css.cgroup = cgrp;
-
return &tg->css;
}
return !vma->vm_ops || !vma->vm_ops->fault;
}
- int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, int len, int write, int force,
+
+
+ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, int len, int flags,
struct page **pages, struct vm_area_struct **vmas)
{
int i;
- unsigned int vm_flags;
+ unsigned int vm_flags = 0;
+ int write = !!(flags & GUP_FLAGS_WRITE);
+ int force = !!(flags & GUP_FLAGS_FORCE);
+ int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
if (len <= 0)
return 0;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
- if (write) /* user gate pages are read-only */
+
+ /* user gate pages are read-only */
+ if (!ignore && write)
return i ? : -EFAULT;
if (pg > TASK_SIZE)
pgd = pgd_offset_k(pg);
continue;
}
- if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP))
- || !(vm_flags & vma->vm_flags))
+ if (!vma ||
+ (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
+ (!ignore && !(vm_flags & vma->vm_flags)))
return i ? : -EFAULT;
if (is_vm_hugetlb_page(vma)) {
} while (len);
return i;
}
+
+ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, int len, int write, int force,
+ struct page **pages, struct vm_area_struct **vmas)
+ {
+ int flags = 0;
+
+ if (write)
+ flags |= GUP_FLAGS_WRITE;
+ if (force)
+ flags |= GUP_FLAGS_FORCE;
+
+ return __get_user_pages(tsk, mm,
+ start, len, flags,
+ pages, vmas);
+ }
+
EXPORT_SYMBOL(get_user_pages);
pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
pte_t *pte;
spinlock_t *ptl;
- retval = mem_cgroup_charge(page, mm, GFP_KERNEL);
- if (retval)
- goto out;
-
retval = -EINVAL;
if (PageAnon(page))
- goto out_uncharge;
+ goto out;
retval = -ENOMEM;
flush_dcache_page(page);
pte = get_locked_pte(mm, addr, &ptl);
if (!pte)
- goto out_uncharge;
+ goto out;
retval = -EBUSY;
if (!pte_none(*pte))
goto out_unlock;
return retval;
out_unlock:
pte_unmap_unlock(pte, ptl);
- out_uncharge:
- mem_cgroup_uncharge_page(page);
out:
return retval;
}
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
if (!new_page)
goto oom;
+ /*
+ * Don't let another task, with possibly unlocked vma,
+ * keep the mlocked page.
+ */
+ if (vma->vm_flags & VM_LOCKED) {
+ lock_page(old_page); /* for LRU manipulation */
+ clear_page_mlock(old_page);
+ unlock_page(old_page);
+ }
cow_user_page(new_page, old_page, address, vma);
__SetPageUptodate(new_page);
* thread doing COW.
*/
ptep_clear_flush_notify(vma, address, page_table);
- set_pte_at(mm, address, page_table, entry);
- update_mmu_cache(vma, address, entry);
- lru_cache_add_active(new_page);
+ SetPageSwapBacked(new_page);
+ lru_cache_add_active_or_unevictable(new_page, vma);
page_add_new_anon_rmap(new_page, vma, address);
+ //TODO: is this safe? do_anonymous_page() does it this way.
+ set_pte_at(mm, address, page_table, entry);
+ update_mmu_cache(vma, address, entry);
if (old_page) {
/*
* Only after switching the pte to the new page may
count_vm_event(PGMAJFAULT);
}
+ mark_page_accessed(page);
+
+ lock_page(page);
+ delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+
if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
- delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
ret = VM_FAULT_OOM;
+ unlock_page(page);
goto out;
}
- mark_page_accessed(page);
- lock_page(page);
- delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
-
/*
* Back out if somebody else already faulted in this pte.
*/
page_add_anon_rmap(page, vma, address);
swap_free(entry);
- if (vm_swap_full())
+ if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
remove_exclusive_swap_page(page);
unlock_page(page);
if (!pte_none(*page_table))
goto release;
inc_mm_counter(mm, anon_rss);
- lru_cache_add_active(page);
+ SetPageSwapBacked(page);
+ lru_cache_add_active_or_unevictable(page, vma);
page_add_new_anon_rmap(page, vma, address);
set_pte_at(mm, address, page_table, entry);
struct page *page;
pte_t entry;
int anon = 0;
+ int charged = 0;
struct page *dirty_page = NULL;
struct vm_fault vmf;
int ret;
ret = VM_FAULT_OOM;
goto out;
}
+ if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
+ ret = VM_FAULT_OOM;
+ page_cache_release(page);
+ goto out;
+ }
+ charged = 1;
+ /*
+ * Don't let another task, with possibly unlocked vma,
+ * keep the mlocked page.
+ */
+ if (vma->vm_flags & VM_LOCKED)
+ clear_page_mlock(vmf.page);
copy_user_highpage(page, vmf.page, address, vma);
__SetPageUptodate(page);
} else {
}
- if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
- ret = VM_FAULT_OOM;
- goto out;
- }
-
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
/*
entry = mk_pte(page, vma->vm_page_prot);
if (flags & FAULT_FLAG_WRITE)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- set_pte_at(mm, address, page_table, entry);
if (anon) {
- inc_mm_counter(mm, anon_rss);
- lru_cache_add_active(page);
- page_add_new_anon_rmap(page, vma, address);
+ inc_mm_counter(mm, anon_rss);
+ SetPageSwapBacked(page);
+ lru_cache_add_active_or_unevictable(page, vma);
+ page_add_new_anon_rmap(page, vma, address);
} else {
inc_mm_counter(mm, file_rss);
page_add_file_rmap(page);
get_page(dirty_page);
}
}
+ //TODO: is this safe? do_anonymous_page() does it this way.
+ set_pte_at(mm, address, page_table, entry);
/* no need to invalidate: a not-present page won't be cached */
update_mmu_cache(vma, address, entry);
} else {
- mem_cgroup_uncharge_page(page);
+ if (charged)
+ mem_cgroup_uncharge_page(page);
if (anon)
page_cache_release(page);
else
len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
ret = get_user_pages(current, current->mm, addr,
len, write, 0, NULL, NULL);
- if (ret < 0) {
- /*
- SUS require strange return value to mlock
- - invalid addr generate to ENOMEM.
- - out of memory should generate EAGAIN.
- */
- if (ret == -EFAULT)
- ret = -ENOMEM;
- else if (ret == -ENOMEM)
- ret = -EAGAIN;
+ if (ret < 0)
return ret;
- }
- return ret == len ? 0 : -ENOMEM;
+ return ret == len ? 0 : -EFAULT;
}
#if !defined(__HAVE_ARCH_GATE_AREA)
}
up_read(¤t->mm->mmap_sem);
}
+
+#ifdef CONFIG_PROVE_LOCKING
+void might_fault(void)
+{
+ might_sleep();
+ /*
+ * it would be nicer only to annotate paths which are not under
+ * pagefault_disable, however that requires a larger audit and
+ * providing helpers like get_user_atomic.
+ */
+ if (!in_atomic() && current->mm)
+ might_lock_read(¤t->mm->mmap_sem);
+}
+EXPORT_SYMBOL(might_fault);
+#endif