[mirror_ubuntu-artful-kernel.git] / mm / util.c

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/task_stack.h>
#include <linux/security.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mman.h>
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
#include <linux/userfaultfd_k.h>

#include <asm/sections.h>
#include <linux/uaccess.h>

#include "internal.h"

static inline int is_kernel_rodata(unsigned long addr)
{
	return addr >= (unsigned long)__start_rodata &&
		addr < (unsigned long)__end_rodata;
}

/**
 * kfree_const - conditionally free memory
 * @x: pointer to the memory
 *
 * Function calls kfree only if @x is not in .rodata section.
 */
void kfree_const(const void *x)
{
	if (!is_kernel_rodata((unsigned long)x))
		kfree(x);
}
EXPORT_SYMBOL(kfree_const);

/**
 * kstrdup - allocate space for and copy an existing string
 * @s: the string to duplicate
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 */
char *kstrdup(const char *s, gfp_t gfp)
{
	size_t len;
	char *buf;

	if (!s)
		return NULL;

	len = strlen(s) + 1;
	buf = kmalloc_track_caller(len, gfp);
	if (buf)
		memcpy(buf, s, len);
	return buf;
}
EXPORT_SYMBOL(kstrdup);

/**
 * kstrdup_const - conditionally duplicate an existing const string
 * @s: the string to duplicate
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Function returns source string if it is in .rodata section otherwise it
 * fallbacks to kstrdup.
 * Strings allocated by kstrdup_const should be freed by kfree_const.
 */
const char *kstrdup_const(const char *s, gfp_t gfp)
{
	if (is_kernel_rodata((unsigned long)s))
		return s;

	return kstrdup(s, gfp);
}
EXPORT_SYMBOL(kstrdup_const);

/**
 * kstrndup - allocate space for and copy an existing string
 * @s: the string to duplicate
 * @max: read at most @max chars from @s
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Note: Use kmemdup_nul() instead if the size is known exactly.
 */
char *kstrndup(const char *s, size_t max, gfp_t gfp)
{
	size_t len;
	char *buf;

	if (!s)
		return NULL;

	len = strnlen(s, max);
	buf = kmalloc_track_caller(len+1, gfp);
	if (buf) {
		memcpy(buf, s, len);
		buf[len] = '\0';
	}
	return buf;
}
EXPORT_SYMBOL(kstrndup);

/**
 * kmemdup - duplicate region of memory
 *
 * @src: memory region to duplicate
 * @len: memory region length
 * @gfp: GFP mask to use
 */
void *kmemdup(const void *src, size_t len, gfp_t gfp)
{
	void *p;

	p = kmalloc_track_caller(len, gfp);
	if (p)
		memcpy(p, src, len);
	return p;
}
EXPORT_SYMBOL(kmemdup);

/**
 * kmemdup_nul - Create a NUL-terminated string from unterminated data
 * @s: The data to stringify
 * @len: The size of the data
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 */
char *kmemdup_nul(const char *s, size_t len, gfp_t gfp)
{
	char *buf;

	if (!s)
		return NULL;

	buf = kmalloc_track_caller(len + 1, gfp);
	if (buf) {
		memcpy(buf, s, len);
		buf[len] = '\0';
	}
	return buf;
}
EXPORT_SYMBOL(kmemdup_nul);

/**
 * memdup_user - duplicate memory region from user space
 *
 * @src: source address in user space
 * @len: number of bytes to copy
 *
 * Returns an ERR_PTR() on failure.
 */
void *memdup_user(const void __user *src, size_t len)
{
	void *p;

	/*
	 * Always use GFP_KERNEL, since copy_from_user() can sleep and
	 * cause pagefault, which makes it pointless to use GFP_NOFS
	 * or GFP_ATOMIC.
	 */
	p = kmalloc_track_caller(len, GFP_KERNEL);
	if (!p)
		return ERR_PTR(-ENOMEM);

	if (copy_from_user(p, src, len)) {
		kfree(p);
		return ERR_PTR(-EFAULT);
	}

	return p;
}
EXPORT_SYMBOL(memdup_user);

/*
 * strndup_user - duplicate an existing string from user space
 * @s: The string to duplicate
 * @n: Maximum number of bytes to copy, including the trailing NUL.
 */
char *strndup_user(const char __user *s, long n)
{
	char *p;
	long length;

	length = strnlen_user(s, n);

	if (!length)
		return ERR_PTR(-EFAULT);

	if (length > n)
		return ERR_PTR(-EINVAL);

	p = memdup_user(s, length);

	if (IS_ERR(p))
		return p;

	p[length - 1] = '\0';

	return p;
}
EXPORT_SYMBOL(strndup_user);

/**
 * memdup_user_nul - duplicate memory region from user space and NUL-terminate
 *
 * @src: source address in user space
 * @len: number of bytes to copy
 *
 * Returns an ERR_PTR() on failure.
 */
void *memdup_user_nul(const void __user *src, size_t len)
{
	char *p;

	/*
	 * Always use GFP_KERNEL, since copy_from_user() can sleep and
	 * cause pagefault, which makes it pointless to use GFP_NOFS
	 * or GFP_ATOMIC.
	 */
	p = kmalloc_track_caller(len + 1, GFP_KERNEL);
	if (!p)
		return ERR_PTR(-ENOMEM);

	if (copy_from_user(p, src, len)) {
		kfree(p);
		return ERR_PTR(-EFAULT);
	}
	p[len] = '\0';

	return p;
}
EXPORT_SYMBOL(memdup_user_nul);

void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
		struct vm_area_struct *prev, struct rb_node *rb_parent)
{
	struct vm_area_struct *next;

	vma->vm_prev = prev;
	if (prev) {
		next = prev->vm_next;
		prev->vm_next = vma;
	} else {
		mm->mmap = vma;
		if (rb_parent)
			next = rb_entry(rb_parent,
					struct vm_area_struct, vm_rb);
		else
			next = NULL;
	}
	vma->vm_next = next;
	if (next)
		next->vm_prev = vma;
}

/* Check if the vma is being used as a stack by this task */
int vma_is_stack_for_current(struct vm_area_struct *vma)
{
	struct task_struct * __maybe_unused t = current;

	return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
}

#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
void arch_pick_mmap_layout(struct mm_struct *mm)
{
	mm->mmap_base = TASK_UNMAPPED_BASE;
	mm->get_unmapped_area = arch_get_unmapped_area;
}
#endif

/*
 * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
 * back to the regular GUP.
 * If the architecture not support this function, simply return with no
 * page pinned
 */
int __weak __get_user_pages_fast(unsigned long start,
				 int nr_pages, int write, struct page **pages)
{
	return 0;
}
EXPORT_SYMBOL_GPL(__get_user_pages_fast);

/**
 * get_user_pages_fast() - pin user pages in memory
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @write:	whether pages will be written to
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long.
 *
 * Returns number of pages pinned. This may be fewer than the number
 * requested. If nr_pages is 0 or negative, returns 0. If no pages
 * were pinned, returns -errno.
 *
 * get_user_pages_fast provides equivalent functionality to get_user_pages,
 * operating on current and current->mm, with force=0 and vma=NULL. However
 * unlike get_user_pages, it must be called without mmap_sem held.
 *
 * get_user_pages_fast may take mmap_sem and page table locks, so no
 * assumptions can be made about lack of locking. get_user_pages_fast is to be
 * implemented in a way that is advantageous (vs get_user_pages()) when the
 * user memory area is already faulted in and present in ptes. However if the
 * pages have to be faulted in, it may turn out to be slightly slower so
 * callers need to carefully consider what to use. On many architectures,
 * get_user_pages_fast simply falls back to get_user_pages.
 */
int __weak get_user_pages_fast(unsigned long start,
				int nr_pages, int write, struct page **pages)
{
	return get_user_pages_unlocked(start, nr_pages, pages,
				       write ? FOLL_WRITE : 0);
}
EXPORT_SYMBOL_GPL(get_user_pages_fast);

unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
	unsigned long len, unsigned long prot,
	unsigned long flag, unsigned long pgoff)
{
	unsigned long ret;
	struct mm_struct *mm = current->mm;
	unsigned long populate;
	LIST_HEAD(uf);

	ret = security_mmap_file(file, prot, flag);
	if (!ret) {
		if (down_write_killable(&mm->mmap_sem))
			return -EINTR;
		ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
				    &populate, &uf);
		up_write(&mm->mmap_sem);
		userfaultfd_unmap_complete(mm, &uf);
		if (populate)
			mm_populate(ret, populate);
	}
	return ret;
}

unsigned long vm_mmap(struct file *file, unsigned long addr,
	unsigned long len, unsigned long prot,
	unsigned long flag, unsigned long offset)
{
	if (unlikely(offset + PAGE_ALIGN(len) < offset))
		return -EINVAL;
	if (unlikely(offset_in_page(offset)))
		return -EINVAL;

	return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
}
EXPORT_SYMBOL(vm_mmap);

/**
 * kvmalloc_node - attempt to allocate physically contiguous memory, but upon
 * failure, fall back to non-contiguous (vmalloc) allocation.
 * @size: size of the request.
 * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
 * @node: numa node to allocate from
 *
 * Uses kmalloc to get the memory but if the allocation fails then falls back
 * to the vmalloc allocator. Use kvfree for freeing the memory.
 *
 * Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported.
 * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
 * preferable to the vmalloc fallback, due to visible performance drawbacks.
 *
 * Any use of gfp flags outside of GFP_KERNEL should be consulted with mm people.
 */
void *kvmalloc_node(size_t size, gfp_t flags, int node)
{
	gfp_t kmalloc_flags = flags;
	void *ret;

	/*
	 * vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables)
	 * so the given set of flags has to be compatible.
	 */
	WARN_ON_ONCE((flags & GFP_KERNEL) != GFP_KERNEL);

	/*
	 * We want to attempt a large physically contiguous block first because
	 * it is less likely to fragment multiple larger blocks and therefore
	 * contribute to a long term fragmentation less than vmalloc fallback.
	 * However make sure that larger requests are not too disruptive - no
	 * OOM killer and no allocation failure warnings as we have a fallback.
	 */
	if (size > PAGE_SIZE) {
		kmalloc_flags |= __GFP_NOWARN;

		if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL))
			kmalloc_flags |= __GFP_NORETRY;
	}

	ret = kmalloc_node(size, kmalloc_flags, node);

	/*
	 * It doesn't really make sense to fallback to vmalloc for sub page
	 * requests
	 */
	if (ret || size <= PAGE_SIZE)
		return ret;

	return __vmalloc_node_flags_caller(size, node, flags,
			__builtin_return_address(0));
}
EXPORT_SYMBOL(kvmalloc_node);

void kvfree(const void *addr)
{
	if (is_vmalloc_addr(addr))
		vfree(addr);
	else
		kfree(addr);
}
EXPORT_SYMBOL(kvfree);

static inline void *__page_rmapping(struct page *page)
{
	unsigned long mapping;

	mapping = (unsigned long)page->mapping;
	mapping &= ~PAGE_MAPPING_FLAGS;

	return (void *)mapping;
}

/* Neutral page->mapping pointer to address_space or anon_vma or other */
void *page_rmapping(struct page *page)
{
	page = compound_head(page);
	return __page_rmapping(page);
}

/*
 * Return true if this page is mapped into pagetables.
 * For compound page it returns true if any subpage of compound page is mapped.
 */
bool page_mapped(struct page *page)
{
	int i;

	if (likely(!PageCompound(page)))
		return atomic_read(&page->_mapcount) >= 0;
	page = compound_head(page);
	if (atomic_read(compound_mapcount_ptr(page)) >= 0)
		return true;
	if (PageHuge(page))
		return false;
	for (i = 0; i < hpage_nr_pages(page); i++) {
		if (atomic_read(&page[i]._mapcount) >= 0)
			return true;
	}
	return false;
}
EXPORT_SYMBOL(page_mapped);

struct anon_vma *page_anon_vma(struct page *page)
{
	unsigned long mapping;

	page = compound_head(page);
	mapping = (unsigned long)page->mapping;
	if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
		return NULL;
	return __page_rmapping(page);
}

struct address_space *page_mapping(struct page *page)
{
	struct address_space *mapping;

	page = compound_head(page);

	/* This happens if someone calls flush_dcache_page on slab page */
	if (unlikely(PageSlab(page)))
		return NULL;

	if (unlikely(PageSwapCache(page))) {
		swp_entry_t entry;

		entry.val = page_private(page);
		return swap_address_space(entry);
	}

	mapping = page->mapping;
	if ((unsigned long)mapping & PAGE_MAPPING_ANON)
		return NULL;

	return (void *)((unsigned long)mapping & ~PAGE_MAPPING_FLAGS);
}
EXPORT_SYMBOL(page_mapping);

/* Slow path of page_mapcount() for compound pages */
int __page_mapcount(struct page *page)
{
	int ret;

	ret = atomic_read(&page->_mapcount) + 1;
	/*
	 * For file THP page->_mapcount contains total number of mapping
	 * of the page: no need to look into compound_mapcount.
	 */
	if (!PageAnon(page) && !PageHuge(page))
		return ret;
	page = compound_head(page);
	ret += atomic_read(compound_mapcount_ptr(page)) + 1;
	if (PageDoubleMap(page))
		ret--;
	return ret;
}
EXPORT_SYMBOL_GPL(__page_mapcount);

int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
int sysctl_overcommit_ratio __read_mostly = 50;
unsigned long sysctl_overcommit_kbytes __read_mostly;
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */

int overcommit_ratio_handler(struct ctl_table *table, int write,
			     void __user *buffer, size_t *lenp,
			     loff_t *ppos)
{
	int ret;

	ret = proc_dointvec(table, write, buffer, lenp, ppos);
	if (ret == 0 && write)
		sysctl_overcommit_kbytes = 0;
	return ret;
}

int overcommit_kbytes_handler(struct ctl_table *table, int write,
			     void __user *buffer, size_t *lenp,
			     loff_t *ppos)
{
	int ret;

	ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
	if (ret == 0 && write)
		sysctl_overcommit_ratio = 0;
	return ret;
}

/*
 * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
 */
unsigned long vm_commit_limit(void)
{
	unsigned long allowed;

	if (sysctl_overcommit_kbytes)
		allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
	else
		allowed = ((totalram_pages - hugetlb_total_pages())
			   * sysctl_overcommit_ratio / 100);
	allowed += total_swap_pages;

	return allowed;
}

/*
 * Make sure vm_committed_as in one cacheline and not cacheline shared with
 * other variables. It can be updated by several CPUs frequently.
 */
struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;

/*
 * The global memory commitment made in the system can be a metric
 * that can be used to drive ballooning decisions when Linux is hosted
 * as a guest. On Hyper-V, the host implements a policy engine for dynamically
 * balancing memory across competing virtual machines that are hosted.
 * Several metrics drive this policy engine including the guest reported
 * memory commitment.
 */
unsigned long vm_memory_committed(void)
{
	return percpu_counter_read_positive(&vm_committed_as);
}
EXPORT_SYMBOL_GPL(vm_memory_committed);

/*
 * Check that a process has enough memory to allocate a new virtual
 * mapping. 0 means there is enough memory for the allocation to
 * succeed and -ENOMEM implies there is not.
 *
 * We currently support three overcommit policies, which are set via the
 * vm.overcommit_memory sysctl.  See Documentation/vm/overcommit-accounting
 *
 * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
 * Additional code 2002 Jul 20 by Robert Love.
 *
 * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
 *
 * Note this is a helper function intended to be used by LSMs which
 * wish to use this logic.
 */
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
	long free, allowed, reserve;

	VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) <
			-(s64)vm_committed_as_batch * num_online_cpus(),
			"memory commitment underflow");

	vm_acct_memory(pages);

	/*
	 * Sometimes we want to use more memory than we have
	 */
	if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
		return 0;

	if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
		free = global_page_state(NR_FREE_PAGES);
		free += global_node_page_state(NR_FILE_PAGES);

		/*
		 * shmem pages shouldn't be counted as free in this
		 * case, they can't be purged, only swapped out, and
		 * that won't affect the overall amount of available
		 * memory in the system.
		 */
		free -= global_node_page_state(NR_SHMEM);

		free += get_nr_swap_pages();

		/*
		 * Any slabs which are created with the
		 * SLAB_RECLAIM_ACCOUNT flag claim to have contents
		 * which are reclaimable, under pressure.  The dentry
		 * cache and most inode caches should fall into this
		 */
		free += global_node_page_state(NR_SLAB_RECLAIMABLE);

		/*
		 * Leave reserved pages. The pages are not for anonymous pages.
		 */
		if (free <= totalreserve_pages)
			goto error;
		else
			free -= totalreserve_pages;

		/*
		 * Reserve some for root
		 */
		if (!cap_sys_admin)
			free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);

		if (free > pages)
			return 0;

		goto error;
	}

	allowed = vm_commit_limit();
	/*
	 * Reserve some for root
	 */
	if (!cap_sys_admin)
		allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);

	/*
	 * Don't let a single process grow so big a user can't recover
	 */
	if (mm) {
		reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
		allowed -= min_t(long, mm->total_vm / 32, reserve);
	}

	if (percpu_counter_read_positive(&vm_committed_as) < allowed)
		return 0;
error:
	vm_unacct_memory(pages);

	return -ENOMEM;
}

/**
 * get_cmdline() - copy the cmdline value to a buffer.
 * @task:     the task whose cmdline value to copy.
 * @buffer:   the buffer to copy to.
 * @buflen:   the length of the buffer. Larger cmdline values are truncated
 *            to this length.
 * Returns the size of the cmdline field copied. Note that the copy does
 * not guarantee an ending NULL byte.
 */
int get_cmdline(struct task_struct *task, char *buffer, int buflen)
{
	int res = 0;
	unsigned int len;
	struct mm_struct *mm = get_task_mm(task);
	unsigned long arg_start, arg_end, env_start, env_end;
	if (!mm)
		goto out;
	if (!mm->arg_end)
		goto out_mm;	/* Shh! No looking before we're done */

	down_read(&mm->mmap_sem);
	arg_start = mm->arg_start;
	arg_end = mm->arg_end;
	env_start = mm->env_start;
	env_end = mm->env_end;
	up_read(&mm->mmap_sem);

	len = arg_end - arg_start;

	if (len > buflen)
		len = buflen;

	res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE);

	/*
	 * If the nul at the end of args has been overwritten, then
	 * assume application is using setproctitle(3).
	 */
	if (res > 0 && buffer[res-1] != '\0' && len < buflen) {
		len = strnlen(buffer, res);
		if (len < res) {
			res = len;
		} else {
			len = env_end - env_start;
			if (len > buflen - res)
				len = buflen - res;
			res += access_process_vm(task, env_start,
						 buffer+res, len,
						 FOLL_FORCE);
			res = strnlen(buffer, res);
		}
	}
out_mm:
	mmput(mm);
out:
	return res;
}
Commit	Line	Data
	1	#include <linux/mm.h>
	2	#include <linux/slab.h>
	3	#include <linux/string.h>
	4	#include <linux/compiler.h>
	5	#include <linux/export.h>
	6	#include <linux/err.h>
	7	#include <linux/sched.h>
	8	#include <linux/sched/mm.h>
	9	#include <linux/sched/task_stack.h>
	10	#include <linux/security.h>
	11	#include <linux/swap.h>
	12	#include <linux/swapops.h>
	13	#include <linux/mman.h>
	14	#include <linux/hugetlb.h>
	15	#include <linux/vmalloc.h>
	16	#include <linux/userfaultfd_k.h>
	17
	18	#include <asm/sections.h>
	19	#include <linux/uaccess.h>
	20
	21	#include "internal.h"
	22
	23	static inline int is_kernel_rodata(unsigned long addr)
	24	{
	25	return addr >= (unsigned long)__start_rodata &&
	26	addr < (unsigned long)__end_rodata;
	27	}
	28
	29	/**
	30	* kfree_const - conditionally free memory
	31	* @x: pointer to the memory
	32	*
	33	* Function calls kfree only if @x is not in .rodata section.
	34	*/
	35	void kfree_const(const void *x)
	36	{
	37	if (!is_kernel_rodata((unsigned long)x))
	38	kfree(x);
	39	}
	40	EXPORT_SYMBOL(kfree_const);
	41
	42	/**
	43	* kstrdup - allocate space for and copy an existing string
	44	* @s: the string to duplicate
	45	* @gfp: the GFP mask used in the kmalloc() call when allocating memory
	46	*/
	47	char kstrdup(const char s, gfp_t gfp)
	48	{
	49	size_t len;
	50	char *buf;
	51
	52	if (!s)
	53	return NULL;
	54
	55	len = strlen(s) + 1;
	56	buf = kmalloc_track_caller(len, gfp);
	57	if (buf)
	58	memcpy(buf, s, len);
	59	return buf;
	60	}
	61	EXPORT_SYMBOL(kstrdup);
	62
	63	/**
	64	* kstrdup_const - conditionally duplicate an existing const string
	65	* @s: the string to duplicate
	66	* @gfp: the GFP mask used in the kmalloc() call when allocating memory
	67	*
	68	* Function returns source string if it is in .rodata section otherwise it
	69	* fallbacks to kstrdup.
	70	* Strings allocated by kstrdup_const should be freed by kfree_const.
	71	*/
	72	const char kstrdup_const(const char s, gfp_t gfp)
	73	{
	74	if (is_kernel_rodata((unsigned long)s))
	75	return s;
	76
	77	return kstrdup(s, gfp);
	78	}
	79	EXPORT_SYMBOL(kstrdup_const);
	80
	81	/**
	82	* kstrndup - allocate space for and copy an existing string
	83	* @s: the string to duplicate
	84	* @max: read at most @max chars from @s
	85	* @gfp: the GFP mask used in the kmalloc() call when allocating memory
	86	*
	87	* Note: Use kmemdup_nul() instead if the size is known exactly.
	88	*/
	89	char kstrndup(const char s, size_t max, gfp_t gfp)
	90	{
	91	size_t len;
	92	char *buf;
	93
	94	if (!s)
	95	return NULL;
	96
	97	len = strnlen(s, max);
	98	buf = kmalloc_track_caller(len+1, gfp);
	99	if (buf) {
	100	memcpy(buf, s, len);
	101	buf[len] = '\0';
	102	}
	103	return buf;
	104	}
	105	EXPORT_SYMBOL(kstrndup);
	106
	107	/**
	108	* kmemdup - duplicate region of memory
	109	*
	110	* @src: memory region to duplicate
	111	* @len: memory region length
	112	* @gfp: GFP mask to use
	113	*/
	114	void kmemdup(const void src, size_t len, gfp_t gfp)
	115	{
	116	void *p;
	117
	118	p = kmalloc_track_caller(len, gfp);
	119	if (p)
	120	memcpy(p, src, len);
	121	return p;
	122	}
	123	EXPORT_SYMBOL(kmemdup);
	124
	125	/**
	126	* kmemdup_nul - Create a NUL-terminated string from unterminated data
	127	* @s: The data to stringify
	128	* @len: The size of the data
	129	* @gfp: the GFP mask used in the kmalloc() call when allocating memory
	130	*/
	131	char kmemdup_nul(const char s, size_t len, gfp_t gfp)
	132	{
	133	char *buf;
	134
	135	if (!s)
	136	return NULL;
	137
	138	buf = kmalloc_track_caller(len + 1, gfp);
	139	if (buf) {
	140	memcpy(buf, s, len);
	141	buf[len] = '\0';
	142	}
	143	return buf;
	144	}
	145	EXPORT_SYMBOL(kmemdup_nul);
	146
	147	/**
	148	* memdup_user - duplicate memory region from user space
	149	*
	150	* @src: source address in user space
	151	* @len: number of bytes to copy
	152	*
	153	* Returns an ERR_PTR() on failure.
	154	*/
	155	void memdup_user(const void __user src, size_t len)
	156	{
	157	void *p;
	158
	159	/*
	160	* Always use GFP_KERNEL, since copy_from_user() can sleep and
	161	* cause pagefault, which makes it pointless to use GFP_NOFS
	162	* or GFP_ATOMIC.
	163	*/
	164	p = kmalloc_track_caller(len, GFP_KERNEL);
	165	if (!p)
	166	return ERR_PTR(-ENOMEM);
	167
	168	if (copy_from_user(p, src, len)) {
	169	kfree(p);
	170	return ERR_PTR(-EFAULT);
	171	}
	172
	173	return p;
	174	}
	175	EXPORT_SYMBOL(memdup_user);
	176
	177	/*
	178	* strndup_user - duplicate an existing string from user space
	179	* @s: The string to duplicate
	180	* @n: Maximum number of bytes to copy, including the trailing NUL.
	181	*/
	182	char strndup_user(const char __user s, long n)
	183	{
	184	char *p;
	185	long length;
	186
	187	length = strnlen_user(s, n);
	188
	189	if (!length)
	190	return ERR_PTR(-EFAULT);
	191
	192	if (length > n)
	193	return ERR_PTR(-EINVAL);
	194
	195	p = memdup_user(s, length);
	196
	197	if (IS_ERR(p))
	198	return p;
	199
	200	p[length - 1] = '\0';
	201
	202	return p;
	203	}
	204	EXPORT_SYMBOL(strndup_user);
	205
	206	/**
	207	* memdup_user_nul - duplicate memory region from user space and NUL-terminate
	208	*
	209	* @src: source address in user space
	210	* @len: number of bytes to copy
	211	*
	212	* Returns an ERR_PTR() on failure.
	213	*/
	214	void memdup_user_nul(const void __user src, size_t len)
	215	{
	216	char *p;
	217
	218	/*
	219	* Always use GFP_KERNEL, since copy_from_user() can sleep and
	220	* cause pagefault, which makes it pointless to use GFP_NOFS
	221	* or GFP_ATOMIC.
	222	*/
	223	p = kmalloc_track_caller(len + 1, GFP_KERNEL);
	224	if (!p)
	225	return ERR_PTR(-ENOMEM);
	226
	227	if (copy_from_user(p, src, len)) {
	228	kfree(p);
	229	return ERR_PTR(-EFAULT);
	230	}
	231	p[len] = '\0';
	232
	233	return p;
	234	}
	235	EXPORT_SYMBOL(memdup_user_nul);
	236
	237	void __vma_link_list(struct mm_struct mm, struct vm_area_struct vma,
	238	struct vm_area_struct prev, struct rb_node rb_parent)
	239	{
	240	struct vm_area_struct *next;
	241
	242	vma->vm_prev = prev;
	243	if (prev) {
	244	next = prev->vm_next;
	245	prev->vm_next = vma;
	246	} else {
	247	mm->mmap = vma;
	248	if (rb_parent)
	249	next = rb_entry(rb_parent,
	250	struct vm_area_struct, vm_rb);
	251	else
	252	next = NULL;
	253	}
	254	vma->vm_next = next;
	255	if (next)
	256	next->vm_prev = vma;
	257	}
	258
	259	/* Check if the vma is being used as a stack by this task */
	260	int vma_is_stack_for_current(struct vm_area_struct *vma)
	261	{
	262	struct task_struct * __maybe_unused t = current;
	263
	264	return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
	265	}
	266
	267	#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
	268	void arch_pick_mmap_layout(struct mm_struct *mm)
	269	{
	270	mm->mmap_base = TASK_UNMAPPED_BASE;
	271	mm->get_unmapped_area = arch_get_unmapped_area;
	272	}
	273	#endif
	274
	275	/*
	276	* Like get_user_pages_fast() except its IRQ-safe in that it won't fall
	277	* back to the regular GUP.
	278	* If the architecture not support this function, simply return with no
	279	* page pinned
	280	*/
	281	int __weak __get_user_pages_fast(unsigned long start,
	282	int nr_pages, int write, struct page **pages)
	283	{
	284	return 0;
	285	}
	286	EXPORT_SYMBOL_GPL(__get_user_pages_fast);
	287
	288	/**
	289	* get_user_pages_fast() - pin user pages in memory
	290	* @start: starting user address
	291	* @nr_pages: number of pages from start to pin
	292	* @write: whether pages will be written to
	293	* @pages: array that receives pointers to the pages pinned.
	294	* Should be at least nr_pages long.
	295	*
	296	* Returns number of pages pinned. This may be fewer than the number
	297	* requested. If nr_pages is 0 or negative, returns 0. If no pages
	298	* were pinned, returns -errno.
	299	*
	300	* get_user_pages_fast provides equivalent functionality to get_user_pages,
	301	* operating on current and current->mm, with force=0 and vma=NULL. However
	302	* unlike get_user_pages, it must be called without mmap_sem held.
	303	*
	304	* get_user_pages_fast may take mmap_sem and page table locks, so no
	305	* assumptions can be made about lack of locking. get_user_pages_fast is to be
	306	* implemented in a way that is advantageous (vs get_user_pages()) when the
	307	* user memory area is already faulted in and present in ptes. However if the
	308	* pages have to be faulted in, it may turn out to be slightly slower so
	309	* callers need to carefully consider what to use. On many architectures,
	310	* get_user_pages_fast simply falls back to get_user_pages.
	311	*/
	312	int __weak get_user_pages_fast(unsigned long start,
	313	int nr_pages, int write, struct page **pages)
	314	{
	315	return get_user_pages_unlocked(start, nr_pages, pages,
	316	write ? FOLL_WRITE : 0);
	317	}
	318	EXPORT_SYMBOL_GPL(get_user_pages_fast);
	319
	320	unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
	321	unsigned long len, unsigned long prot,
	322	unsigned long flag, unsigned long pgoff)
	323	{
	324	unsigned long ret;
	325	struct mm_struct *mm = current->mm;
	326	unsigned long populate;
	327	LIST_HEAD(uf);
	328
	329	ret = security_mmap_file(file, prot, flag);
	330	if (!ret) {
	331	if (down_write_killable(&mm->mmap_sem))
	332	return -EINTR;
	333	ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
	334	&populate, &uf);
	335	up_write(&mm->mmap_sem);
	336	userfaultfd_unmap_complete(mm, &uf);
	337	if (populate)
	338	mm_populate(ret, populate);
	339	}
	340	return ret;
	341	}
	342
	343	unsigned long vm_mmap(struct file *file, unsigned long addr,
	344	unsigned long len, unsigned long prot,
	345	unsigned long flag, unsigned long offset)
	346	{
	347	if (unlikely(offset + PAGE_ALIGN(len) < offset))
	348	return -EINVAL;
	349	if (unlikely(offset_in_page(offset)))
	350	return -EINVAL;
	351
	352	return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
	353	}
	354	EXPORT_SYMBOL(vm_mmap);
	355
	356	/**
	357	* kvmalloc_node - attempt to allocate physically contiguous memory, but upon
	358	* failure, fall back to non-contiguous (vmalloc) allocation.
	359	* @size: size of the request.
	360	* @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
	361	* @node: numa node to allocate from
	362	*
	363	* Uses kmalloc to get the memory but if the allocation fails then falls back
	364	* to the vmalloc allocator. Use kvfree for freeing the memory.
	365	*
	366	* Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported.
	367	* __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
	368	* preferable to the vmalloc fallback, due to visible performance drawbacks.
	369	*
	370	* Any use of gfp flags outside of GFP_KERNEL should be consulted with mm people.
	371	*/
	372	void *kvmalloc_node(size_t size, gfp_t flags, int node)
	373	{
	374	gfp_t kmalloc_flags = flags;
	375	void *ret;
	376
	377	/*
	378	* vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables)
	379	* so the given set of flags has to be compatible.
	380	*/
	381	WARN_ON_ONCE((flags & GFP_KERNEL) != GFP_KERNEL);
	382
	383	/*
	384	* We want to attempt a large physically contiguous block first because
	385	* it is less likely to fragment multiple larger blocks and therefore
	386	* contribute to a long term fragmentation less than vmalloc fallback.
	387	* However make sure that larger requests are not too disruptive - no
	388	* OOM killer and no allocation failure warnings as we have a fallback.
	389	*/
	390	if (size > PAGE_SIZE) {
	391	kmalloc_flags \|= __GFP_NOWARN;
	392
	393	if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL))
	394	kmalloc_flags \|= __GFP_NORETRY;
	395	}
	396
	397	ret = kmalloc_node(size, kmalloc_flags, node);
	398
	399	/*
	400	* It doesn't really make sense to fallback to vmalloc for sub page
	401	* requests
	402	*/
	403	if (ret \|\| size <= PAGE_SIZE)
	404	return ret;
	405
	406	return __vmalloc_node_flags_caller(size, node, flags,
	407	__builtin_return_address(0));
	408	}
	409	EXPORT_SYMBOL(kvmalloc_node);
	410
	411	void kvfree(const void *addr)
	412	{
	413	if (is_vmalloc_addr(addr))
	414	vfree(addr);
	415	else
	416	kfree(addr);
	417	}
	418	EXPORT_SYMBOL(kvfree);
	419
	420	static inline void __page_rmapping(struct page page)
	421	{
	422	unsigned long mapping;
	423
	424	mapping = (unsigned long)page->mapping;
	425	mapping &= ~PAGE_MAPPING_FLAGS;
	426
	427	return (void *)mapping;
	428	}
	429
	430	/* Neutral page->mapping pointer to address_space or anon_vma or other */
	431	void page_rmapping(struct page page)
	432	{
	433	page = compound_head(page);
	434	return __page_rmapping(page);
	435	}
	436
	437	/*
	438	* Return true if this page is mapped into pagetables.
	439	* For compound page it returns true if any subpage of compound page is mapped.
	440	*/
	441	bool page_mapped(struct page *page)
	442	{
	443	int i;
	444
	445	if (likely(!PageCompound(page)))
	446	return atomic_read(&page->_mapcount) >= 0;
	447	page = compound_head(page);
	448	if (atomic_read(compound_mapcount_ptr(page)) >= 0)
	449	return true;
	450	if (PageHuge(page))
	451	return false;
	452	for (i = 0; i < hpage_nr_pages(page); i++) {
	453	if (atomic_read(&page[i]._mapcount) >= 0)
	454	return true;
	455	}
	456	return false;
	457	}
	458	EXPORT_SYMBOL(page_mapped);
	459
	460	struct anon_vma page_anon_vma(struct page page)
	461	{
	462	unsigned long mapping;
	463
	464	page = compound_head(page);
	465	mapping = (unsigned long)page->mapping;
	466	if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
	467	return NULL;
	468	return __page_rmapping(page);
	469	}
	470
	471	struct address_space page_mapping(struct page page)
	472	{
	473	struct address_space *mapping;
	474
	475	page = compound_head(page);
	476
	477	/* This happens if someone calls flush_dcache_page on slab page */
	478	if (unlikely(PageSlab(page)))
	479	return NULL;
	480
	481	if (unlikely(PageSwapCache(page))) {
	482	swp_entry_t entry;
	483
	484	entry.val = page_private(page);
	485	return swap_address_space(entry);
	486	}
	487
	488	mapping = page->mapping;
	489	if ((unsigned long)mapping & PAGE_MAPPING_ANON)
	490	return NULL;
	491
	492	return (void *)((unsigned long)mapping & ~PAGE_MAPPING_FLAGS);
	493	}
	494	EXPORT_SYMBOL(page_mapping);
	495
	496	/* Slow path of page_mapcount() for compound pages */
	497	int __page_mapcount(struct page *page)
	498	{
	499	int ret;
	500
	501	ret = atomic_read(&page->_mapcount) + 1;
	502	/*
	503	* For file THP page->_mapcount contains total number of mapping
	504	* of the page: no need to look into compound_mapcount.
	505	*/
	506	if (!PageAnon(page) && !PageHuge(page))
	507	return ret;
	508	page = compound_head(page);
	509	ret += atomic_read(compound_mapcount_ptr(page)) + 1;
	510	if (PageDoubleMap(page))
	511	ret--;
	512	return ret;
	513	}
	514	EXPORT_SYMBOL_GPL(__page_mapcount);
	515
	516	int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
	517	int sysctl_overcommit_ratio __read_mostly = 50;
	518	unsigned long sysctl_overcommit_kbytes __read_mostly;
	519	int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
	520	unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
	521	unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
	522
	523	int overcommit_ratio_handler(struct ctl_table *table, int write,
	524	void __user buffer, size_t lenp,
	525	loff_t *ppos)
	526	{
	527	int ret;
	528
	529	ret = proc_dointvec(table, write, buffer, lenp, ppos);
	530	if (ret == 0 && write)
	531	sysctl_overcommit_kbytes = 0;
	532	return ret;
	533	}
	534
	535	int overcommit_kbytes_handler(struct ctl_table *table, int write,
	536	void __user buffer, size_t lenp,
	537	loff_t *ppos)
	538	{
	539	int ret;
	540
	541	ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
	542	if (ret == 0 && write)
	543	sysctl_overcommit_ratio = 0;
	544	return ret;
	545	}
	546
	547	/*
	548	* Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
	549	*/
	550	unsigned long vm_commit_limit(void)
	551	{
	552	unsigned long allowed;
	553
	554	if (sysctl_overcommit_kbytes)
	555	allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
	556	else
	557	allowed = ((totalram_pages - hugetlb_total_pages())
	558	* sysctl_overcommit_ratio / 100);
	559	allowed += total_swap_pages;
	560
	561	return allowed;
	562	}
	563
	564	/*
	565	* Make sure vm_committed_as in one cacheline and not cacheline shared with
	566	* other variables. It can be updated by several CPUs frequently.
	567	*/
	568	struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
	569
	570	/*
	571	* The global memory commitment made in the system can be a metric
	572	* that can be used to drive ballooning decisions when Linux is hosted
	573	* as a guest. On Hyper-V, the host implements a policy engine for dynamically
	574	* balancing memory across competing virtual machines that are hosted.
	575	* Several metrics drive this policy engine including the guest reported
	576	* memory commitment.
	577	*/
	578	unsigned long vm_memory_committed(void)
	579	{
	580	return percpu_counter_read_positive(&vm_committed_as);
	581	}
	582	EXPORT_SYMBOL_GPL(vm_memory_committed);
	583
	584	/*
	585	* Check that a process has enough memory to allocate a new virtual
	586	* mapping. 0 means there is enough memory for the allocation to
	587	* succeed and -ENOMEM implies there is not.
	588	*
	589	* We currently support three overcommit policies, which are set via the
	590	* vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting
	591	*
	592	* Strict overcommit modes added 2002 Feb 26 by Alan Cox.
	593	* Additional code 2002 Jul 20 by Robert Love.
	594	*
	595	* cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
	596	*
	597	* Note this is a helper function intended to be used by LSMs which
	598	* wish to use this logic.
	599	*/
	600	int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
	601	{
	602	long free, allowed, reserve;
	603
	604	VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) <
	605	-(s64)vm_committed_as_batch * num_online_cpus(),
	606	"memory commitment underflow");
	607
	608	vm_acct_memory(pages);
	609
	610	/*
	611	* Sometimes we want to use more memory than we have
	612	*/
	613	if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
	614	return 0;
	615
	616	if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
	617	free = global_page_state(NR_FREE_PAGES);
	618	free += global_node_page_state(NR_FILE_PAGES);
	619
	620	/*
	621	* shmem pages shouldn't be counted as free in this
	622	* case, they can't be purged, only swapped out, and
	623	* that won't affect the overall amount of available
	624	* memory in the system.
	625	*/
	626	free -= global_node_page_state(NR_SHMEM);
	627
	628	free += get_nr_swap_pages();
	629
	630	/*
	631	* Any slabs which are created with the
	632	* SLAB_RECLAIM_ACCOUNT flag claim to have contents
	633	* which are reclaimable, under pressure. The dentry
	634	* cache and most inode caches should fall into this
	635	*/
	636	free += global_node_page_state(NR_SLAB_RECLAIMABLE);
	637
	638	/*
	639	* Leave reserved pages. The pages are not for anonymous pages.
	640	*/
	641	if (free <= totalreserve_pages)
	642	goto error;
	643	else
	644	free -= totalreserve_pages;
	645
	646	/*
	647	* Reserve some for root
	648	*/
	649	if (!cap_sys_admin)
	650	free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
	651
	652	if (free > pages)
	653	return 0;
	654
	655	goto error;
	656	}
	657
	658	allowed = vm_commit_limit();
	659	/*
	660	* Reserve some for root
	661	*/
	662	if (!cap_sys_admin)
	663	allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
	664
	665	/*
	666	* Don't let a single process grow so big a user can't recover
	667	*/
	668	if (mm) {
	669	reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
	670	allowed -= min_t(long, mm->total_vm / 32, reserve);
	671	}
	672
	673	if (percpu_counter_read_positive(&vm_committed_as) < allowed)
	674	return 0;
	675	error:
	676	vm_unacct_memory(pages);
	677
	678	return -ENOMEM;
	679	}
	680
	681	/**
	682	* get_cmdline() - copy the cmdline value to a buffer.
	683	* @task: the task whose cmdline value to copy.
	684	* @buffer: the buffer to copy to.
	685	* @buflen: the length of the buffer. Larger cmdline values are truncated
	686	* to this length.
	687	* Returns the size of the cmdline field copied. Note that the copy does
	688	* not guarantee an ending NULL byte.
	689	*/
	690	int get_cmdline(struct task_struct task, char buffer, int buflen)
	691	{
	692	int res = 0;
	693	unsigned int len;
	694	struct mm_struct *mm = get_task_mm(task);
	695	unsigned long arg_start, arg_end, env_start, env_end;
	696	if (!mm)
	697	goto out;
	698	if (!mm->arg_end)
	699	goto out_mm; /* Shh! No looking before we're done */
	700
	701	down_read(&mm->mmap_sem);
	702	arg_start = mm->arg_start;
	703	arg_end = mm->arg_end;
	704	env_start = mm->env_start;
	705	env_end = mm->env_end;
	706	up_read(&mm->mmap_sem);
	707
	708	len = arg_end - arg_start;
	709
	710	if (len > buflen)
	711	len = buflen;
	712
	713	res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE);
	714
	715	/*
	716	* If the nul at the end of args has been overwritten, then
	717	* assume application is using setproctitle(3).
	718	*/
	719	if (res > 0 && buffer[res-1] != '\0' && len < buflen) {
	720	len = strnlen(buffer, res);
	721	if (len < res) {
	722	res = len;
	723	} else {
	724	len = env_end - env_start;
	725	if (len > buflen - res)
	726	len = buflen - res;
	727	res += access_process_vm(task, env_start,
	728	buffer+res, len,
	729	FOLL_FORCE);
	730	res = strnlen(buffer, res);
	731	}
	732	}
	733	out_mm:
	734	mmput(mm);
	735	out:
	736	return res;
	737	}