switch (type) {
case ACL_TYPE_ACCESS:
name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
- if (acl) {
- umode_t mode;
-
- ret = posix_acl_update_mode(inode, &mode, &acl);
- if (ret)
- return ret;
-
- ret = ocfs2_acl_set_mode(inode, di_bh,
- handle, mode);
- if (ret)
- return ret;
- }
break;
case ACL_TYPE_DEFAULT:
name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh);
if (had_lock < 0)
return had_lock;
+ if (type == ACL_TYPE_ACCESS && acl) {
+ umode_t mode;
+
+ status = posix_acl_update_mode(inode, &mode, &acl);
+ if (status)
+ goto unlock;
+
+ status = ocfs2_acl_set_mode(inode, bh, NULL, mode);
+ if (status)
+ goto unlock;
+ }
status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL);
+unlock:
ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);
brelse(bh);
return status;
__wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, &range);
spin_unlock(&ctx->fault_pending_wqh.lock);
+ /* Flush pending events that may still wait on event_wqh */
+ wake_up_all(&ctx->event_wqh);
+
wake_up_poll(&ctx->fd_wqh, POLLHUP);
userfaultfd_ctx_put(ctx);
return 0;
ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
uffdio_zeropage.range.len);
mmput(ctx->mm);
+ } else {
+ return -ENOSPC;
}
if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
return -EFAULT;
#ifdef CONFIG_CPUSETS
+/*
+ * Static branch rewrites can happen in an arbitrary order for a given
+ * key. In code paths where we need to loop with read_mems_allowed_begin() and
+ * read_mems_allowed_retry() to get a consistent view of mems_allowed, we need
+ * to ensure that begin() always gets rewritten before retry() in the
+ * disabled -> enabled transition. If not, then if local irqs are disabled
+ * around the loop, we can deadlock since retry() would always be
+ * comparing the latest value of the mems_allowed seqcount against 0 as
+ * begin() still would see cpusets_enabled() as false. The enabled -> disabled
+ * transition should happen in reverse order for the same reasons (want to stop
+ * looking at real value of mems_allowed.sequence in retry() first).
+ */
+extern struct static_key_false cpusets_pre_enable_key;
extern struct static_key_false cpusets_enabled_key;
static inline bool cpusets_enabled(void)
{
static inline void cpuset_inc(void)
{
+ static_branch_inc(&cpusets_pre_enable_key); /* key tested by read_mems_allowed_begin() goes first */
static_branch_inc(&cpusets_enabled_key);
}
static inline void cpuset_dec(void)
{
static_branch_dec(&cpusets_enabled_key);
+ static_branch_dec(&cpusets_pre_enable_key); /* key tested by read_mems_allowed_begin() goes last */
}
extern int cpuset_init(void);
*/
static inline unsigned int read_mems_allowed_begin(void)
{
- if (!cpusets_enabled())
+ if (!static_branch_unlikely(&cpusets_pre_enable_key))
return 0;
return read_seqcount_begin(¤t->mems_allowed_seq);
*/
static inline bool read_mems_allowed_retry(unsigned int seq)
{
- if (!cpusets_enabled())
+ if (!static_branch_unlikely(&cpusets_enabled_key))
return false;
return read_seqcount_retry(¤t->mems_allowed_seq, seq);
* @threadfn: the function to run in the thread
* @data: data pointer for @threadfn()
* @namefmt: printf-style format string for the thread name
- * @...: arguments for @namefmt.
+ * @arg...: arguments for @namefmt.
*
* This macro will create a kthread on the current node, leaving it in
* the stopped state. This is just a helper for kthread_create_on_node();
* PROT_NONE or PROT_NUMA mapped page.
*/
bool tlb_flush_pending;
+#endif
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+ /* See flush_tlb_batched_pending() */
+ bool tlb_flush_batched;
#endif
struct uprobes_state uprobes_state;
#ifdef CONFIG_HUGETLB_PAGE
*/
static inline int page_cache_get_speculative(struct page *page)
{
- VM_BUG_ON(in_interrupt());
-
#ifdef CONFIG_TINY_RCU
# ifdef CONFIG_PREEMPT_COUNT
VM_BUG_ON(!in_atomic() && !irqs_disabled());
static int sysvipc_msg_proc_show(struct seq_file *s, void *it)
{
struct user_namespace *user_ns = seq_user_ns(s);
- struct msg_queue *msq = it;
+ struct kern_ipc_perm *ipcp = it;
+ struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm);
seq_printf(s,
"%10d %10d %4o %10lu %10lu %5u %5u %5u %5u %5u %5u %10lu %10lu %10lu\n",
static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
{
struct user_namespace *user_ns = seq_user_ns(s);
- struct sem_array *sma = it;
+ struct kern_ipc_perm *ipcp = it;
+ struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm);
time_t sem_otime;
/*
static int sysvipc_shm_proc_show(struct seq_file *s, void *it)
{
struct user_namespace *user_ns = seq_user_ns(s);
- struct shmid_kernel *shp = it;
+ struct kern_ipc_perm *ipcp = it;
+ struct shmid_kernel *shp;
unsigned long rss = 0, swp = 0;
+ shp = container_of(ipcp, struct shmid_kernel, shm_perm);
shm_add_rss_swap(shp, &rss, &swp);
#if BITS_PER_LONG <= 32
#include <linux/cgroup.h>
#include <linux/wait.h>
+DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
/* See "Frequency meter" comments, below. */
*/
void __init pidhash_init(void)
{
- unsigned int pidhash_size;
-
pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
HASH_EARLY | HASH_SMALL | HASH_ZERO,
&pidhash_shift, NULL,
0, 4096);
- pidhash_size = 1U << pidhash_shift;
}
void __init pidmap_init(void)
unsigned long vaddr = *position;
unsigned long remainder = *nr_pages;
struct hstate *h = hstate_vma(vma);
+ int err = -EFAULT;
while (vaddr < vma->vm_end && remainder) {
pte_t *pte;
}
ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
if (ret & VM_FAULT_ERROR) {
- int err = vm_fault_to_errno(ret, flags);
-
- if (err)
- return err;
-
+ err = vm_fault_to_errno(ret, flags);
remainder = 0;
break;
}
*/
*position = vaddr;
- return i ? i : -EFAULT;
+ return i ? i : err;
}
#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
void try_to_unmap_flush(void);
void try_to_unmap_flush_dirty(void);
+void flush_tlb_batched_pending(struct mm_struct *mm);
#else
static inline void try_to_unmap_flush(void)
{
static inline void try_to_unmap_flush_dirty(void)
{
}
-
+static inline void flush_tlb_batched_pending(struct mm_struct *mm)
+{
+}
#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
extern const struct trace_print_flags pageflag_names[];
disable_trace_on_warning();
info.access_addr = (void *)addr;
+ info.first_bad_addr = (void *)addr;
info.access_size = size;
info.is_write = is_write;
info.ip = ip;
tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
for (; addr != end; pte++, addr += PAGE_SIZE) {
ptent = *pte;
init_rss_vec(rss);
start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
pte = start_pte;
+ flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
do {
pte_t ptent = *pte;
atomic_read(&vma->vm_mm->mm_users) == 1)
target_node = numa_node_id();
+ flush_tlb_batched_pending(vma->vm_mm);
arch_enter_lazy_mmu_mode();
do {
oldpte = *pte;
new_ptl = pte_lockptr(mm, new_pmd);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+ flush_tlb_batched_pending(vma->vm_mm);
arch_enter_lazy_mmu_mode();
for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
unsigned long new_addr, unsigned long new_len, bool *locked,
struct vm_userfaultfd_ctx *uf,
+ struct list_head *uf_unmap_early,
struct list_head *uf_unmap)
{
struct mm_struct *mm = current->mm;
if (addr + old_len > new_addr && new_addr + new_len > addr)
goto out;
- ret = do_munmap(mm, new_addr, new_len, NULL);
+ ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
if (ret)
goto out;
unsigned long charged = 0;
bool locked = false;
struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
+ LIST_HEAD(uf_unmap_early);
LIST_HEAD(uf_unmap);
if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
if (flags & MREMAP_FIXED) {
ret = mremap_to(addr, old_len, new_addr, new_len,
- &locked, &uf, &uf_unmap);
+ &locked, &uf, &uf_unmap_early, &uf_unmap);
goto out;
}
up_write(¤t->mm->mmap_sem);
if (locked && new_len > old_len)
mm_populate(new_addr + old_len, new_len - old_len);
+ userfaultfd_unmap_complete(mm, &uf_unmap_early);
mremap_userfaultfd_complete(&uf, addr, new_addr, old_len);
userfaultfd_unmap_complete(mm, &uf_unmap);
return ret;
NUMA_ZONELIST_ORDER_LEN);
user_zonelist_order = oldval;
} else if (oldval != user_zonelist_order) {
+ mem_hotplug_begin();
mutex_lock(&zonelists_mutex);
build_all_zonelists(NULL, NULL);
mutex_unlock(&zonelists_mutex);
+ mem_hotplug_done();
}
}
out:
#include <linux/frontswap.h>
#include <linux/blkdev.h>
#include <linux/uio.h>
+#include <linux/sched/task.h>
#include <asm/pgtable.h>
static struct bio *get_swap_bio(gfp_t gfp_flags,
WRITE_ONCE(bio->bi_private, NULL);
bio_put(bio);
wake_up_process(waiter);
+ put_task_struct(waiter);
}
int generic_swapfile_activate(struct swap_info_struct *sis,
goto out;
}
bdev = bio->bi_bdev;
+ /*
+ * Keep this task valid during swap readpage because the oom killer may
+ * attempt to access it in the page fault retry time check.
+ */
+ get_task_struct(current);
bio->bi_private = current;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
count_vm_event(PSWPIN);
arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
tlb_ubc->flush_required = true;
+ /*
+ * Ensure compiler does not re-order the setting of tlb_flush_batched
+ * before the PTE is cleared.
+ */
+ barrier();
+ mm->tlb_flush_batched = true;
+
/*
* If the PTE was dirty then it's best to assume it's writable. The
* caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
return should_defer;
}
+
+/*
+ * Reclaim unmaps pages under the PTL but does not flush the TLB prior to
+ * releasing the PTL if TLB flushes are batched. It's possible for a parallel
+ * operation such as mprotect or munmap to race between reclaim unmapping
+ * the page and flushing the page. If this race occurs, it potentially allows
+ * access to data via a stale TLB entry. Tracking all mm's that have TLB
+ * batching in flight would be expensive during reclaim so instead track
+ * whether TLB batching occurred in the past and if so then do a flush here
+ * if required. This will cost one additional flush per reclaim cycle paid
+ * by the first operation at risk such as mprotect and munmap.
+ *
+ * This must be called under the PTL so that an access to tlb_flush_batched
+ * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise
+ * via the PTL.
+ */
+void flush_tlb_batched_pending(struct mm_struct *mm)
+{
+ if (mm->tlb_flush_batched) {
+ flush_tlb_mm(mm);
+
+ /*
+ * Do not allow the compiler to re-order the clearing of
+ * tlb_flush_batched before the TLB is flushed.
+ */
+ barrier();
+ mm->tlb_flush_batched = false;
+ }
+}
#else
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
{
}
destroy_cache(pool);
- kfree(pool->size_class);
kfree(pool->name);
kfree(pool);
}