mm: Do not flush TLB during protection change if !pte_present && !migration_entry
/*
 *  mm/mprotect.c
 *
 *  (C) Copyright 1994 Linus Torvalds
 *  (C) Copyright 2002 Christoph Hellwig
 *
 *  Address space accounting code <alan@lxorguk.ukuu.org.uk>
 *  (C) Copyright 2002 Red Hat Inc, All Rights Reserved
 */

#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/mempolicy.h>
#include <linux/personality.h>
#include <linux/syscalls.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/perf_event.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

#ifndef pgprot_modify
static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
{
        return newprot;
}
#endif

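/*
 * Change protections on the PTEs covering [addr, end) under one PMD.
 * Returns the number of entries that were updated; *ret_all_same_node
 * reports whether every mapped page examined was on the same NUMA node,
 * which the caller uses to batch NUMA hinting faults at PMD granularity.
 */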
static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long addr, unsigned long end, pgprot_t newprot,
                int dirty_accountable, int prot_numa, bool *ret_all_same_node)
{
        struct mm_struct *mm = vma->vm_mm;
        pte_t *pte, oldpte;
        spinlock_t *ptl;
        unsigned long pages = 0;
        bool all_same_node = true;
        int last_nid = -1;

        pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
        arch_enter_lazy_mmu_mode();
        do {
                oldpte = *pte;
                if (pte_present(oldpte)) {
                        pte_t ptent;
                        bool updated = false;

                        ptent = ptep_modify_prot_start(mm, addr, pte);
                        if (!prot_numa) {
                                ptent = pte_modify(ptent, newprot);
                                updated = true;
                        } else {
                                struct page *page;

                                page = vm_normal_page(vma, addr, oldpte);
                                if (page) {
                                        int this_nid = page_to_nid(page);
                                        if (last_nid == -1)
                                                last_nid = this_nid;
                                        if (last_nid != this_nid)
                                                all_same_node = false;

                                        /* only check non-shared pages */
                                        if (!pte_numa(oldpte) &&
                                            page_mapcount(page) == 1) {
                                                ptent = pte_mknuma(ptent);
                                                updated = true;
                                        }
                                }
                        }

                        /*
                         * Avoid taking write faults for pages we know to be
                         * dirty.
                         */
                        if (dirty_accountable && pte_dirty(ptent)) {
                                ptent = pte_mkwrite(ptent);
                                updated = true;
                        }

                        if (updated)
                                pages++;
                        ptep_modify_prot_commit(mm, addr, pte, ptent);
                } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
                        swp_entry_t entry = pte_to_swp_entry(oldpte);

                        if (is_write_migration_entry(entry)) {
                                /*
                                 * A protection check is difficult so
                                 * just be safe and disable write
                                 */
                                make_migration_entry_read(&entry);
                                set_pte_at(mm, addr, pte,
                                           swp_entry_to_pte(entry));

                                pages++;
                        }
                }
        } while (pte++, addr += PAGE_SIZE, addr != end);
        arch_leave_lazy_mmu_mode();
        pte_unmap_unlock(pte - 1, ptl);

        *ret_all_same_node = all_same_node;
        return pages;
}

#ifdef CONFIG_NUMA_BALANCING
static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
                                       pmd_t *pmd)
{
        spin_lock(&mm->page_table_lock);
        set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd));
        spin_unlock(&mm->page_table_lock);
}
#else
static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
                                       pmd_t *pmd)
{
        BUG();
}
#endif /* CONFIG_NUMA_BALANCING */

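/*
 * Walk the PMDs covering [addr, end) under one PUD: split transparent huge
 * pages that only partly overlap the range, change whole huge PMDs in
 * place, and descend into change_pte_range() for regular entries.
 */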
static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
                pud_t *pud, unsigned long addr, unsigned long end,
                pgprot_t newprot, int dirty_accountable, int prot_numa)
{
        pmd_t *pmd;
        unsigned long next;
        unsigned long pages = 0;
        bool all_same_node;

        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
                if (pmd_trans_huge(*pmd)) {
                        if (next - addr != HPAGE_PMD_SIZE)
                                split_huge_page_pmd(vma, addr, pmd);
                        else if (change_huge_pmd(vma, pmd, addr, newprot,
                                                 prot_numa)) {
                                pages++;
                                continue;
                        }
                        /* fall through */
                }
                if (pmd_none_or_clear_bad(pmd))
                        continue;
                pages += change_pte_range(vma, pmd, addr, next, newprot,
                                dirty_accountable, prot_numa, &all_same_node);

                /*
                 * If we are changing protections for NUMA hinting faults then
                 * set pmd_numa if the examined pages were all on the same
                 * node. This allows a regular PMD to be handled as one fault
                 * and effectively batches the taking of the PTL
                 */
                if (prot_numa && all_same_node)
                        change_pmd_protnuma(vma->vm_mm, addr, pmd);
        } while (pmd++, addr = next, addr != end);

        return pages;
}

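/*
 * Walk the PUDs covering [addr, end) under one PGD and update each
 * populated PMD range. Returns the number of entries updated.
 */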
static inline unsigned long change_pud_range(struct vm_area_struct *vma,
                pgd_t *pgd, unsigned long addr, unsigned long end,
                pgprot_t newprot, int dirty_accountable, int prot_numa)
{
        pud_t *pud;
        unsigned long next;
        unsigned long pages = 0;

        pud = pud_offset(pgd, addr);
        do {
                next = pud_addr_end(addr, end);
                if (pud_none_or_clear_bad(pud))
                        continue;
                pages += change_pmd_range(vma, pud, addr, next, newprot,
                                dirty_accountable, prot_numa);
        } while (pud++, addr = next, addr != end);

        return pages;
}

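/*
 * Walk the whole page-table hierarchy for [addr, end) and apply newprot.
 * The TLB is flushed once at the end, and only if any entry was changed.
 */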
static unsigned long change_protection_range(struct vm_area_struct *vma,
                unsigned long addr, unsigned long end, pgprot_t newprot,
                int dirty_accountable, int prot_numa)
{
        struct mm_struct *mm = vma->vm_mm;
        pgd_t *pgd;
        unsigned long next;
        unsigned long start = addr;
        unsigned long pages = 0;

        BUG_ON(addr >= end);
        pgd = pgd_offset(mm, addr);
        flush_cache_range(vma, addr, end);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd))
                        continue;
                pages += change_pud_range(vma, pgd, addr, next, newprot,
                                dirty_accountable, prot_numa);
        } while (pgd++, addr = next, addr != end);

        /* Only flush the TLB if we actually modified any entries: */
        if (pages)
                flush_tlb_range(vma, start, end);

        return pages;
}

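/*
 * Common entry point for protection changes: wraps the page-table walk in
 * mmu_notifier invalidation callbacks and hands hugetlb VMAs to their own
 * helper. Returns the number of entries updated.
 */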
unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
                       unsigned long end, pgprot_t newprot,
                       int dirty_accountable, int prot_numa)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long pages;

        mmu_notifier_invalidate_range_start(mm, start, end);
        if (is_vm_hugetlb_page(vma))
                pages = hugetlb_change_protection(vma, start, end, newprot);
        else
                pages = change_protection_range(vma, start, end, newprot,
                                dirty_accountable, prot_numa);
        mmu_notifier_invalidate_range_end(mm, start, end);

        return pages;
}

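/*
 * Apply newflags to the portion [start, end) of vma: charge commit if a
 * private mapping becomes writable, merge with or split off neighbouring
 * VMAs as needed, then rewrite the page protections and update accounting.
 */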
int
mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
        unsigned long start, unsigned long end, unsigned long newflags)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long oldflags = vma->vm_flags;
        long nrpages = (end - start) >> PAGE_SHIFT;
        unsigned long charged = 0;
        pgoff_t pgoff;
        int error;
        int dirty_accountable = 0;

        if (newflags == oldflags) {
                *pprev = vma;
                return 0;
        }

        /*
         * If we make a private mapping writable we increase our commit;
         * but (without finer accounting) cannot reduce our commit if we
         * make it unwritable again. hugetlb mappings were accounted for
         * even if read-only so there is no need to account for them here.
         */
        if (newflags & VM_WRITE) {
                if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
                                  VM_SHARED|VM_NORESERVE))) {
                        charged = nrpages;
                        if (security_vm_enough_memory_mm(mm, charged))
                                return -ENOMEM;
                        newflags |= VM_ACCOUNT;
                }
        }

        /*
         * First try to merge with previous and/or next vma.
         */
        pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
        *pprev = vma_merge(mm, *pprev, start, end, newflags,
                        vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
        if (*pprev) {
                vma = *pprev;
                goto success;
        }

        *pprev = vma;

        if (start != vma->vm_start) {
                error = split_vma(mm, vma, start, 1);
                if (error)
                        goto fail;
        }

        if (end != vma->vm_end) {
                error = split_vma(mm, vma, end, 0);
                if (error)
                        goto fail;
        }

success:
        /*
         * vm_flags and vm_page_prot are protected by the mmap_sem
         * held in write mode.
         */
        vma->vm_flags = newflags;
        vma->vm_page_prot = pgprot_modify(vma->vm_page_prot,
                                          vm_get_page_prot(newflags));

        if (vma_wants_writenotify(vma)) {
                vma->vm_page_prot = vm_get_page_prot(newflags & ~VM_SHARED);
                dirty_accountable = 1;
        }

        change_protection(vma, start, end, vma->vm_page_prot,
                          dirty_accountable, 0);

        vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
        vm_stat_account(mm, newflags, vma->vm_file, nrpages);
        perf_event_mmap(vma);
        return 0;

fail:
        vm_unacct_memory(charged);
        return error;
}

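/*
 * mprotect(2): validate the arguments, then walk the VMAs covering
 * [start, start+len) and apply the new protection to each one via
 * mprotect_fixup(), honouring PROT_GROWSDOWN/PROT_GROWSUP.
 */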
SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
                unsigned long, prot)
{
        unsigned long vm_flags, nstart, end, tmp, reqprot;
        struct vm_area_struct *vma, *prev;
        int error = -EINVAL;
        const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
        prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
        if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
                return -EINVAL;

        if (start & ~PAGE_MASK)
                return -EINVAL;
        if (!len)
                return 0;
        len = PAGE_ALIGN(len);
        end = start + len;
        if (end <= start)
                return -ENOMEM;
        if (!arch_validate_prot(prot))
                return -EINVAL;

        reqprot = prot;
        /*
         * Does the application expect PROT_READ to imply PROT_EXEC?
         */
        if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
                prot |= PROT_EXEC;

        vm_flags = calc_vm_prot_bits(prot);

        down_write(&current->mm->mmap_sem);

        vma = find_vma(current->mm, start);
        error = -ENOMEM;
        if (!vma)
                goto out;
        prev = vma->vm_prev;
        if (unlikely(grows & PROT_GROWSDOWN)) {
                if (vma->vm_start >= end)
                        goto out;
                start = vma->vm_start;
                error = -EINVAL;
                if (!(vma->vm_flags & VM_GROWSDOWN))
                        goto out;
        } else {
                if (vma->vm_start > start)
                        goto out;
                if (unlikely(grows & PROT_GROWSUP)) {
                        end = vma->vm_end;
                        error = -EINVAL;
                        if (!(vma->vm_flags & VM_GROWSUP))
                                goto out;
                }
        }
        if (start > vma->vm_start)
                prev = vma;

        for (nstart = start ; ; ) {
                unsigned long newflags;

                /* Here we know that vma->vm_start <= nstart < vma->vm_end. */

                newflags = vm_flags;
                newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));

                /* newflags >> 4 shift VM_MAY% in place of VM_% */
                if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {
                        error = -EACCES;
                        goto out;
                }

                error = security_file_mprotect(vma, reqprot, prot);
                if (error)
                        goto out;

                tmp = vma->vm_end;
                if (tmp > end)
                        tmp = end;
                error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
                if (error)
                        goto out;
                nstart = tmp;

                if (nstart < prev->vm_end)
                        nstart = prev->vm_end;
                if (nstart >= end)
                        goto out;

                vma = prev->vm_next;
                if (!vma || vma->vm_start != nstart) {
                        error = -ENOMEM;
                        goto out;
                }
        }
out:
        up_write(&current->mm->mmap_sem);
        return error;
}