]> git.proxmox.com Git - mirror_ubuntu-hirsute-kernel.git/blob - arch/s390/mm/pgalloc.c
Merge tag 'scsi-misc' of git://git.kernel.org/pub/scm/linux/kernel/git/jejb/scsi
[mirror_ubuntu-hirsute-kernel.git] / arch / s390 / mm / pgalloc.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Page table allocation functions
4 *
5 * Copyright IBM Corp. 2016
6 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
7 */
8
9 #include <linux/sysctl.h>
10 #include <linux/slab.h>
11 #include <linux/mm.h>
12 #include <asm/mmu_context.h>
13 #include <asm/pgalloc.h>
14 #include <asm/gmap.h>
15 #include <asm/tlb.h>
16 #include <asm/tlbflush.h>
17
18 #ifdef CONFIG_PGSTE
19
/*
 * Lower and upper bound for the "allocate_pgste" sysctl; the sysctl table in
 * this file passes them to proc_dointvec_minmax() through extra1/extra2.
 * The lower bound relies on static zero-initialisation.
 */
static int page_table_allocate_pgste_min;
static int page_table_allocate_pgste_max = 1;

/* Value behind the "allocate_pgste" sysctl; starts out as 0. */
int page_table_allocate_pgste;
EXPORT_SYMBOL(page_table_allocate_pgste);
24
25 static struct ctl_table page_table_sysctl[] = {
26 {
27 .procname = "allocate_pgste",
28 .data = &page_table_allocate_pgste,
29 .maxlen = sizeof(int),
30 .mode = S_IRUGO | S_IWUSR,
31 .proc_handler = proc_dointvec_minmax,
32 .extra1 = &page_table_allocate_pgste_min,
33 .extra2 = &page_table_allocate_pgste_max,
34 },
35 { }
36 };
37
38 static struct ctl_table page_table_sysctl_dir[] = {
39 {
40 .procname = "vm",
41 .maxlen = 0,
42 .mode = 0555,
43 .child = page_table_sysctl,
44 },
45 { }
46 };
47
48 static int __init page_table_register_sysctl(void)
49 {
50 return register_sysctl_table(page_table_sysctl_dir) ? 0 : -ENOMEM;
51 }
52 __initcall(page_table_register_sysctl);
53
54 #endif /* CONFIG_PGSTE */
55
56 unsigned long *crst_table_alloc(struct mm_struct *mm)
57 {
58 struct page *page = alloc_pages(GFP_KERNEL, 2);
59
60 if (!page)
61 return NULL;
62 arch_set_page_dat(page, 2);
63 return (unsigned long *) page_to_phys(page);
64 }
65
66 void crst_table_free(struct mm_struct *mm, unsigned long *table)
67 {
68 free_pages((unsigned long) table, 2);
69 }
70
71 static void __crst_table_upgrade(void *arg)
72 {
73 struct mm_struct *mm = arg;
74
75 if (current->active_mm == mm)
76 set_user_asce(mm);
77 __tlb_flush_local();
78 }
79
80 int crst_table_upgrade(struct mm_struct *mm, unsigned long end)
81 {
82 unsigned long *table, *pgd;
83 int rc, notify;
84
85 /* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */
86 VM_BUG_ON(mm->context.asce_limit < _REGION2_SIZE);
87 rc = 0;
88 notify = 0;
89 while (mm->context.asce_limit < end) {
90 table = crst_table_alloc(mm);
91 if (!table) {
92 rc = -ENOMEM;
93 break;
94 }
95 spin_lock_bh(&mm->page_table_lock);
96 pgd = (unsigned long *) mm->pgd;
97 if (mm->context.asce_limit == _REGION2_SIZE) {
98 crst_table_init(table, _REGION2_ENTRY_EMPTY);
99 p4d_populate(mm, (p4d_t *) table, (pud_t *) pgd);
100 mm->pgd = (pgd_t *) table;
101 mm->context.asce_limit = _REGION1_SIZE;
102 mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
103 _ASCE_USER_BITS | _ASCE_TYPE_REGION2;
104 mm_inc_nr_puds(mm);
105 } else {
106 crst_table_init(table, _REGION1_ENTRY_EMPTY);
107 pgd_populate(mm, (pgd_t *) table, (p4d_t *) pgd);
108 mm->pgd = (pgd_t *) table;
109 mm->context.asce_limit = -PAGE_SIZE;
110 mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
111 _ASCE_USER_BITS | _ASCE_TYPE_REGION1;
112 }
113 notify = 1;
114 spin_unlock_bh(&mm->page_table_lock);
115 }
116 if (notify)
117 on_each_cpu(__crst_table_upgrade, mm, 0);
118 return rc;
119 }
120
121 void crst_table_downgrade(struct mm_struct *mm)
122 {
123 pgd_t *pgd;
124
125 /* downgrade should only happen from 3 to 2 levels (compat only) */
126 VM_BUG_ON(mm->context.asce_limit != _REGION2_SIZE);
127
128 if (current->active_mm == mm) {
129 clear_user_asce();
130 __tlb_flush_mm(mm);
131 }
132
133 pgd = mm->pgd;
134 mm_dec_nr_pmds(mm);
135 mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
136 mm->context.asce_limit = _REGION3_SIZE;
137 mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
138 _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT;
139 crst_table_free(mm, (unsigned long *) pgd);
140
141 if (current->active_mm == mm)
142 set_user_asce(mm);
143 }
144
145 static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
146 {
147 unsigned int old, new;
148
149 do {
150 old = atomic_read(v);
151 new = old ^ bits;
152 } while (atomic_cmpxchg(v, old, new) != old);
153 return new;
154 }
155
156 #ifdef CONFIG_PGSTE
157
158 struct page *page_table_alloc_pgste(struct mm_struct *mm)
159 {
160 struct page *page;
161 u64 *table;
162
163 page = alloc_page(GFP_KERNEL);
164 if (page) {
165 table = (u64 *)page_to_phys(page);
166 memset64(table, _PAGE_INVALID, PTRS_PER_PTE);
167 memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
168 }
169 return page;
170 }
171
/*
 * Hand @page back to the page allocator with __free_page().
 * NOTE(review): by name the counterpart of page_table_alloc_pgste() -
 * confirm against the callers.
 */
172 void page_table_free_pgste(struct page *page)
173 {
174 __free_page(page);
175 }
176
177 #endif /* CONFIG_PGSTE */
178
179 /*
180 * page table entry allocation/free routines.
181 */
/*
 * page_table_alloc - hand out a page table for @mm
 *
 * Without pgstes (mm_alloc_pgste() false) a page table is a 2K half of a 4K
 * page.  The first page on mm->context.pgtable_list is checked for a usable
 * half before a new page is allocated.  With pgstes a whole fresh page is
 * always used.
 *
 * Per-half state lives in page->_refcount from bit 24 upwards: bits 24/25
 * are toggled here and in page_table_free(); the pair four bits higher is
 * toggled by page_table_free_rcu() and __tlb_remove_table().
 *
 * Returns the page_to_phys() based address of the table, or NULL if no page
 * could be allocated or pgtable_page_ctor() failed.
 */
182 unsigned long *page_table_alloc(struct mm_struct *mm)
183 {
184 unsigned long *table;
185 struct page *page;
186 unsigned int mask, bit;
187
188 /* Try to get a fragment of a 4K page as a 2K page table */
189 if (!mm_alloc_pgste(mm)) {
190 table = NULL;
191 spin_lock_bh(&mm->context.lock);
192 if (!list_empty(&mm->context.pgtable_list)) {
193 page = list_first_entry(&mm->context.pgtable_list,
194 struct page, lru);
195 mask = atomic_read(&page->_refcount) >> 24;
/*
 * Fold the upper bit pair onto the lower one: a half is treated as
 * unavailable if either of its two bits is set.
 */
196 mask = (mask | (mask >> 4)) & 3;
/* mask == 3: neither half can be handed out, fall back to a fresh page */
197 if (mask != 3) {
198 table = (unsigned long *) page_to_phys(page);
199 bit = mask & 1; /* =1 -> second 2K */
200 if (bit)
201 table += PTRS_PER_PTE;
/* Mark the chosen half as in use and take the page off the list */
202 atomic_xor_bits(&page->_refcount,
203 1U << (bit + 24));
204 list_del(&page->lru);
205 }
206 }
207 spin_unlock_bh(&mm->context.lock);
208 if (table)
209 return table;
210 }
211 /* Allocate a fresh page */
212 page = alloc_page(GFP_KERNEL);
213 if (!page)
214 return NULL;
215 if (!pgtable_page_ctor(page)) {
216 __free_page(page);
217 return NULL;
218 }
219 arch_set_page_dat(page, 0);
220 /* Initialize page table */
221 table = (unsigned long *) page_to_phys(page);
222 if (mm_alloc_pgste(mm)) {
223 /* Return 4K page table with PGSTEs */
/* Both half bits are set; the page is not put on pgtable_list */
224 atomic_xor_bits(&page->_refcount, 3 << 24);
225 memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
226 memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
227 } else {
228 /* Return the first 2K fragment of the page */
229 atomic_xor_bits(&page->_refcount, 1 << 24);
/* Both halves are initialised now; the second one stays available */
230 memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE);
231 spin_lock_bh(&mm->context.lock);
232 list_add(&page->lru, &mm->context.pgtable_list);
233 spin_unlock_bh(&mm->context.lock);
234 }
235 return table;
236 }
237
/*
 * page_table_free - release a page table obtained from page_table_alloc()
 * @mm: address space the table belongs to
 * @table: table to release
 *
 * Without pgstes only the 2K half addressed by @table is released; the
 * backing page is freed once no state bits from bit 24 upwards remain set in
 * page->_refcount.  With pgstes both half bits are toggled and the page is
 * freed right away.
 */
238 void page_table_free(struct mm_struct *mm, unsigned long *table)
239 {
240 struct page *page;
241 unsigned int bit, mask;
242
243 page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
244 if (!mm_alloc_pgste(mm)) {
245 /* Free 2K page table fragment of a 4K page */
/* Index of the fragment within its page, derived from the page offset */
246 bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
247 spin_lock_bh(&mm->context.lock);
248 mask = atomic_xor_bits(&page->_refcount, 1U << (bit + 24));
249 mask >>= 24;
/*
 * Other half still in use: put the page on the list so the half just
 * released can be handed out again.  Otherwise take it off the list.
 */
250 if (mask & 3)
251 list_add(&page->lru, &mm->context.pgtable_list);
252 else
253 list_del(&page->lru);
254 spin_unlock_bh(&mm->context.lock);
/* Any remaining state bit (either pair) keeps the page alive */
255 if (mask != 0)
256 return;
257 } else {
258 atomic_xor_bits(&page->_refcount, 3U << 24);
259 }
260
261 pgtable_page_dtor(page);
262 __free_page(page);
263 }
264
/*
 * page_table_free_rcu - queue a page table for freeing via the mmu_gather
 * @tlb: gather structure; tlb->mm is the owning address space
 * @table: page table to release
 * @vmaddr: passed on to gmap_unlink() in the pgste case
 *
 * The table is not freed here.  Its address is tagged in the two low bits
 * and handed to tlb_remove_table(); __tlb_remove_table() decodes the tag:
 * 3 for a 4K table with pgstes, 1 or 2 for the lower/upper 2K fragment.
 */
265 void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
266 unsigned long vmaddr)
267 {
268 struct mm_struct *mm;
269 struct page *page;
270 unsigned int bit, mask;
271
272 mm = tlb->mm;
273 page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
274 if (mm_alloc_pgste(mm)) {
275 gmap_unlink(mm, table, vmaddr);
276 table = (unsigned long *) (__pa(table) | 3);
277 tlb_remove_table(tlb, table);
278 return;
279 }
/* Index of the fragment within its page, derived from the page offset */
280 bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
281 spin_lock_bh(&mm->context.lock);
/*
 * 0x11 toggles the fragment's bit in the lower pair and the matching bit
 * four positions higher; __tlb_remove_table() later toggles the upper one.
 */
282 mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
283 mask >>= 24;
284 if (mask & 3)
285 list_add_tail(&page->lru, &mm->context.pgtable_list);
286 else
287 list_del(&page->lru);
288 spin_unlock_bh(&mm->context.lock);
289 table = (unsigned long *) (__pa(table) | (1U << bit));
290 tlb_remove_table(tlb, table);
291 }
292
293 void __tlb_remove_table(void *_table)
294 {
295 unsigned int mask = (unsigned long) _table & 3;
296 void *table = (void *)((unsigned long) _table ^ mask);
297 struct page *page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
298
299 switch (mask) {
300 case 0: /* pmd, pud, or p4d */
301 free_pages((unsigned long) table, 2);
302 break;
303 case 1: /* lower 2K of a 4K page table */
304 case 2: /* higher 2K of a 4K page table */
305 mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24));
306 mask >>= 24;
307 if (mask != 0)
308 break;
309 /* fallthrough */
310 case 3: /* 4K page table with pgstes */
311 if (mask & 3)
312 atomic_xor_bits(&page->_refcount, 3 << 24);
313 pgtable_page_dtor(page);
314 __free_page(page);
315 break;
316 }
317 }
318
319 /*
320 * Base infrastructure required to generate basic asces, region, segment,
321 * and page tables that do not make use of enhanced features like EDAT1.
322 */
323
324 static struct kmem_cache *base_pgt_cache;
325
326 static unsigned long base_pgt_alloc(void)
327 {
328 u64 *table;
329
330 table = kmem_cache_alloc(base_pgt_cache, GFP_KERNEL);
331 if (table)
332 memset64(table, _PAGE_INVALID, PTRS_PER_PTE);
333 return (unsigned long) table;
334 }
335
336 static void base_pgt_free(unsigned long table)
337 {
338 kmem_cache_free(base_pgt_cache, (void *) table);
339 }
340
341 static unsigned long base_crst_alloc(unsigned long val)
342 {
343 unsigned long table;
344
345 table = __get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
346 if (table)
347 crst_table_init((unsigned long *)table, val);
348 return table;
349 }
350
/*
 * Free a region/segment table with the same order (CRST_ALLOC_ORDER) that
 * base_crst_alloc() allocates with.
 */
351 static void base_crst_free(unsigned long table)
352 {
353 free_pages(table, CRST_ALLOC_ORDER);
354 }
355
356 #define BASE_ADDR_END_FUNC(NAME, SIZE) \
357 static inline unsigned long base_##NAME##_addr_end(unsigned long addr, \
358 unsigned long end) \
359 { \
360 unsigned long next = (addr + (SIZE)) & ~((SIZE) - 1); \
361 \
362 return (next - 1) < (end - 1) ? next : end; \
363 }
364
365 BASE_ADDR_END_FUNC(page, _PAGE_SIZE)
366 BASE_ADDR_END_FUNC(segment, _SEGMENT_SIZE)
367 BASE_ADDR_END_FUNC(region3, _REGION3_SIZE)
368 BASE_ADDR_END_FUNC(region2, _REGION2_SIZE)
369 BASE_ADDR_END_FUNC(region1, _REGION1_SIZE)
370
/*
 * Run the "lra" instruction on @address and return the register result.
 * NOTE(review): presumably LOAD REAL ADDRESS, i.e. the result is the real
 * address (or page table entry origin data) for @address - confirm against
 * the architecture manual.  The condition code is declared as clobbered but
 * not evaluated, so a failed translation is not detected here.
 */
371 static inline unsigned long base_lra(unsigned long address)
372 {
373 unsigned long real;
374
375 asm volatile(
376 " lra %0,0(%1)\n"
377 : "=d" (real) : "a" (address) : "cc");
378 return real;
379 }
380
381 static int base_page_walk(unsigned long origin, unsigned long addr,
382 unsigned long end, int alloc)
383 {
384 unsigned long *pte, next;
385
386 if (!alloc)
387 return 0;
388 pte = (unsigned long *) origin;
389 pte += (addr & _PAGE_INDEX) >> _PAGE_SHIFT;
390 do {
391 next = base_page_addr_end(addr, end);
392 *pte = base_lra(addr);
393 } while (pte++, addr = next, addr < end);
394 return 0;
395 }
396
397 static int base_segment_walk(unsigned long origin, unsigned long addr,
398 unsigned long end, int alloc)
399 {
400 unsigned long *ste, next, table;
401 int rc;
402
403 ste = (unsigned long *) origin;
404 ste += (addr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
405 do {
406 next = base_segment_addr_end(addr, end);
407 if (*ste & _SEGMENT_ENTRY_INVALID) {
408 if (!alloc)
409 continue;
410 table = base_pgt_alloc();
411 if (!table)
412 return -ENOMEM;
413 *ste = table | _SEGMENT_ENTRY;
414 }
415 table = *ste & _SEGMENT_ENTRY_ORIGIN;
416 rc = base_page_walk(table, addr, next, alloc);
417 if (rc)
418 return rc;
419 if (!alloc)
420 base_pgt_free(table);
421 cond_resched();
422 } while (ste++, addr = next, addr < end);
423 return 0;
424 }
425
426 static int base_region3_walk(unsigned long origin, unsigned long addr,
427 unsigned long end, int alloc)
428 {
429 unsigned long *rtte, next, table;
430 int rc;
431
432 rtte = (unsigned long *) origin;
433 rtte += (addr & _REGION3_INDEX) >> _REGION3_SHIFT;
434 do {
435 next = base_region3_addr_end(addr, end);
436 if (*rtte & _REGION_ENTRY_INVALID) {
437 if (!alloc)
438 continue;
439 table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
440 if (!table)
441 return -ENOMEM;
442 *rtte = table | _REGION3_ENTRY;
443 }
444 table = *rtte & _REGION_ENTRY_ORIGIN;
445 rc = base_segment_walk(table, addr, next, alloc);
446 if (rc)
447 return rc;
448 if (!alloc)
449 base_crst_free(table);
450 } while (rtte++, addr = next, addr < end);
451 return 0;
452 }
453
454 static int base_region2_walk(unsigned long origin, unsigned long addr,
455 unsigned long end, int alloc)
456 {
457 unsigned long *rste, next, table;
458 int rc;
459
460 rste = (unsigned long *) origin;
461 rste += (addr & _REGION2_INDEX) >> _REGION2_SHIFT;
462 do {
463 next = base_region2_addr_end(addr, end);
464 if (*rste & _REGION_ENTRY_INVALID) {
465 if (!alloc)
466 continue;
467 table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
468 if (!table)
469 return -ENOMEM;
470 *rste = table | _REGION2_ENTRY;
471 }
472 table = *rste & _REGION_ENTRY_ORIGIN;
473 rc = base_region3_walk(table, addr, next, alloc);
474 if (rc)
475 return rc;
476 if (!alloc)
477 base_crst_free(table);
478 } while (rste++, addr = next, addr < end);
479 return 0;
480 }
481
482 static int base_region1_walk(unsigned long origin, unsigned long addr,
483 unsigned long end, int alloc)
484 {
485 unsigned long *rfte, next, table;
486 int rc;
487
488 rfte = (unsigned long *) origin;
489 rfte += (addr & _REGION1_INDEX) >> _REGION1_SHIFT;
490 do {
491 next = base_region1_addr_end(addr, end);
492 if (*rfte & _REGION_ENTRY_INVALID) {
493 if (!alloc)
494 continue;
495 table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
496 if (!table)
497 return -ENOMEM;
498 *rfte = table | _REGION1_ENTRY;
499 }
500 table = *rfte & _REGION_ENTRY_ORIGIN;
501 rc = base_region2_walk(table, addr, next, alloc);
502 if (rc)
503 return rc;
504 if (!alloc)
505 base_crst_free(table);
506 } while (rfte++, addr = next, addr < end);
507 return 0;
508 }
509
510 /**
511 * base_asce_free - free asce and tables returned from base_asce_alloc()
512 * @asce: asce to be freed
513 *
514 * Frees all region, segment, and page tables that were allocated with a
515 * corresponding base_asce_alloc() call.
516 */
517 void base_asce_free(unsigned long asce)
518 {
519 unsigned long table = asce & _ASCE_ORIGIN;
520
521 if (!asce)
522 return;
523 switch (asce & _ASCE_TYPE_MASK) {
524 case _ASCE_TYPE_SEGMENT:
525 base_segment_walk(table, 0, _REGION3_SIZE, 0);
526 break;
527 case _ASCE_TYPE_REGION3:
528 base_region3_walk(table, 0, _REGION2_SIZE, 0);
529 break;
530 case _ASCE_TYPE_REGION2:
531 base_region2_walk(table, 0, _REGION1_SIZE, 0);
532 break;
533 case _ASCE_TYPE_REGION1:
534 base_region1_walk(table, 0, -_PAGE_SIZE, 0);
535 break;
536 }
537 base_crst_free(table);
538 }
539
540 static int base_pgt_cache_init(void)
541 {
542 static DEFINE_MUTEX(base_pgt_cache_mutex);
543 unsigned long sz = _PAGE_TABLE_SIZE;
544
545 if (base_pgt_cache)
546 return 0;
547 mutex_lock(&base_pgt_cache_mutex);
548 if (!base_pgt_cache)
549 base_pgt_cache = kmem_cache_create("base_pgt", sz, sz, 0, NULL);
550 mutex_unlock(&base_pgt_cache_mutex);
551 return base_pgt_cache ? 0 : -ENOMEM;
552 }
553
554 /**
555 * base_asce_alloc - create kernel mapping without enhanced DAT features
556 * @addr: virtual start address of kernel mapping
557 * @num_pages: number of consecutive pages
558 *
559 * Generate an asce, including all required region, segment and page tables,
560 * that can be used to access the virtual kernel mapping. The difference is
561 * that the returned asce does not make use of any enhanced DAT features like
562 * e.g. large pages. This is required for some I/O functions that pass an
563 * asce, like e.g. some service call requests.
564 *
565 * Note: the returned asce may NEVER be attached to any cpu. It may only be
566 * used for I/O requests. tlb entries that might result because the
567 * asce was attached to a cpu won't be cleared.
568 */
569 unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages)
570 {
571 unsigned long asce, table, end;
572 int rc;
573
574 if (base_pgt_cache_init())
575 return 0;
576 end = addr + num_pages * PAGE_SIZE;
577 if (end <= _REGION3_SIZE) {
578 table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
579 if (!table)
580 return 0;
581 rc = base_segment_walk(table, addr, end, 1);
582 asce = table | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH;
583 } else if (end <= _REGION2_SIZE) {
584 table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
585 if (!table)
586 return 0;
587 rc = base_region3_walk(table, addr, end, 1);
588 asce = table | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
589 } else if (end <= _REGION1_SIZE) {
590 table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
591 if (!table)
592 return 0;
593 rc = base_region2_walk(table, addr, end, 1);
594 asce = table | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
595 } else {
596 table = base_crst_alloc(_REGION1_ENTRY_EMPTY);
597 if (!table)
598 return 0;
599 rc = base_region1_walk(table, addr, end, 1);
600 asce = table | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH;
601 }
602 if (rc) {
603 base_asce_free(asce);
604 asce = 0;
605 }
606 return asce;
607 }