/*
 * Xen leaves the responsibility for maintaining p2m mappings to the
 * guests themselves, but it must also access and update the p2m array
 * during suspend/resume when all the pages are reallocated.
 *
 * The p2m table is logically a flat array, but we implement it as a
 * three-level tree to allow the address space to be sparse.
 *
 *                               Xen
 *                                |
 *      p2m_top              p2m_top_mfn
 *        /   \                  /    \
 * p2m_mid p2m_mid      p2m_mid_mfn p2m_mid_mfn
 *    / \     / \            /          /
 *  p2m p2m p2m p2m     p2m p2m p2m ...
 *
 * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
 *
 * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
 * maximum representable pseudo-physical address space is:
 *  P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
 *
 * P2M_PER_PAGE depends on the architecture, as an mfn is always
 * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
 * 512 and 1024 entries respectively.
 */

#include <linux/init.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/sched.h>

#include <asm/cache.h>
#include <asm/setup.h>

#include <asm/xen/page.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>

#include "xen-ops.h"

static void __init m2p_override_init(void);

unsigned long xen_max_p2m_pfn __read_mostly;

#define P2M_PER_PAGE		(PAGE_SIZE / sizeof(unsigned long))
#define P2M_MID_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long *))
#define P2M_TOP_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long **))

#define MAX_P2M_PFN		(P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)

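/*
 * Worked sizing example (assuming the usual 4 KiB PAGE_SIZE): on
 * 64-bit each level holds 4096 / 8 = 512 entries, so MAX_P2M_PFN is
 * 512^3 = 134,217,728 frames, i.e. 512 GiB of pseudo-physical address
 * space; on 32-bit each level holds 1024 entries instead.
 */
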
/* Placeholders for holes in the address space */
static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);

static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);

RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));

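/*
 * The two RESERVE_BRK() lines above set aside one page of early (brk)
 * memory per P2M_PER_PAGE * P2M_MID_PER_PAGE frames for the mid and
 * mid-mfn levels, enough to cover MAX_DOMAIN_PAGES; the leaf p2m pages
 * for the initial allocation come from the domain builder's mfn_list
 * (see xen_build_dynamic_phys_to_machine() below).
 */
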
static inline unsigned p2m_top_index(unsigned long pfn)
{
	BUG_ON(pfn >= MAX_P2M_PFN);
	return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
}

static inline unsigned p2m_mid_index(unsigned long pfn)
{
	return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
}

static inline unsigned p2m_index(unsigned long pfn)
{
	return pfn % P2M_PER_PAGE;
}

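/*
 * Example decomposition (64-bit, 512 entries per level): pfn 74565
 * (0x12345) gives topidx = 74565 / (512 * 512) = 0,
 * mididx = (74565 / 512) % 512 = 145 and idx = 74565 % 512 = 325,
 * so its entry lives at p2m_top[0][145][325].
 */
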
static void p2m_top_init(unsigned long ***top)
{
	unsigned i;

	for (i = 0; i < P2M_TOP_PER_PAGE; i++)
		top[i] = p2m_mid_missing;
}

static void p2m_top_mfn_init(unsigned long *top)
{
	unsigned i;

	for (i = 0; i < P2M_TOP_PER_PAGE; i++)
		top[i] = virt_to_mfn(p2m_mid_missing_mfn);
}

static void p2m_top_mfn_p_init(unsigned long **top)
{
	unsigned i;

	for (i = 0; i < P2M_TOP_PER_PAGE; i++)
		top[i] = p2m_mid_missing_mfn;
}

static void p2m_mid_init(unsigned long **mid)
{
	unsigned i;

	for (i = 0; i < P2M_MID_PER_PAGE; i++)
		mid[i] = p2m_missing;
}

static void p2m_mid_mfn_init(unsigned long *mid)
{
	unsigned i;

	for (i = 0; i < P2M_MID_PER_PAGE; i++)
		mid[i] = virt_to_mfn(p2m_missing);
}

static void p2m_init(unsigned long *p2m)
{
	unsigned i;

	for (i = 0; i < P2M_PER_PAGE; i++)
		p2m[i] = INVALID_P2M_ENTRY;
}

/*
 * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
 *
 * This is called both at boot time, and after resuming from suspend:
 * - At boot time we're called very early, and must use extend_brk()
 *   to allocate memory.
 *
 * - After resume we're called from within stop_machine, but the mfn
 *   tree should already be completely allocated.
 */
void xen_build_mfn_list_list(void)
{
	unsigned long pfn;

	/* Pre-initialize p2m_top_mfn to be completely missing */
	if (p2m_top_mfn == NULL) {
		p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
		p2m_mid_mfn_init(p2m_mid_missing_mfn);

		p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
		p2m_top_mfn_p_init(p2m_top_mfn_p);

		p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
		p2m_top_mfn_init(p2m_top_mfn);
	} else {
		/* Reinitialise: mfns all change after migration */
		p2m_mid_mfn_init(p2m_mid_missing_mfn);
	}

	for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
		unsigned topidx = p2m_top_index(pfn);
		unsigned mididx = p2m_mid_index(pfn);
		unsigned long **mid;
		unsigned long *mid_mfn_p;

		mid = p2m_top[topidx];
		mid_mfn_p = p2m_top_mfn_p[topidx];

		/* Don't bother allocating any mfn mid levels if
		 * they're just missing; only update the stored mfn,
		 * since all of them could have changed over a migrate.
		 */
		if (mid == p2m_mid_missing) {
			BUG_ON(mididx);
			BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
			p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
			pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
			continue;
		}

		if (mid_mfn_p == p2m_mid_missing_mfn) {
			/*
			 * XXX boot-time only!  We should never find
			 * missing parts of the mfn tree after
			 * runtime.  extend_brk() will BUG if we call
			 * it too late.
			 */
			mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
			p2m_mid_mfn_init(mid_mfn_p);

			p2m_top_mfn_p[topidx] = mid_mfn_p;
		}

		p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
		mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
	}
}

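/*
 * Publish the top of the mfn tree to Xen via shared_info so the
 * hypervisor and toolstack can locate the guest's p2m table (used
 * during save/restore and migration).
 */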
void xen_setup_mfn_list_list(void)
{
	BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);

	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
		virt_to_mfn(p2m_top_mfn);
	HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
}

/* Set up p2m_top to point to the domain-builder provided p2m pages */
void __init xen_build_dynamic_phys_to_machine(void)
{
	unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
	unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
	unsigned long pfn;

	xen_max_p2m_pfn = max_pfn;

	p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
	p2m_init(p2m_missing);

	p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
	p2m_mid_init(p2m_mid_missing);

	p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
	p2m_top_init(p2m_top);

	/*
	 * The domain builder gives us a pre-constructed p2m array in
	 * mfn_list for all the pages initially given to us, so we just
	 * need to graft that into our tree structure.
	 */
	for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
		unsigned topidx = p2m_top_index(pfn);
		unsigned mididx = p2m_mid_index(pfn);

		if (p2m_top[topidx] == p2m_mid_missing) {
			unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
			p2m_mid_init(mid);

			p2m_top[topidx] = mid;
		}

		/*
		 * As long as the mfn_list has enough entries to completely
		 * fill a p2m page, pointing into the array is ok.  But if
		 * not, the entries beyond the last pfn will be undefined.
		 */
		if (unlikely(pfn + P2M_PER_PAGE > max_pfn)) {
			unsigned long p2midx;

			p2midx = max_pfn % P2M_PER_PAGE;
			for ( ; p2midx < P2M_PER_PAGE; p2midx++)
				mfn_list[pfn + p2midx] = INVALID_P2M_ENTRY;
		}
		p2m_top[topidx][mididx] = &mfn_list[pfn];
	}

	m2p_override_init();
}

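/*
 * Look up the machine frame for @pfn.  Unpopulated parts of the tree
 * all point at the shared "missing" pages, so a hole simply reads back
 * as INVALID_P2M_ENTRY; entries installed by m2p_add_override() below
 * additionally carry FOREIGN_FRAME_BIT.
 */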
unsigned long get_phys_to_machine(unsigned long pfn)
{
	unsigned topidx, mididx, idx;

	if (unlikely(pfn >= MAX_P2M_PFN))
		return INVALID_P2M_ENTRY;

	topidx = p2m_top_index(pfn);
	mididx = p2m_mid_index(pfn);
	idx = p2m_index(pfn);

	return p2m_top[topidx][mididx][idx];
}
EXPORT_SYMBOL_GPL(get_phys_to_machine);

static void *alloc_p2m_page(void)
{
	return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
}

static void free_p2m_page(void *p)
{
	free_page((unsigned long)p);
}

/*
 * Fully allocate the p2m structure for a given pfn.  We need to check
 * that both the top and mid levels are allocated, and make sure the
 * parallel mfn tree is kept in sync.  We may race with other cpus, so
 * the new pages are installed with cmpxchg; if we lose the race then
 * simply free the page we allocated and use the one that's there.
 */
static bool alloc_p2m(unsigned long pfn)
{
	unsigned topidx, mididx;
	unsigned long ***top_p, **mid;
	unsigned long *top_mfn_p, *mid_mfn;

	topidx = p2m_top_index(pfn);
	mididx = p2m_mid_index(pfn);

	top_p = &p2m_top[topidx];
	mid = *top_p;

	if (mid == p2m_mid_missing) {
		/* Mid level is missing, allocate a new one */
		mid = alloc_p2m_page();
		if (!mid)
			return false;

		p2m_mid_init(mid);

		if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
			free_p2m_page(mid);
	}

	top_mfn_p = &p2m_top_mfn[topidx];
	mid_mfn = p2m_top_mfn_p[topidx];

	BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);

	if (mid_mfn == p2m_mid_missing_mfn) {
		/* Separately check the mid mfn level */
		unsigned long missing_mfn;
		unsigned long mid_mfn_mfn;

		mid_mfn = alloc_p2m_page();
		if (!mid_mfn)
			return false;

		p2m_mid_mfn_init(mid_mfn);

		missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
		mid_mfn_mfn = virt_to_mfn(mid_mfn);
		if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
			free_p2m_page(mid_mfn);
		else
			p2m_top_mfn_p[topidx] = mid_mfn;
	}

	if (p2m_top[topidx][mididx] == p2m_missing) {
		/* p2m leaf page is missing */
		unsigned long *p2m;

		p2m = alloc_p2m_page();
		if (!p2m)
			return false;

		p2m_init(p2m);

		if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
			free_p2m_page(p2m);
		else
			mid_mfn[mididx] = virt_to_mfn(p2m);
	}

	return true;
}

/* Try to install p2m mapping; fail if intermediate bits missing */
bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
	unsigned topidx, mididx, idx;

	if (unlikely(pfn >= MAX_P2M_PFN)) {
		BUG_ON(mfn != INVALID_P2M_ENTRY);
		return true;
	}

	topidx = p2m_top_index(pfn);
	mididx = p2m_mid_index(pfn);
	idx = p2m_index(pfn);

	if (p2m_top[topidx][mididx] == p2m_missing)
		return mfn == INVALID_P2M_ENTRY;

	p2m_top[topidx][mididx][idx] = mfn;

	return true;
}

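/*
 * Set the p2m entry for @pfn, allocating missing intermediate levels
 * on demand.  Under an auto-translated physmap the hypervisor keeps
 * the translation itself, so only the identity assumption is checked.
 * A (hypothetical) caller that cannot tolerate failure might do:
 *
 *	if (!set_phys_to_machine(pfn, mfn))
 *		BUG();
 */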
bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
	if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
		BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
		return true;
	}

	if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
		if (!alloc_p2m(pfn))
			return false;

		if (!__set_phys_to_machine(pfn, mfn))
			return false;
	}

	return true;
}

#define M2P_OVERRIDE_HASH_SHIFT	10
#define M2P_OVERRIDE_HASH	(1 << M2P_OVERRIDE_HASH_SHIFT)

static RESERVE_BRK_ARRAY(struct list_head, m2p_overrides, M2P_OVERRIDE_HASH);
static DEFINE_SPINLOCK(m2p_override_lock);

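/*
 * m2p override: pages whose machine frame has been replaced (typically
 * grant-mapped foreign pages) are kept in a small hash table keyed by
 * mfn, so m2p_find_override() can translate such an mfn back to the
 * local struct page.  The original p2m entry is stashed in page->index
 * and the overriding mfn in page->private.
 */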
static void __init m2p_override_init(void)
{
	unsigned i;

	m2p_overrides = extend_brk(sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH,
				   sizeof(unsigned long));

	for (i = 0; i < M2P_OVERRIDE_HASH; i++)
		INIT_LIST_HEAD(&m2p_overrides[i]);
}

static unsigned long mfn_hash(unsigned long mfn)
{
	return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT);
}

/* Add an MFN override for a particular page */
int m2p_add_override(unsigned long mfn, struct page *page)
{
	unsigned long flags;
	unsigned long pfn;
	unsigned long address;
	unsigned level;
	pte_t *ptep = NULL;

	pfn = page_to_pfn(page);
	if (!PageHighMem(page)) {
		address = (unsigned long)__va(pfn << PAGE_SHIFT);
		ptep = lookup_address(address, &level);

		if (WARN(ptep == NULL || level != PG_LEVEL_4K,
			 "m2p_add_override: pfn %lx not mapped", pfn))
			return -EINVAL;
	}

	page->private = mfn;
	page->index = pfn_to_mfn(pfn);

	__set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
	if (!PageHighMem(page))
		/* Just zap old mapping for now */
		pte_clear(&init_mm, address, ptep);

	spin_lock_irqsave(&m2p_override_lock, flags);
	list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]);
	spin_unlock_irqrestore(&m2p_override_lock, flags);

	return 0;
}

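/*
 * Undo m2p_add_override(): remove the page from the override hash,
 * restore the original p2m entry that was saved in page->index, and
 * re-establish the kernel linear mapping for lowmem pages.
 */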
int m2p_remove_override(struct page *page)
{
	unsigned long flags;
	unsigned long mfn;
	unsigned long pfn;
	unsigned long address;
	unsigned level;
	pte_t *ptep = NULL;

	pfn = page_to_pfn(page);
	mfn = get_phys_to_machine(pfn);
	if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT))
		return -EINVAL;

	if (!PageHighMem(page)) {
		address = (unsigned long)__va(pfn << PAGE_SHIFT);
		ptep = lookup_address(address, &level);

		if (WARN(ptep == NULL || level != PG_LEVEL_4K,
			 "m2p_remove_override: pfn %lx not mapped", pfn))
			return -EINVAL;
	}

	spin_lock_irqsave(&m2p_override_lock, flags);
	list_del(&page->lru);
	spin_unlock_irqrestore(&m2p_override_lock, flags);
	__set_phys_to_machine(pfn, page->index);

	if (!PageHighMem(page))
		set_pte_at(&init_mm, address, ptep,
			   pfn_pte(pfn, PAGE_KERNEL));
		/* No tlb flush necessary because the caller already
		 * left the pte unmapped. */

	return 0;
}

struct page *m2p_find_override(unsigned long mfn)
{
	unsigned long flags;
	struct list_head *bucket = &m2p_overrides[mfn_hash(mfn)];
	struct page *p, *ret;

	ret = NULL;

	spin_lock_irqsave(&m2p_override_lock, flags);

	list_for_each_entry(p, bucket, lru) {
		if (p->private == mfn) {
			ret = p;
			break;
		}
	}

	spin_unlock_irqrestore(&m2p_override_lock, flags);

	return ret;
}

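/*
 * Translate @mfn to a local pfn, taking any override into account;
 * if no override is registered the caller's @pfn is returned unchanged.
 */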
unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn)
{
	struct page *p = m2p_find_override(mfn);
	unsigned long ret = pfn;

	if (p)
		ret = page_to_pfn(p);

	return ret;
}
EXPORT_SYMBOL_GPL(m2p_find_override_pfn);