/*
 * Xen leaves the responsibility for maintaining p2m mappings to the
 * guests themselves, but it must also access and update the p2m array
 * during suspend/resume when all the pages are reallocated.
 *
 * The p2m table is logically a flat array, but we implement it as a
 * three-level tree to allow the address space to be sparse.
 *
 *                               Xen
 *                                |
 *      p2m_top              p2m_top_mfn
 *        /  \                   /   \
 * p2m_mid p2m_mid    p2m_mid_mfn p2m_mid_mfn
 *    / \    / \           /          /
 *  p2m p2m p2m p2m p2m p2m p2m ...
 *
 * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
 *
 * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
 * maximum representable pseudo-physical address space is:
 *  P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
 *
 * P2M_PER_PAGE depends on the architecture, as a mfn is always
 * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
 * 512 and 1024 entries respectively.
 */
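/*
 * Concretely, with 4K pages on 64-bit every level holds 512 entries,
 * so the tree can span 512 * 512 * 512 = 2^27 pages (512GB of
 * pseudo-physical address space), while fully-missing regions cost
 * only the shared placeholder pages defined below.
 */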
#include <linux/init.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/sched.h>

#include <asm/cache.h>
#include <asm/setup.h>

#include <asm/xen/page.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
static void __init m2p_override_init(void);

unsigned long xen_max_p2m_pfn __read_mostly;
#define P2M_PER_PAGE		(PAGE_SIZE / sizeof(unsigned long))
#define P2M_MID_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long *))
#define P2M_TOP_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long **))

#define MAX_P2M_PFN		(P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
/* Placeholders for holes in the address space */
static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);

static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);

RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
static inline unsigned p2m_top_index(unsigned long pfn)
{
	BUG_ON(pfn >= MAX_P2M_PFN);
	return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
}

static inline unsigned p2m_mid_index(unsigned long pfn)
{
	return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
}

static inline unsigned p2m_index(unsigned long pfn)
{
	return pfn % P2M_PER_PAGE;
}
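/*
 * A pfn decomposes as
 *   pfn == (topidx * P2M_MID_PER_PAGE + mididx) * P2M_PER_PAGE + idx
 * so, for example, with 512 entries per level (64-bit) pfn 74565
 * splits into topidx 0, mididx 145, idx 325.
 */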
static void p2m_top_init(unsigned long ***top)
{
	unsigned i;

	for (i = 0; i < P2M_TOP_PER_PAGE; i++)
		top[i] = p2m_mid_missing;
}

static void p2m_top_mfn_init(unsigned long *top)
{
	unsigned i;

	for (i = 0; i < P2M_TOP_PER_PAGE; i++)
		top[i] = virt_to_mfn(p2m_mid_missing_mfn);
}

static void p2m_top_mfn_p_init(unsigned long **top)
{
	unsigned i;

	for (i = 0; i < P2M_TOP_PER_PAGE; i++)
		top[i] = p2m_mid_missing_mfn;
}

static void p2m_mid_init(unsigned long **mid)
{
	unsigned i;

	for (i = 0; i < P2M_MID_PER_PAGE; i++)
		mid[i] = p2m_missing;
}

static void p2m_mid_mfn_init(unsigned long *mid)
{
	unsigned i;

	for (i = 0; i < P2M_MID_PER_PAGE; i++)
		mid[i] = virt_to_mfn(p2m_missing);
}
static void p2m_init(unsigned long *p2m)
{
	unsigned i;

	for (i = 0; i < P2M_PER_PAGE; i++)
		p2m[i] = INVALID_P2M_ENTRY;
}
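/*
 * Note that the init helpers above wire every entry to the shared
 * "missing" placeholders instead of leaving it NULL, so a lookup in an
 * unpopulated region walks real pages and simply reads
 * INVALID_P2M_ENTRY; the fast path never needs a NULL check.
 */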
/*
 * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
 *
 * This is called both at boot time, and after resuming from suspend:
 * - At boot time we're called very early, and must use extend_brk()
 *   to allocate memory.
 *
 * - After resume we're called from within stop_machine, but the mfn
 *   tree should already be completely allocated.
 */
void xen_build_mfn_list_list(void)
{
	unsigned long pfn;

	/* Pre-initialize p2m_top_mfn to be completely missing */
	if (p2m_top_mfn == NULL) {
		p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
		p2m_mid_mfn_init(p2m_mid_missing_mfn);

		p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
		p2m_top_mfn_p_init(p2m_top_mfn_p);

		p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
		p2m_top_mfn_init(p2m_top_mfn);
	} else {
		/* Reinitialise, mfns all change after migration */
		p2m_mid_mfn_init(p2m_mid_missing_mfn);
	}

	for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
		unsigned topidx = p2m_top_index(pfn);
		unsigned mididx = p2m_mid_index(pfn);
		unsigned long **mid;
		unsigned long *mid_mfn_p;

		mid = p2m_top[topidx];
		mid_mfn_p = p2m_top_mfn_p[topidx];

		/* Don't bother allocating any mfn mid levels if
		 * they're just missing; just update the stored mfn,
		 * since all could have changed over a migrate.
		 */
		if (mid == p2m_mid_missing) {
			BUG_ON(mididx);
			BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
			p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
			pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
			continue;
		}

		if (mid_mfn_p == p2m_mid_missing_mfn) {
			/*
			 * XXX boot-time only!  We should never find
			 * missing parts of the mfn tree after
			 * runtime.  extend_brk() will BUG if we call
			 * it too late.
			 */
			mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
			p2m_mid_mfn_init(mid_mfn_p);

			p2m_top_mfn_p[topidx] = mid_mfn_p;
		}

		p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
		mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
	}
}
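/*
 * The mfn-based mirror built above exists because Xen itself consumes
 * this tree (via pfn_to_mfn_frame_list_list, set up below) and cannot
 * follow guest-virtual pointers: every inter-level link has to be a
 * machine frame number.
 */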
void xen_setup_mfn_list_list(void)
{
	BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);

	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
		virt_to_mfn(p2m_top_mfn);
	HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
}
/* Set up p2m_top to point to the domain-builder provided p2m pages */
void __init xen_build_dynamic_phys_to_machine(void)
{
	unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
	unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
	unsigned long pfn;

	xen_max_p2m_pfn = max_pfn;

	p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
	p2m_init(p2m_missing);

	p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
	p2m_mid_init(p2m_mid_missing);

	p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
	p2m_top_init(p2m_top);

	/*
	 * The domain builder gives us a pre-constructed p2m array in
	 * mfn_list for all the pages initially given to us, so we just
	 * need to graft that into our tree structure.
	 */
	for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
		unsigned topidx = p2m_top_index(pfn);
		unsigned mididx = p2m_mid_index(pfn);

		if (p2m_top[topidx] == p2m_mid_missing) {
			unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
			p2m_mid_init(mid);

			p2m_top[topidx] = mid;
		}

		p2m_top[topidx][mididx] = &mfn_list[pfn];
	}

	m2p_override_init();
}
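/*
 * The leaves installed above point directly into the domain builder's
 * mfn_list rather than copying it, so the initial p2m contents are
 * shared with the array Xen handed us at start of day.
 */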
unsigned long get_phys_to_machine(unsigned long pfn)
{
	unsigned topidx, mididx, idx;

	if (unlikely(pfn >= MAX_P2M_PFN))
		return INVALID_P2M_ENTRY;

	topidx = p2m_top_index(pfn);
	mididx = p2m_mid_index(pfn);
	idx = p2m_index(pfn);

	return p2m_top[topidx][mididx][idx];
}
EXPORT_SYMBOL_GPL(get_phys_to_machine);
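/*
 * Sketch of a typical caller (illustrative only; example_phys_to_machine
 * is a made-up name, and the real helpers, pfn_to_mfn() and
 * phys_to_machine() in asm/xen/page.h, additionally handle
 * auto-translated guests and the FOREIGN_FRAME_BIT):
 */
#if 0
static unsigned long example_phys_to_machine(unsigned long phys)
{
	unsigned long mfn = get_phys_to_machine(phys >> PAGE_SHIFT);

	BUG_ON(mfn == INVALID_P2M_ENTRY);

	/* Recombine the machine frame with the offset within the page. */
	return (mfn << PAGE_SHIFT) | (phys & ~PAGE_MASK);
}
#endif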
static void *alloc_p2m_page(void)
{
	return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
}

static void free_p2m_page(void *p)
{
	free_page((unsigned long)p);
}
/*
 * Fully allocate the p2m structure for a given pfn.  We need to check
 * that both the top and mid levels are allocated, and make sure the
 * parallel mfn tree is kept in sync.  We may race with other cpus, so
 * the new pages are installed with cmpxchg; if we lose the race then
 * simply free the page we allocated and use the one that's there.
 */
static bool alloc_p2m(unsigned long pfn)
{
	unsigned topidx, mididx;
	unsigned long ***top_p, **mid;
	unsigned long *top_mfn_p, *mid_mfn;

	topidx = p2m_top_index(pfn);
	mididx = p2m_mid_index(pfn);

	top_p = &p2m_top[topidx];
	mid = *top_p;

	if (mid == p2m_mid_missing) {
		/* Mid level is missing, allocate a new one */
		mid = alloc_p2m_page();
		if (!mid)
			return false;

		p2m_mid_init(mid);

		if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
			free_p2m_page(mid);
	}

	top_mfn_p = &p2m_top_mfn[topidx];
	mid_mfn = p2m_top_mfn_p[topidx];

	BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);

	if (mid_mfn == p2m_mid_missing_mfn) {
		/* Separately check the mid mfn level */
		unsigned long missing_mfn;
		unsigned long mid_mfn_mfn;

		mid_mfn = alloc_p2m_page();
		if (!mid_mfn)
			return false;

		p2m_mid_mfn_init(mid_mfn);

		missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
		mid_mfn_mfn = virt_to_mfn(mid_mfn);
		if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
			free_p2m_page(mid_mfn);
		else
			p2m_top_mfn_p[topidx] = mid_mfn;
	}

	if (p2m_top[topidx][mididx] == p2m_missing) {
		/* p2m leaf page is missing */
		unsigned long *p2m;

		p2m = alloc_p2m_page();
		if (!p2m)
			return false;

		p2m_init(p2m);

		if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
			free_p2m_page(p2m);
		else
			mid_mfn[mididx] = virt_to_mfn(p2m);
	}

	return true;
}
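/*
 * alloc_p2m() is only reached from the set_phys_to_machine() slow path
 * below, when a store would otherwise land in one of the shared
 * "missing" placeholder pages.
 */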
/* Try to install p2m mapping; fail if intermediate bits missing */
bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
	unsigned topidx, mididx, idx;

	if (unlikely(pfn >= MAX_P2M_PFN)) {
		BUG_ON(mfn != INVALID_P2M_ENTRY);
		return true;
	}

	topidx = p2m_top_index(pfn);
	mididx = p2m_mid_index(pfn);
	idx = p2m_index(pfn);

	if (p2m_top[topidx][mididx] == p2m_missing)
		return mfn == INVALID_P2M_ENTRY;

	p2m_top[topidx][mididx][idx] = mfn;

	return true;
}
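/*
 * Install a p2m mapping, allocating any missing intermediate levels on
 * demand; returns false if a needed page could not be allocated.
 */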
bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
	if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
		BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
		return true;
	}

	if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
		if (!alloc_p2m(pfn))
			return false;

		if (!__set_phys_to_machine(pfn, mfn))
			return false;
	}

	return true;
}
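/*
 * M2P override: when a foreign machine frame is mapped over one of our
 * pfns (as the grant-table code does), Xen's machine-to-physical table
 * still describes that frame from the owning domain's point of view,
 * so we keep a local hash of overrides to translate such mfns back to
 * the struct page that currently holds them.
 */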
#define M2P_OVERRIDE_HASH_SHIFT	10
#define M2P_OVERRIDE_HASH	(1 << M2P_OVERRIDE_HASH_SHIFT)

static RESERVE_BRK_ARRAY(struct list_head, m2p_overrides, M2P_OVERRIDE_HASH);
static DEFINE_SPINLOCK(m2p_override_lock);
static void __init m2p_override_init(void)
{
	unsigned i;

	m2p_overrides = extend_brk(sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH,
				   sizeof(unsigned long));

	for (i = 0; i < M2P_OVERRIDE_HASH; i++)
		INIT_LIST_HEAD(&m2p_overrides[i]);
}
static unsigned long mfn_hash(unsigned long mfn)
{
	return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT);
}
/* Add an MFN override for a particular page */
int m2p_add_override(unsigned long mfn, struct page *page)
{
	unsigned long flags;
	unsigned long pfn;
	unsigned long address;
	unsigned level;
	pte_t *ptep = NULL;

	pfn = page_to_pfn(page);
	if (!PageHighMem(page)) {
		address = (unsigned long)__va(pfn << PAGE_SHIFT);
		ptep = lookup_address(address, &level);

		if (WARN(ptep == NULL || level != PG_LEVEL_4K,
			 "m2p_add_override: pfn %lx not mapped", pfn))
			return -EINVAL;
	}

	page->private = mfn;
	page->index = pfn_to_mfn(pfn);

	__set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
	if (!PageHighMem(page))
		/* Just zap old mapping for now */
		pte_clear(&init_mm, address, ptep);

	spin_lock_irqsave(&m2p_override_lock, flags);
	list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]);
	spin_unlock_irqrestore(&m2p_override_lock, flags);

	return 0;
}
int m2p_remove_override(struct page *page)
{
	unsigned long flags;
	unsigned long pfn;
	unsigned long mfn;
	unsigned long address;
	unsigned level;
	pte_t *ptep = NULL;

	pfn = page_to_pfn(page);
	mfn = get_phys_to_machine(pfn);
	if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT))
		return -EINVAL;

	if (!PageHighMem(page)) {
		address = (unsigned long)__va(pfn << PAGE_SHIFT);
		ptep = lookup_address(address, &level);

		if (WARN(ptep == NULL || level != PG_LEVEL_4K,
			 "m2p_remove_override: pfn %lx not mapped", pfn))
			return -EINVAL;
	}

	spin_lock_irqsave(&m2p_override_lock, flags);
	list_del(&page->lru);
	spin_unlock_irqrestore(&m2p_override_lock, flags);
	__set_phys_to_machine(pfn, page->index);

	if (!PageHighMem(page))
		set_pte_at(&init_mm, address, ptep,
			   pfn_pte(pfn, PAGE_KERNEL));
		/* No tlb flush necessary because the caller already
		 * left the pte unmapped. */

	return 0;
}
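/*
 * Illustrative call sequence (hypothetical driver code, error handling
 * elided): after mapping a foreign frame into 'page', the owner
 * registers the override so later translations of 'mfn' resolve to the
 * local page, and removes it again before tearing the mapping down.
 */
#if 0
static int example_track_foreign(struct page *page, unsigned long mfn)
{
	int err = m2p_add_override(mfn, page);

	if (err)
		return err;

	/* ... use the grant-mapped page here ... */

	return m2p_remove_override(page);
}
#endif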
struct page *m2p_find_override(unsigned long mfn)
{
	unsigned long flags;
	struct list_head *bucket = &m2p_overrides[mfn_hash(mfn)];
	struct page *p, *ret;

	ret = NULL;

	spin_lock_irqsave(&m2p_override_lock, flags);

	list_for_each_entry(p, bucket, lru) {
		if (p->private == mfn) {
			ret = p;
			break;
		}
	}

	spin_unlock_irqrestore(&m2p_override_lock, flags);

	return ret;
}
unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn)
{
	struct page *p = m2p_find_override(mfn);
	unsigned long ret = pfn;

	if (p)
		ret = page_to_pfn(p);

	return ret;
}
EXPORT_SYMBOL_GPL(m2p_find_override_pfn);
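/*
 * m2p_find_override_pfn() is the read side of the override hash: the
 * mfn_to_pfn() helper in asm/xen/page.h passes its first-guess pfn in
 * here and keeps it unchanged unless an override is registered.
 */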