]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | /*- |
2 | * BSD LICENSE | |
3 | * | |
4 | * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. | |
5 | * All rights reserved. | |
6 | * | |
7 | * Redistribution and use in source and binary forms, with or without | |
8 | * modification, are permitted provided that the following conditions | |
9 | * are met: | |
10 | * | |
11 | * * Redistributions of source code must retain the above copyright | |
12 | * notice, this list of conditions and the following disclaimer. | |
13 | * * Redistributions in binary form must reproduce the above copyright | |
14 | * notice, this list of conditions and the following disclaimer in | |
15 | * the documentation and/or other materials provided with the | |
16 | * distribution. | |
17 | * * Neither the name of Intel Corporation nor the names of its | |
18 | * contributors may be used to endorse or promote products derived | |
19 | * from this software without specific prior written permission. | |
20 | * | |
21 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
22 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
23 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
24 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
25 | * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
26 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
27 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
28 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
29 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
30 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
31 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
32 | */ | |
33 | ||
34 | #include <errno.h> | |
35 | #include <stdarg.h> | |
36 | #include <stdlib.h> | |
37 | #include <stdio.h> | |
38 | #include <stdint.h> | |
39 | #include <inttypes.h> | |
40 | #include <string.h> | |
41 | #include <stdarg.h> | |
42 | #include <sys/mman.h> | |
43 | #include <sys/types.h> | |
44 | #include <sys/stat.h> | |
45 | #include <sys/queue.h> | |
46 | #include <sys/file.h> | |
47 | #include <unistd.h> | |
48 | #include <limits.h> | |
49 | #include <errno.h> | |
50 | #include <sys/ioctl.h> | |
51 | #include <sys/time.h> | |
52 | ||
53 | #include <rte_log.h> | |
54 | #include <rte_memory.h> | |
55 | #include <rte_memzone.h> | |
56 | #include <rte_launch.h> | |
57 | #include <rte_eal.h> | |
58 | #include <rte_eal_memconfig.h> | |
59 | #include <rte_per_lcore.h> | |
60 | #include <rte_lcore.h> | |
61 | #include <rte_common.h> | |
62 | #include <rte_string_fns.h> | |
63 | ||
64 | #include "eal_private.h" | |
65 | #include "eal_internal_cfg.h" | |
66 | #include "eal_filesystem.h" | |
67 | #include <exec-env/rte_dom0_common.h> | |
68 | ||
69 | #define PAGE_SIZE RTE_PGSIZE_4K | |
70 | #define DEFAUL_DOM0_NAME "dom0-mem" | |
71 | ||
72 | static int xen_fd = -1; | |
73 | static const char sys_dir_path[] = "/sys/kernel/mm/dom0-mm/memsize-mB"; | |
74 | ||
75 | /* | |
76 | * Try to mmap *size bytes in /dev/zero. If it is successful, return the | |
77 | * pointer to the mmap'd area and keep *size unmodified. Else, retry | |
78 | * with a smaller zone: decrease *size by mem_size until it reaches | |
79 | * 0. In this case, return NULL. Note: this function returns an address | |
80 | * which is a multiple of mem_size size. | |
81 | */ | |
82 | static void * | |
83 | xen_get_virtual_area(size_t *size, size_t mem_size) | |
84 | { | |
85 | void *addr; | |
86 | int fd; | |
87 | long aligned_addr; | |
88 | ||
89 | RTE_LOG(DEBUG, EAL, "Ask a virtual area of 0x%zu bytes\n", *size); | |
90 | ||
91 | fd = open("/dev/zero", O_RDONLY); | |
92 | if (fd < 0){ | |
93 | RTE_LOG(ERR, EAL, "Cannot open /dev/zero\n"); | |
94 | return NULL; | |
95 | } | |
96 | do { | |
97 | addr = mmap(NULL, (*size) + mem_size, PROT_READ, | |
98 | MAP_PRIVATE, fd, 0); | |
99 | if (addr == MAP_FAILED) | |
100 | *size -= mem_size; | |
101 | } while (addr == MAP_FAILED && *size > 0); | |
102 | ||
103 | if (addr == MAP_FAILED) { | |
104 | close(fd); | |
105 | RTE_LOG(ERR, EAL, "Cannot get a virtual area\n"); | |
106 | return NULL; | |
107 | } | |
108 | ||
109 | munmap(addr, (*size) + mem_size); | |
110 | close(fd); | |
111 | ||
112 | /* align addr to a mem_size boundary */ | |
113 | aligned_addr = (uintptr_t)addr; | |
114 | aligned_addr = RTE_ALIGN_CEIL(aligned_addr, mem_size); | |
115 | addr = (void *)(aligned_addr); | |
116 | ||
117 | RTE_LOG(DEBUG, EAL, "Virtual area found at %p (size = 0x%zx)\n", | |
118 | addr, *size); | |
119 | ||
120 | return addr; | |
121 | } | |
122 | ||
123 | /** | |
124 | * Get memory size configuration from /sys/devices/virtual/misc/dom0_mm | |
125 | * /memsize-mB/memsize file, and the size unit is mB. | |
126 | */ | |
127 | static int | |
128 | get_xen_memory_size(void) | |
129 | { | |
130 | char path[PATH_MAX]; | |
131 | unsigned long mem_size = 0; | |
132 | static const char *file_name; | |
133 | ||
134 | file_name = "memsize"; | |
135 | snprintf(path, sizeof(path), "%s/%s", | |
136 | sys_dir_path, file_name); | |
137 | ||
138 | if (eal_parse_sysfs_value(path, &mem_size) < 0) | |
139 | return -1; | |
140 | ||
141 | if (mem_size == 0) | |
142 | rte_exit(EXIT_FAILURE,"XEN-DOM0:the %s/%s was not" | |
143 | " configured.\n",sys_dir_path, file_name); | |
144 | if (mem_size % 2) | |
145 | rte_exit(EXIT_FAILURE,"XEN-DOM0:the %s/%s must be" | |
146 | " even number.\n",sys_dir_path, file_name); | |
147 | ||
148 | if (mem_size > DOM0_CONFIG_MEMSIZE) | |
149 | rte_exit(EXIT_FAILURE,"XEN-DOM0:the %s/%s should not be larger" | |
150 | " than %d mB\n",sys_dir_path, file_name, DOM0_CONFIG_MEMSIZE); | |
151 | ||
152 | return mem_size; | |
153 | } | |
154 | ||
155 | /** | |
156 | * Based on physical address to caculate MFN in Xen Dom0. | |
157 | */ | |
158 | phys_addr_t | |
159 | rte_xen_mem_phy2mch(int32_t memseg_id, const phys_addr_t phy_addr) | |
160 | { | |
161 | int mfn_id, i; | |
162 | uint64_t mfn, mfn_offset; | |
163 | struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; | |
164 | struct rte_memseg *memseg = mcfg->memseg; | |
165 | ||
166 | /* find the memory segment owning the physical address */ | |
167 | if (memseg_id == -1) { | |
168 | for (i = 0; i < RTE_MAX_MEMSEG; i++) { | |
169 | if ((phy_addr >= memseg[i].phys_addr) && | |
170 | (phy_addr < memseg[i].phys_addr + | |
171 | memseg[i].len)) { | |
172 | memseg_id = i; | |
173 | break; | |
174 | } | |
175 | } | |
176 | if (memseg_id == -1) | |
177 | return RTE_BAD_PHYS_ADDR; | |
178 | } | |
179 | ||
180 | mfn_id = (phy_addr - memseg[memseg_id].phys_addr) / RTE_PGSIZE_2M; | |
181 | ||
182 | /*the MFN is contiguous in 2M */ | |
183 | mfn_offset = (phy_addr - memseg[memseg_id].phys_addr) % | |
184 | RTE_PGSIZE_2M / PAGE_SIZE; | |
185 | mfn = mfn_offset + memseg[memseg_id].mfn[mfn_id]; | |
186 | ||
187 | /** return mechine address */ | |
188 | return mfn * PAGE_SIZE + phy_addr % PAGE_SIZE; | |
189 | } | |
190 | ||
191 | int | |
192 | rte_xen_dom0_memory_init(void) | |
193 | { | |
194 | void *vir_addr, *vma_addr = NULL; | |
195 | int err, ret = 0; | |
196 | uint32_t i, requested, mem_size, memseg_idx, num_memseg = 0; | |
197 | size_t vma_len = 0; | |
198 | struct memory_info meminfo; | |
199 | struct memseg_info seginfo[RTE_MAX_MEMSEG]; | |
200 | int flags, page_size = getpagesize(); | |
201 | struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; | |
202 | struct rte_memseg *memseg = mcfg->memseg; | |
203 | uint64_t total_mem = internal_config.memory; | |
204 | ||
205 | memset(seginfo, 0, sizeof(seginfo)); | |
206 | memset(&meminfo, 0, sizeof(struct memory_info)); | |
207 | ||
208 | mem_size = get_xen_memory_size(); | |
209 | requested = (unsigned) (total_mem / 0x100000); | |
210 | if (requested > mem_size) | |
211 | /* if we didn't satisfy total memory requirements */ | |
212 | rte_exit(EXIT_FAILURE,"Not enough memory available! Requested: %uMB," | |
213 | " available: %uMB\n", requested, mem_size); | |
214 | else if (total_mem != 0) | |
215 | mem_size = requested; | |
216 | ||
217 | /* Check FD and open once */ | |
218 | if (xen_fd < 0) { | |
219 | xen_fd = open(DOM0_MM_DEV, O_RDWR); | |
220 | if (xen_fd < 0) { | |
221 | RTE_LOG(ERR, EAL, "Can not open %s\n",DOM0_MM_DEV); | |
222 | return -1; | |
223 | } | |
224 | } | |
225 | ||
226 | meminfo.size = mem_size; | |
227 | ||
228 | /* construct memory mangement name for Dom0 */ | |
229 | snprintf(meminfo.name, DOM0_NAME_MAX, "%s-%s", | |
230 | internal_config.hugefile_prefix, DEFAUL_DOM0_NAME); | |
231 | ||
232 | /* Notify kernel driver to allocate memory */ | |
233 | ret = ioctl(xen_fd, RTE_DOM0_IOCTL_PREPARE_MEMSEG, &meminfo); | |
234 | if (ret < 0) { | |
235 | RTE_LOG(ERR, EAL, "XEN DOM0:failed to get memory\n"); | |
236 | err = -EIO; | |
237 | goto fail; | |
238 | } | |
239 | ||
240 | /* Get number of memory segment from driver */ | |
241 | ret = ioctl(xen_fd, RTE_DOM0_IOCTL_GET_NUM_MEMSEG, &num_memseg); | |
242 | if (ret < 0) { | |
243 | RTE_LOG(ERR, EAL, "XEN DOM0:failed to get memseg count.\n"); | |
244 | err = -EIO; | |
245 | goto fail; | |
246 | } | |
247 | ||
248 | if(num_memseg > RTE_MAX_MEMSEG){ | |
249 | RTE_LOG(ERR, EAL, "XEN DOM0: the memseg count %d is greater" | |
250 | " than max memseg %d.\n",num_memseg, RTE_MAX_MEMSEG); | |
251 | err = -EIO; | |
252 | goto fail; | |
253 | } | |
254 | ||
255 | /* get all memory segements information */ | |
256 | ret = ioctl(xen_fd, RTE_DOM0_IOCTL_GET_MEMSEG_INFO, seginfo); | |
257 | if (ret < 0) { | |
258 | RTE_LOG(ERR, EAL, "XEN DOM0:failed to get memseg info.\n"); | |
259 | err = -EIO; | |
260 | goto fail; | |
261 | } | |
262 | ||
263 | /* map all memory segments to contiguous user space */ | |
264 | for (memseg_idx = 0; memseg_idx < num_memseg; memseg_idx++) | |
265 | { | |
266 | vma_len = seginfo[memseg_idx].size; | |
267 | ||
268 | /** | |
269 | * get the biggest virtual memory area up to vma_len. If it fails, | |
270 | * vma_addr is NULL, so let the kernel provide the address. | |
271 | */ | |
272 | vma_addr = xen_get_virtual_area(&vma_len, RTE_PGSIZE_2M); | |
273 | if (vma_addr == NULL) { | |
274 | flags = MAP_SHARED; | |
275 | vma_len = RTE_PGSIZE_2M; | |
276 | } else | |
277 | flags = MAP_SHARED | MAP_FIXED; | |
278 | ||
279 | seginfo[memseg_idx].size = vma_len; | |
280 | vir_addr = mmap(vma_addr, seginfo[memseg_idx].size, | |
281 | PROT_READ|PROT_WRITE, flags, xen_fd, | |
282 | memseg_idx * page_size); | |
283 | if (vir_addr == MAP_FAILED) { | |
284 | RTE_LOG(ERR, EAL, "XEN DOM0:Could not mmap %s\n", | |
285 | DOM0_MM_DEV); | |
286 | err = -EIO; | |
287 | goto fail; | |
288 | } | |
289 | ||
290 | memseg[memseg_idx].addr = vir_addr; | |
291 | memseg[memseg_idx].phys_addr = page_size * | |
292 | seginfo[memseg_idx].pfn ; | |
293 | memseg[memseg_idx].len = seginfo[memseg_idx].size; | |
294 | for ( i = 0; i < seginfo[memseg_idx].size / RTE_PGSIZE_2M; i++) | |
295 | memseg[memseg_idx].mfn[i] = seginfo[memseg_idx].mfn[i]; | |
296 | ||
297 | /* MFNs are continuous in 2M, so assume that page size is 2M */ | |
298 | memseg[memseg_idx].hugepage_sz = RTE_PGSIZE_2M; | |
299 | ||
300 | memseg[memseg_idx].nchannel = mcfg->nchannel; | |
301 | memseg[memseg_idx].nrank = mcfg->nrank; | |
302 | ||
303 | /* NUMA is not suppoted in Xen Dom0, so only set socket 0*/ | |
304 | memseg[memseg_idx].socket_id = 0; | |
305 | } | |
306 | ||
307 | return 0; | |
308 | fail: | |
309 | if (xen_fd > 0) { | |
310 | close(xen_fd); | |
311 | xen_fd = -1; | |
312 | } | |
313 | return err; | |
314 | } | |
315 | ||
/*
 * This creates the memory mappings in the secondary process to match that of
 * the server process. It goes through each memory segment in the DPDK runtime
 * configuration, mapping them in order to form a contiguous block in the
 * virtual memory space
 */
int
rte_xen_dom0_memory_attach(void)
{
	const struct rte_mem_config *mcfg;
	unsigned s = 0; /* s used to track the segment number */
	/* NOTE(review): this local shadows the file-scope xen_fd, so the
	 * "open once" check below is always true and the fd is never stored
	 * in (or reused from) the file-scope variable. On success the fd is
	 * also never closed — presumably the driver needs it to stay open
	 * for the mappings' lifetime; confirm against the dom0-mm driver
	 * before changing. */
	int xen_fd = -1;
	int ret = -1;
	void *vir_addr;
	char name[DOM0_NAME_MAX] = {0};
	int page_size = getpagesize();

	mcfg = rte_eal_get_configuration()->mem_config;

	/* Check FD and open once */
	if (xen_fd < 0) {
		xen_fd = open(DOM0_MM_DEV, O_RDWR);
		if (xen_fd < 0) {
			RTE_LOG(ERR, EAL, "Can not open %s\n",DOM0_MM_DEV);
			goto error;
		}
	}

	/* construct memory mangement name for Dom0 */
	/* must match the name the primary built in rte_xen_dom0_memory_init() */
	snprintf(name, DOM0_NAME_MAX, "%s-%s",
		internal_config.hugefile_prefix, DEFAUL_DOM0_NAME);
	/* attach to memory segments of primary process */
	ret = ioctl(xen_fd, RTE_DOM0_IOCTL_ATTACH_TO_MEMSEG, name);
	if (ret) {
		RTE_LOG(ERR, EAL,"attach memory segments fail.\n");
		goto error;
	}

	/* map all segments into memory to make sure we get the addrs */
	for (s = 0; s < RTE_MAX_MEMSEG; ++s) {

		/*
		 * the first memory segment with len==0 is the one that
		 * follows the last valid segment.
		 */
		if (mcfg->memseg[s].len == 0)
			break;

		/* MAP_FIXED at the primary's address so pointers shared via
		 * the runtime config stay valid in this process; the file
		 * offset selects segment s in the driver (one page_size
		 * stride per segment, mirroring the init path). */
		vir_addr = mmap(mcfg->memseg[s].addr, mcfg->memseg[s].len,
			PROT_READ|PROT_WRITE, MAP_SHARED|MAP_FIXED, xen_fd,
			s * page_size);
		if (vir_addr == MAP_FAILED) {
			RTE_LOG(ERR, EAL, "Could not mmap %llu bytes "
				"in %s to requested address [%p]\n",
				(unsigned long long)mcfg->memseg[s].len, DOM0_MM_DEV,
				mcfg->memseg[s].addr);
			goto error;
		}
	}
	return 0;

error:
	if (xen_fd >= 0) {
		close(xen_fd);
		xen_fd = -1;
	}
	return -1;
}