]>
Commit | Line | Data |
---|---|---|
769071ac AV |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* | |
3 | * Author: Andrei Vagin <avagin@openvz.org> | |
4 | * Author: Dmitry Safonov <dima@arista.com> | |
5 | */ | |
6 | ||
7 | #include <linux/time_namespace.h> | |
8 | #include <linux/user_namespace.h> | |
9 | #include <linux/sched/signal.h> | |
10 | #include <linux/sched/task.h> | |
2d6b01bd | 11 | #include <linux/clocksource.h> |
04a8682a | 12 | #include <linux/seq_file.h> |
769071ac AV |
13 | #include <linux/proc_ns.h> |
14 | #include <linux/export.h> | |
15 | #include <linux/time.h> | |
16 | #include <linux/slab.h> | |
17 | #include <linux/cred.h> | |
18 | #include <linux/err.h> | |
af993f58 | 19 | #include <linux/mm.h> |
769071ac | 20 | |
afaa7b5a DS |
21 | #include <vdso/datapage.h> |
22 | ||
89dd8eec AV |
23 | ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim, |
24 | struct timens_offsets *ns_offsets) | |
25 | { | |
26 | ktime_t offset; | |
27 | ||
28 | switch (clockid) { | |
29 | case CLOCK_MONOTONIC: | |
30 | offset = timespec64_to_ktime(ns_offsets->monotonic); | |
31 | break; | |
32 | case CLOCK_BOOTTIME: | |
33 | case CLOCK_BOOTTIME_ALARM: | |
34 | offset = timespec64_to_ktime(ns_offsets->boottime); | |
35 | break; | |
36 | default: | |
37 | return tim; | |
38 | } | |
39 | ||
40 | /* | |
41 | * Check that @tim value is in [offset, KTIME_MAX + offset] | |
42 | * and subtract offset. | |
43 | */ | |
44 | if (tim < offset) { | |
45 | /* | |
46 | * User can specify @tim *absolute* value - if it's lesser than | |
47 | * the time namespace's offset - it's already expired. | |
48 | */ | |
49 | tim = 0; | |
50 | } else { | |
51 | tim = ktime_sub(tim, offset); | |
52 | if (unlikely(tim > KTIME_MAX)) | |
53 | tim = KTIME_MAX; | |
54 | } | |
55 | ||
56 | return tim; | |
57 | } | |
58 | ||
769071ac AV |
59 | static struct ucounts *inc_time_namespaces(struct user_namespace *ns) |
60 | { | |
61 | return inc_ucount(ns, current_euid(), UCOUNT_TIME_NAMESPACES); | |
62 | } | |
63 | ||
64 | static void dec_time_namespaces(struct ucounts *ucounts) | |
65 | { | |
66 | dec_ucount(ucounts, UCOUNT_TIME_NAMESPACES); | |
67 | } | |
68 | ||
69 | /** | |
70 | * clone_time_ns - Clone a time namespace | |
71 | * @user_ns: User namespace which owns a new namespace. | |
72 | * @old_ns: Namespace to clone | |
73 | * | |
74 | * Clone @old_ns and set the clone refcount to 1 | |
75 | * | |
76 | * Return: The new namespace or ERR_PTR. | |
77 | */ | |
78 | static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, | |
79 | struct time_namespace *old_ns) | |
80 | { | |
81 | struct time_namespace *ns; | |
82 | struct ucounts *ucounts; | |
83 | int err; | |
84 | ||
85 | err = -ENOSPC; | |
86 | ucounts = inc_time_namespaces(user_ns); | |
87 | if (!ucounts) | |
88 | goto fail; | |
89 | ||
90 | err = -ENOMEM; | |
91 | ns = kmalloc(sizeof(*ns), GFP_KERNEL); | |
92 | if (!ns) | |
93 | goto fail_dec; | |
94 | ||
28c41efd | 95 | refcount_set(&ns->ns.count, 1); |
769071ac | 96 | |
afaa7b5a DS |
97 | ns->vvar_page = alloc_page(GFP_KERNEL | __GFP_ZERO); |
98 | if (!ns->vvar_page) | |
99 | goto fail_free; | |
100 | ||
769071ac AV |
101 | err = ns_alloc_inum(&ns->ns); |
102 | if (err) | |
afaa7b5a | 103 | goto fail_free_page; |
769071ac AV |
104 | |
105 | ns->ucounts = ucounts; | |
106 | ns->ns.ops = &timens_operations; | |
107 | ns->user_ns = get_user_ns(user_ns); | |
af993f58 | 108 | ns->offsets = old_ns->offsets; |
afaa7b5a | 109 | ns->frozen_offsets = false; |
769071ac AV |
110 | return ns; |
111 | ||
afaa7b5a DS |
112 | fail_free_page: |
113 | __free_page(ns->vvar_page); | |
769071ac AV |
114 | fail_free: |
115 | kfree(ns); | |
116 | fail_dec: | |
117 | dec_time_namespaces(ucounts); | |
118 | fail: | |
119 | return ERR_PTR(err); | |
120 | } | |
121 | ||
122 | /** | |
123 | * copy_time_ns - Create timens_for_children from @old_ns | |
124 | * @flags: Cloning flags | |
125 | * @user_ns: User namespace which owns a new namespace. | |
126 | * @old_ns: Namespace to clone | |
127 | * | |
128 | * If CLONE_NEWTIME specified in @flags, creates a new timens_for_children; | |
129 | * adds a refcounter to @old_ns otherwise. | |
130 | * | |
131 | * Return: timens_for_children namespace or ERR_PTR. | |
132 | */ | |
133 | struct time_namespace *copy_time_ns(unsigned long flags, | |
134 | struct user_namespace *user_ns, struct time_namespace *old_ns) | |
135 | { | |
136 | if (!(flags & CLONE_NEWTIME)) | |
137 | return get_time_ns(old_ns); | |
138 | ||
139 | return clone_time_ns(user_ns, old_ns); | |
140 | } | |
141 | ||
afaa7b5a DS |
142 | static struct timens_offset offset_from_ts(struct timespec64 off) |
143 | { | |
144 | struct timens_offset ret; | |
145 | ||
146 | ret.sec = off.tv_sec; | |
147 | ret.nsec = off.tv_nsec; | |
148 | ||
149 | return ret; | |
150 | } | |
151 | ||
152 | /* | |
153 | * A time namespace VVAR page has the same layout as the VVAR page which | |
154 | * contains the system wide VDSO data. | |
155 | * | |
156 | * For a normal task the VVAR pages are installed in the normal ordering: | |
157 | * VVAR | |
158 | * PVCLOCK | |
159 | * HVCLOCK | |
160 | * TIMENS <- Not really required | |
161 | * | |
162 | * Now for a timens task the pages are installed in the following order: | |
163 | * TIMENS | |
164 | * PVCLOCK | |
165 | * HVCLOCK | |
166 | * VVAR | |
167 | * | |
168 | * The check for vdso_data->clock_mode is in the unlikely path of | |
169 | * the seq begin magic. So for the non-timens case most of the time | |
170 | * 'seq' is even, so the branch is not taken. | |
171 | * | |
172 | * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check | |
173 | * for vdso_data->clock_mode is a non-issue. The task is spin waiting for the | |
174 | * update to finish and for 'seq' to become even anyway. | |
175 | * | |
2d6b01bd TG |
176 | * Timens page has vdso_data->clock_mode set to VDSO_CLOCKMODE_TIMENS which |
177 | * enforces the time namespace handling path. | |
afaa7b5a DS |
178 | */ |
179 | static void timens_setup_vdso_data(struct vdso_data *vdata, | |
180 | struct time_namespace *ns) | |
181 | { | |
182 | struct timens_offset *offset = vdata->offset; | |
183 | struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic); | |
184 | struct timens_offset boottime = offset_from_ts(ns->offsets.boottime); | |
185 | ||
186 | vdata->seq = 1; | |
2d6b01bd | 187 | vdata->clock_mode = VDSO_CLOCKMODE_TIMENS; |
afaa7b5a DS |
188 | offset[CLOCK_MONOTONIC] = monotonic; |
189 | offset[CLOCK_MONOTONIC_RAW] = monotonic; | |
190 | offset[CLOCK_MONOTONIC_COARSE] = monotonic; | |
191 | offset[CLOCK_BOOTTIME] = boottime; | |
192 | offset[CLOCK_BOOTTIME_ALARM] = boottime; | |
193 | } | |
194 | ||
195 | /* | |
196 | * Protects possibly multiple offsets writers racing each other | |
197 | * and tasks entering the namespace. | |
198 | */ | |
199 | static DEFINE_MUTEX(offset_lock); | |
200 | ||
201 | static void timens_set_vvar_page(struct task_struct *task, | |
202 | struct time_namespace *ns) | |
203 | { | |
204 | struct vdso_data *vdata; | |
205 | unsigned int i; | |
206 | ||
207 | if (ns == &init_time_ns) | |
208 | return; | |
209 | ||
210 | /* Fast-path, taken by every task in namespace except the first. */ | |
211 | if (likely(ns->frozen_offsets)) | |
212 | return; | |
213 | ||
214 | mutex_lock(&offset_lock); | |
215 | /* Nothing to-do: vvar_page has been already initialized. */ | |
216 | if (ns->frozen_offsets) | |
217 | goto out; | |
218 | ||
219 | ns->frozen_offsets = true; | |
220 | vdata = arch_get_vdso_data(page_address(ns->vvar_page)); | |
221 | ||
222 | for (i = 0; i < CS_BASES; i++) | |
223 | timens_setup_vdso_data(&vdata[i], ns); | |
224 | ||
225 | out: | |
226 | mutex_unlock(&offset_lock); | |
227 | } | |
228 | ||
28c41efd | 229 | void free_time_ns(struct time_namespace *ns) |
769071ac | 230 | { |
769071ac AV |
231 | dec_time_namespaces(ns->ucounts); |
232 | put_user_ns(ns->user_ns); | |
233 | ns_free_inum(&ns->ns); | |
afaa7b5a | 234 | __free_page(ns->vvar_page); |
769071ac AV |
235 | kfree(ns); |
236 | } | |
237 | ||
238 | static struct time_namespace *to_time_ns(struct ns_common *ns) | |
239 | { | |
240 | return container_of(ns, struct time_namespace, ns); | |
241 | } | |
242 | ||
243 | static struct ns_common *timens_get(struct task_struct *task) | |
244 | { | |
245 | struct time_namespace *ns = NULL; | |
246 | struct nsproxy *nsproxy; | |
247 | ||
248 | task_lock(task); | |
249 | nsproxy = task->nsproxy; | |
250 | if (nsproxy) { | |
251 | ns = nsproxy->time_ns; | |
252 | get_time_ns(ns); | |
253 | } | |
254 | task_unlock(task); | |
255 | ||
256 | return ns ? &ns->ns : NULL; | |
257 | } | |
258 | ||
259 | static struct ns_common *timens_for_children_get(struct task_struct *task) | |
260 | { | |
261 | struct time_namespace *ns = NULL; | |
262 | struct nsproxy *nsproxy; | |
263 | ||
264 | task_lock(task); | |
265 | nsproxy = task->nsproxy; | |
266 | if (nsproxy) { | |
267 | ns = nsproxy->time_ns_for_children; | |
268 | get_time_ns(ns); | |
269 | } | |
270 | task_unlock(task); | |
271 | ||
272 | return ns ? &ns->ns : NULL; | |
273 | } | |
274 | ||
/* proc_ns_operations ->put: drop one reference on the time namespace. */
static void timens_put(struct ns_common *ns)
{
	struct time_namespace *time_ns = to_time_ns(ns);

	put_time_ns(time_ns);
}
279 | ||
/*
 * Make @ns effective for @tsk: set up the namespace VVAR page if that has
 * not happened yet, then let the VDSO code join @tsk to the namespace.
 */
void timens_commit(struct task_struct *tsk, struct time_namespace *ns)
{
	/* The VVAR page is prepared before vdso_join_timens() is called. */
	timens_set_vvar_page(tsk, ns);

	vdso_join_timens(tsk, ns);
}
285 | ||
f2a8d52e | 286 | static int timens_install(struct nsset *nsset, struct ns_common *new) |
769071ac | 287 | { |
f2a8d52e | 288 | struct nsproxy *nsproxy = nsset->nsproxy; |
769071ac AV |
289 | struct time_namespace *ns = to_time_ns(new); |
290 | ||
291 | if (!current_is_single_threaded()) | |
292 | return -EUSERS; | |
293 | ||
294 | if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || | |
f2a8d52e | 295 | !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) |
769071ac AV |
296 | return -EPERM; |
297 | ||
298 | get_time_ns(ns); | |
299 | put_time_ns(nsproxy->time_ns); | |
300 | nsproxy->time_ns = ns; | |
301 | ||
302 | get_time_ns(ns); | |
303 | put_time_ns(nsproxy->time_ns_for_children); | |
304 | nsproxy->time_ns_for_children = ns; | |
305 | return 0; | |
306 | } | |
307 | ||
5c62634f | 308 | void timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) |
769071ac AV |
309 | { |
310 | struct ns_common *nsc = &nsproxy->time_ns_for_children->ns; | |
311 | struct time_namespace *ns = to_time_ns(nsc); | |
312 | ||
313 | /* create_new_namespaces() already incremented the ref counter */ | |
314 | if (nsproxy->time_ns == nsproxy->time_ns_for_children) | |
5c62634f | 315 | return; |
769071ac AV |
316 | |
317 | get_time_ns(ns); | |
318 | put_time_ns(nsproxy->time_ns); | |
319 | nsproxy->time_ns = ns; | |
320 | ||
5cfea9a1 | 321 | timens_commit(tsk, ns); |
769071ac AV |
322 | } |
323 | ||
324 | static struct user_namespace *timens_owner(struct ns_common *ns) | |
325 | { | |
326 | return to_time_ns(ns)->user_ns; | |
327 | } | |
328 | ||
04a8682a AV |
329 | static void show_offset(struct seq_file *m, int clockid, struct timespec64 *ts) |
330 | { | |
94d440d6 AV |
331 | char *clock; |
332 | ||
333 | switch (clockid) { | |
334 | case CLOCK_BOOTTIME: | |
335 | clock = "boottime"; | |
336 | break; | |
337 | case CLOCK_MONOTONIC: | |
338 | clock = "monotonic"; | |
339 | break; | |
340 | default: | |
341 | clock = "unknown"; | |
342 | break; | |
343 | } | |
344 | seq_printf(m, "%-10s %10lld %9ld\n", clock, ts->tv_sec, ts->tv_nsec); | |
04a8682a AV |
345 | } |
346 | ||
347 | void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m) | |
348 | { | |
349 | struct ns_common *ns; | |
350 | struct time_namespace *time_ns; | |
351 | ||
352 | ns = timens_for_children_get(p); | |
353 | if (!ns) | |
354 | return; | |
355 | time_ns = to_time_ns(ns); | |
356 | ||
357 | show_offset(m, CLOCK_MONOTONIC, &time_ns->offsets.monotonic); | |
358 | show_offset(m, CLOCK_BOOTTIME, &time_ns->offsets.boottime); | |
359 | put_time_ns(time_ns); | |
360 | } | |
361 | ||
362 | int proc_timens_set_offset(struct file *file, struct task_struct *p, | |
363 | struct proc_timens_offset *offsets, int noffsets) | |
364 | { | |
365 | struct ns_common *ns; | |
366 | struct time_namespace *time_ns; | |
367 | struct timespec64 tp; | |
368 | int i, err; | |
369 | ||
370 | ns = timens_for_children_get(p); | |
371 | if (!ns) | |
372 | return -ESRCH; | |
373 | time_ns = to_time_ns(ns); | |
374 | ||
375 | if (!file_ns_capable(file, time_ns->user_ns, CAP_SYS_TIME)) { | |
376 | put_time_ns(time_ns); | |
377 | return -EPERM; | |
378 | } | |
379 | ||
380 | for (i = 0; i < noffsets; i++) { | |
381 | struct proc_timens_offset *off = &offsets[i]; | |
382 | ||
383 | switch (off->clockid) { | |
384 | case CLOCK_MONOTONIC: | |
385 | ktime_get_ts64(&tp); | |
386 | break; | |
387 | case CLOCK_BOOTTIME: | |
388 | ktime_get_boottime_ts64(&tp); | |
389 | break; | |
390 | default: | |
391 | err = -EINVAL; | |
392 | goto out; | |
393 | } | |
394 | ||
395 | err = -ERANGE; | |
396 | ||
397 | if (off->val.tv_sec > KTIME_SEC_MAX || | |
398 | off->val.tv_sec < -KTIME_SEC_MAX) | |
399 | goto out; | |
400 | ||
401 | tp = timespec64_add(tp, off->val); | |
402 | /* | |
403 | * KTIME_SEC_MAX is divided by 2 to be sure that KTIME_MAX is | |
404 | * still unreachable. | |
405 | */ | |
406 | if (tp.tv_sec < 0 || tp.tv_sec > KTIME_SEC_MAX / 2) | |
407 | goto out; | |
408 | } | |
409 | ||
410 | mutex_lock(&offset_lock); | |
411 | if (time_ns->frozen_offsets) { | |
412 | err = -EACCES; | |
413 | goto out_unlock; | |
414 | } | |
415 | ||
416 | err = 0; | |
417 | /* Don't report errors after this line */ | |
418 | for (i = 0; i < noffsets; i++) { | |
419 | struct proc_timens_offset *off = &offsets[i]; | |
420 | struct timespec64 *offset = NULL; | |
421 | ||
422 | switch (off->clockid) { | |
423 | case CLOCK_MONOTONIC: | |
424 | offset = &time_ns->offsets.monotonic; | |
425 | break; | |
426 | case CLOCK_BOOTTIME: | |
427 | offset = &time_ns->offsets.boottime; | |
428 | break; | |
429 | } | |
430 | ||
431 | *offset = off->val; | |
432 | } | |
433 | ||
434 | out_unlock: | |
435 | mutex_unlock(&offset_lock); | |
436 | out: | |
437 | put_time_ns(time_ns); | |
438 | ||
439 | return err; | |
440 | } | |
441 | ||
769071ac AV |
442 | const struct proc_ns_operations timens_operations = { |
443 | .name = "time", | |
444 | .type = CLONE_NEWTIME, | |
445 | .get = timens_get, | |
446 | .put = timens_put, | |
447 | .install = timens_install, | |
448 | .owner = timens_owner, | |
449 | }; | |
450 | ||
451 | const struct proc_ns_operations timens_for_children_operations = { | |
452 | .name = "time_for_children", | |
b801f1e2 | 453 | .real_ns_name = "time", |
769071ac AV |
454 | .type = CLONE_NEWTIME, |
455 | .get = timens_for_children_get, | |
456 | .put = timens_put, | |
457 | .install = timens_install, | |
458 | .owner = timens_owner, | |
459 | }; | |
460 | ||
461 | struct time_namespace init_time_ns = { | |
28c41efd | 462 | .ns.count = REFCOUNT_INIT(3), |
769071ac AV |
463 | .user_ns = &init_user_ns, |
464 | .ns.inum = PROC_TIME_INIT_INO, | |
465 | .ns.ops = &timens_operations, | |
afaa7b5a | 466 | .frozen_offsets = true, |
769071ac | 467 | }; |