]> git.proxmox.com Git - mirror_qemu.git/blame - tools/virtiofsd/passthrough_ll.c
virtiofsd: load_capng missing unlock
[mirror_qemu.git] / tools / virtiofsd / passthrough_ll.c
CommitLineData
/*
 * FUSE: Filesystem in Userspace
 * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu>
 *
 * This program can be distributed under the terms of the GNU GPLv2.
 * See the file COPYING.
 */

/*
 *
 * This file system mirrors the existing file system hierarchy of the
 * system, starting at the root file system. This is implemented by
 * just "passing through" all requests to the corresponding user-space
 * libc functions. In contrast to passthrough.c and passthrough_fh.c,
 * this implementation uses the low-level API. Its performance should
 * be the least bad among the three, but many operations are not
 * implemented. In particular, it is not possible to remove files (or
 * directories) because the code necessary to defer actual removal
 * until the file is not opened anymore would make the example much
 * more complicated.
 *
 * When writeback caching is enabled (-o writeback mount option), it
 * is only possible to write to files for which the mounting user has
 * read permissions. This is because the writeback cache requires the
 * kernel to be able to issue read requests for all files (which the
 * passthrough filesystem cannot satisfy if it can't read the file in
 * the underlying filesystem).
 *
 * Compile with:
 *
 *     gcc -Wall passthrough_ll.c `pkg-config fuse3 --cflags --libs` -o
 *     passthrough_ll
 *
 * ## Source code ##
 * \include passthrough_ll.c
 */
37
09863ebc 38#include "qemu/osdep.h"
50fb955a 39#include "qemu/timer.h"
f6f3573c 40#include "fuse_virtio.h"
d240314a 41#include "fuse_log.h"
09863ebc 42#include "fuse_lowlevel.h"
7c6b6602 43#include <assert.h>
2405f3c0 44#include <cap-ng.h>
7387863d 45#include <dirent.h>
7c6b6602 46#include <errno.h>
36f38469 47#include <glib.h>
7c6b6602 48#include <inttypes.h>
7387863d 49#include <limits.h>
7c6b6602 50#include <pthread.h>
7387863d
DDAG
51#include <stdbool.h>
52#include <stddef.h>
53#include <stdio.h>
54#include <stdlib.h>
55#include <string.h>
7c6b6602 56#include <sys/file.h>
5baa3b8e 57#include <sys/mount.h>
8e1d4ef2 58#include <sys/prctl.h>
01a6dc95 59#include <sys/resource.h>
929cfb7a 60#include <sys/syscall.h>
8e1d4ef2
SH
61#include <sys/types.h>
62#include <sys/wait.h>
7c6b6602 63#include <sys/xattr.h>
f185621d 64#include <syslog.h>
7387863d 65#include <unistd.h>
7c6b6602
DDAG
66
67#include "passthrough_helpers.h"
4f8bde99 68#include "seccomp.h"
7c6b6602 69
0e81414c
VG
/*
 * POSIX lock bookkeeping for one lock owner of an inode.  Entries are the
 * values stored in lo_inode->posix_locks.
 */
struct lo_inode_plock {
    uint64_t lock_owner; /* owner this entry belongs to */
    int fd;              /* fd for OFD locks */
};
75
25c13572
SH
/*
 * One slot of a struct lo_map.
 *
 * While in_use is true the slot carries one of the object members (which
 * one depends on the map the slot belongs to).  While in_use is false,
 * freelist holds the index of the next free slot, or -1 at the end of the
 * free list.
 */
struct lo_map_elem {
    union {
        struct lo_inode *inode;
        struct lo_dirp *dirp;
        int fd;
        ssize_t freelist;
    };
    bool in_use;
};
85
/*
 * Maps FUSE fh or ino values to internal objects.
 *
 * The key is the index into elems; unused slots are chained through
 * their freelist member starting at this structure's freelist field.
 */
struct lo_map {
    struct lo_map_elem *elems; /* slot array, (re)allocated on demand */
    size_t nelems;             /* number of slots in elems */
    ssize_t freelist;          /* index of first free slot, -1 if none */
};
92
bfc50a6e
MS
/* Identity of a host file (inode number + device), used to index lo->inodes. */
struct lo_key {
    ino_t ino;
    dev_t dev;
};
97
7c6b6602 98struct lo_inode {
7387863d 99 int fd;
c241aa94
SH
100
101 /*
102 * Atomic reference count for this object. The nlookup field holds a
103 * reference and release it when nlookup reaches 0.
104 */
105 gint refcount;
106
bfc50a6e 107 struct lo_key key;
1222f015
SH
108
109 /*
110 * This counter keeps the inode alive during the FUSE session.
111 * Incremented when the FUSE inode number is sent in a reply
112 * (FUSE_LOOKUP, FUSE_READDIRPLUS, etc). Decremented when an inode is
113 * released by requests like FUSE_FORGET, FUSE_RMDIR, FUSE_RENAME, etc.
114 *
115 * Note that this value is untrusted because the client can manipulate
116 * it arbitrarily using FUSE_FORGET requests.
117 *
118 * Protected by lo->mutex.
119 */
120 uint64_t nlookup;
121
92fb57b8 122 fuse_ino_t fuse_ino;
0e81414c
VG
123 pthread_mutex_t plock_mutex;
124 GHashTable *posix_locks; /* protected by lo_inode->plock_mutex */
c241aa94
SH
125
126 bool is_symlink;
7c6b6602
DDAG
127};
128
929cfb7a
VG
/* Effective credentials saved by lo_change_cred() for lo_restore_cred(). */
struct lo_cred {
    uid_t euid;
    gid_t egid;
};
133
/* Values of lo_data.cache, selected with the cache=none|auto|always option. */
enum {
    CACHE_NONE = 0,
    CACHE_AUTO = 1,
    CACHE_ALWAYS = 2,
};
139
140struct lo_data {
7387863d
DDAG
141 pthread_mutex_t mutex;
142 int debug;
5fe319a7 143 int norace;
7387863d
DDAG
144 int writeback;
145 int flock;
0e81414c 146 int posix_lock;
7387863d 147 int xattr;
eb68a33b 148 char *source;
7387863d
DDAG
149 double timeout;
150 int cache;
151 int timeout_set;
59aef494
MS
152 int readdirplus_set;
153 int readdirplus_clear;
bfc50a6e
MS
154 struct lo_inode root;
155 GHashTable *inodes; /* protected by lo->mutex */
92fb57b8 156 struct lo_map ino_map; /* protected by lo->mutex */
b39bce12 157 struct lo_map dirp_map; /* protected by lo->mutex */
73b4d19d 158 struct lo_map fd_map; /* protected by lo->mutex */
9f59d175
SH
159
160 /* An O_PATH file descriptor to /proc/self/fd/ */
161 int proc_self_fd;
7c6b6602
DDAG
162};
163
164static const struct fuse_opt lo_opts[] = {
7387863d
DDAG
165 { "writeback", offsetof(struct lo_data, writeback), 1 },
166 { "no_writeback", offsetof(struct lo_data, writeback), 0 },
167 { "source=%s", offsetof(struct lo_data, source), 0 },
168 { "flock", offsetof(struct lo_data, flock), 1 },
169 { "no_flock", offsetof(struct lo_data, flock), 0 },
0e81414c
VG
170 { "posix_lock", offsetof(struct lo_data, posix_lock), 1 },
171 { "no_posix_lock", offsetof(struct lo_data, posix_lock), 0 },
7387863d
DDAG
172 { "xattr", offsetof(struct lo_data, xattr), 1 },
173 { "no_xattr", offsetof(struct lo_data, xattr), 0 },
174 { "timeout=%lf", offsetof(struct lo_data, timeout), 0 },
175 { "timeout=", offsetof(struct lo_data, timeout_set), 1 },
230e777b
MS
176 { "cache=none", offsetof(struct lo_data, cache), CACHE_NONE },
177 { "cache=auto", offsetof(struct lo_data, cache), CACHE_AUTO },
7387863d 178 { "cache=always", offsetof(struct lo_data, cache), CACHE_ALWAYS },
5fe319a7 179 { "norace", offsetof(struct lo_data, norace), 1 },
59aef494
MS
180 { "readdirplus", offsetof(struct lo_data, readdirplus_set), 1 },
181 { "no_readdirplus", offsetof(struct lo_data, readdirplus_clear), 1 },
7387863d 182 FUSE_OPT_END
7c6b6602 183};
/* Logging configuration */
static bool use_syslog = false;
static int current_log_level;

/* Forward declarations for helpers defined later in this file */
static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode,
                                 uint64_t n);
static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st);

/* Saved cap-ng state shared by all threads; see load_capng() */
static struct {
    pthread_mutex_t mutex;
    void *saved;
} cap;

/* Set once the current thread has loaded cap-ng from the saved state */
static __thread bool cap_loaded = false;
197
25dae28c
SH
/* Return 1 when `name` is exactly "." or "..", 0 otherwise. */
static int is_dot_or_dotdot(const char *name)
{
    if (name[0] != '.') {
        return 0;
    }
    if (name[1] == '\0') {
        return 1;
    }
    return name[1] == '.' && name[2] == '\0';
}
203
/*
 * Return 1 when `path` is a single path component (contains no '/') that
 * is neither "." nor "..", and 0 otherwise.
 */
static int is_safe_path_component(const char *path)
{
    return strchr(path, '/') == NULL && !is_dot_or_dotdot(path);
}
5fe319a7 213
7c6b6602
DDAG
214static struct lo_data *lo_data(fuse_req_t req)
215{
7387863d 216 return (struct lo_data *)fuse_req_userdata(req);
7c6b6602
DDAG
217}
218
2405f3c0
DDAG
219/*
220 * Load capng's state from our saved state if the current thread
221 * hadn't previously been loaded.
222 * returns 0 on success
223 */
224static int load_capng(void)
225{
226 if (!cap_loaded) {
227 pthread_mutex_lock(&cap.mutex);
228 capng_restore_state(&cap.saved);
229 /*
230 * restore_state free's the saved copy
231 * so make another.
232 */
233 cap.saved = capng_save_state();
234 if (!cap.saved) {
68639111 235 pthread_mutex_unlock(&cap.mutex);
2405f3c0
DDAG
236 fuse_log(FUSE_LOG_ERR, "capng_save_state (thread)\n");
237 return -EINVAL;
238 }
239 pthread_mutex_unlock(&cap.mutex);
240
241 /*
242 * We want to use the loaded state for our pid,
243 * not the original
244 */
245 capng_setpid(syscall(SYS_gettid));
246 cap_loaded = true;
247 }
248 return 0;
249}
250
ee884652
VG
251/*
252 * Helpers for dropping and regaining effective capabilities. Returns 0
253 * on success, error otherwise
254 */
255static int drop_effective_cap(const char *cap_name, bool *cap_dropped)
256{
257 int cap, ret;
258
259 cap = capng_name_to_capability(cap_name);
260 if (cap < 0) {
261 ret = errno;
262 fuse_log(FUSE_LOG_ERR, "capng_name_to_capability(%s) failed:%s\n",
263 cap_name, strerror(errno));
264 goto out;
265 }
266
267 if (load_capng()) {
268 ret = errno;
269 fuse_log(FUSE_LOG_ERR, "load_capng() failed\n");
270 goto out;
271 }
272
273 /* We dont have this capability in effective set already. */
274 if (!capng_have_capability(CAPNG_EFFECTIVE, cap)) {
275 ret = 0;
276 goto out;
277 }
278
279 if (capng_update(CAPNG_DROP, CAPNG_EFFECTIVE, cap)) {
280 ret = errno;
281 fuse_log(FUSE_LOG_ERR, "capng_update(DROP,) failed\n");
282 goto out;
283 }
284
285 if (capng_apply(CAPNG_SELECT_CAPS)) {
286 ret = errno;
287 fuse_log(FUSE_LOG_ERR, "drop:capng_apply() failed\n");
288 goto out;
289 }
290
291 ret = 0;
292 if (cap_dropped) {
293 *cap_dropped = true;
294 }
295
296out:
297 return ret;
298}
299
300static int gain_effective_cap(const char *cap_name)
301{
302 int cap;
303 int ret = 0;
304
305 cap = capng_name_to_capability(cap_name);
306 if (cap < 0) {
307 ret = errno;
308 fuse_log(FUSE_LOG_ERR, "capng_name_to_capability(%s) failed:%s\n",
309 cap_name, strerror(errno));
310 goto out;
311 }
312
313 if (load_capng()) {
314 ret = errno;
315 fuse_log(FUSE_LOG_ERR, "load_capng() failed\n");
316 goto out;
317 }
318
319 if (capng_update(CAPNG_ADD, CAPNG_EFFECTIVE, cap)) {
320 ret = errno;
321 fuse_log(FUSE_LOG_ERR, "capng_update(ADD,) failed\n");
322 goto out;
323 }
324
325 if (capng_apply(CAPNG_SELECT_CAPS)) {
326 ret = errno;
327 fuse_log(FUSE_LOG_ERR, "gain:capng_apply() failed\n");
328 goto out;
329 }
330 ret = 0;
331
332out:
333 return ret;
334}
335
92fb57b8 336static void lo_map_init(struct lo_map *map)
25c13572
SH
337{
338 map->elems = NULL;
339 map->nelems = 0;
340 map->freelist = -1;
341}
342
92fb57b8 343static void lo_map_destroy(struct lo_map *map)
25c13572
SH
344{
345 free(map->elems);
346}
347
348static int lo_map_grow(struct lo_map *map, size_t new_nelems)
349{
350 struct lo_map_elem *new_elems;
351 size_t i;
352
353 if (new_nelems <= map->nelems) {
354 return 1;
355 }
356
357 new_elems = realloc(map->elems, sizeof(map->elems[0]) * new_nelems);
358 if (!new_elems) {
359 return 0;
360 }
361
362 for (i = map->nelems; i < new_nelems; i++) {
363 new_elems[i].freelist = i + 1;
364 new_elems[i].in_use = false;
365 }
366 new_elems[new_nelems - 1].freelist = -1;
367
368 map->elems = new_elems;
369 map->freelist = map->nelems;
370 map->nelems = new_nelems;
371 return 1;
372}
373
92fb57b8 374static struct lo_map_elem *lo_map_alloc_elem(struct lo_map *map)
25c13572
SH
375{
376 struct lo_map_elem *elem;
377
378 if (map->freelist == -1 && !lo_map_grow(map, map->nelems + 256)) {
379 return NULL;
380 }
381
382 elem = &map->elems[map->freelist];
383 map->freelist = elem->freelist;
384
385 elem->in_use = true;
386
387 return elem;
388}
389
92fb57b8 390static struct lo_map_elem *lo_map_reserve(struct lo_map *map, size_t key)
25c13572
SH
391{
392 ssize_t *prev;
393
394 if (!lo_map_grow(map, key + 1)) {
395 return NULL;
396 }
397
398 for (prev = &map->freelist; *prev != -1;
399 prev = &map->elems[*prev].freelist) {
400 if (*prev == key) {
401 struct lo_map_elem *elem = &map->elems[key];
402
403 *prev = elem->freelist;
404 elem->in_use = true;
405 return elem;
406 }
407 }
408 return NULL;
409}
410
92fb57b8 411static struct lo_map_elem *lo_map_get(struct lo_map *map, size_t key)
25c13572
SH
412{
413 if (key >= map->nelems) {
414 return NULL;
415 }
416 if (!map->elems[key].in_use) {
417 return NULL;
418 }
419 return &map->elems[key];
420}
421
92fb57b8 422static void lo_map_remove(struct lo_map *map, size_t key)
25c13572
SH
423{
424 struct lo_map_elem *elem;
425
426 if (key >= map->nelems) {
427 return;
428 }
429
430 elem = &map->elems[key];
431 if (!elem->in_use) {
432 return;
433 }
434
435 elem->in_use = false;
436
437 elem->freelist = map->freelist;
438 map->freelist = key;
439}
440
73b4d19d
SH
441/* Assumes lo->mutex is held */
442static ssize_t lo_add_fd_mapping(fuse_req_t req, int fd)
443{
444 struct lo_map_elem *elem;
445
446 elem = lo_map_alloc_elem(&lo_data(req)->fd_map);
447 if (!elem) {
448 return -1;
449 }
450
451 elem->fd = fd;
452 return elem - lo_data(req)->fd_map.elems;
453}
454
b39bce12
SH
455/* Assumes lo->mutex is held */
456static ssize_t lo_add_dirp_mapping(fuse_req_t req, struct lo_dirp *dirp)
457{
458 struct lo_map_elem *elem;
459
460 elem = lo_map_alloc_elem(&lo_data(req)->dirp_map);
461 if (!elem) {
462 return -1;
463 }
464
465 elem->dirp = dirp;
466 return elem - lo_data(req)->dirp_map.elems;
467}
468
92fb57b8
SH
469/* Assumes lo->mutex is held */
470static ssize_t lo_add_inode_mapping(fuse_req_t req, struct lo_inode *inode)
471{
472 struct lo_map_elem *elem;
473
474 elem = lo_map_alloc_elem(&lo_data(req)->ino_map);
475 if (!elem) {
476 return -1;
477 }
478
479 elem->inode = inode;
480 return elem - lo_data(req)->ino_map.elems;
481}
482
c241aa94
SH
483static void lo_inode_put(struct lo_data *lo, struct lo_inode **inodep)
484{
485 struct lo_inode *inode = *inodep;
486
487 if (!inode) {
488 return;
489 }
490
491 *inodep = NULL;
492
493 if (g_atomic_int_dec_and_test(&inode->refcount)) {
494 close(inode->fd);
495 free(inode);
496 }
497}
498
499/* Caller must release refcount using lo_inode_put() */
7c6b6602
DDAG
500static struct lo_inode *lo_inode(fuse_req_t req, fuse_ino_t ino)
501{
92fb57b8
SH
502 struct lo_data *lo = lo_data(req);
503 struct lo_map_elem *elem;
504
505 pthread_mutex_lock(&lo->mutex);
506 elem = lo_map_get(&lo->ino_map, ino);
c241aa94
SH
507 if (elem) {
508 g_atomic_int_inc(&elem->inode->refcount);
509 }
92fb57b8
SH
510 pthread_mutex_unlock(&lo->mutex);
511
512 if (!elem) {
513 return NULL;
7387863d 514 }
92fb57b8
SH
515
516 return elem->inode;
7c6b6602
DDAG
517}
518
c241aa94
SH
519/*
520 * TODO Remove this helper and force callers to hold an inode refcount until
521 * they are done with the fd. This will be done in a later patch to make
522 * review easier.
523 */
7c6b6602
DDAG
524static int lo_fd(fuse_req_t req, fuse_ino_t ino)
525{
92fb57b8 526 struct lo_inode *inode = lo_inode(req, ino);
c241aa94
SH
527 int fd;
528
529 if (!inode) {
530 return -1;
531 }
532
533 fd = inode->fd;
534 lo_inode_put(lo_data(req), &inode);
535 return fd;
7c6b6602
DDAG
536}
537
7387863d 538static void lo_init(void *userdata, struct fuse_conn_info *conn)
7c6b6602 539{
7387863d
DDAG
540 struct lo_data *lo = (struct lo_data *)userdata;
541
542 if (conn->capable & FUSE_CAP_EXPORT_SUPPORT) {
543 conn->want |= FUSE_CAP_EXPORT_SUPPORT;
544 }
545
546 if (lo->writeback && conn->capable & FUSE_CAP_WRITEBACK_CACHE) {
d240314a 547 fuse_log(FUSE_LOG_DEBUG, "lo_init: activating writeback\n");
7387863d
DDAG
548 conn->want |= FUSE_CAP_WRITEBACK_CACHE;
549 }
e468d4af
PT
550 if (conn->capable & FUSE_CAP_FLOCK_LOCKS) {
551 if (lo->flock) {
552 fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n");
553 conn->want |= FUSE_CAP_FLOCK_LOCKS;
554 } else {
555 fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling flock locks\n");
556 conn->want &= ~FUSE_CAP_FLOCK_LOCKS;
557 }
7387863d 558 }
0e81414c
VG
559
560 if (conn->capable & FUSE_CAP_POSIX_LOCKS) {
561 if (lo->posix_lock) {
562 fuse_log(FUSE_LOG_DEBUG, "lo_init: activating posix locks\n");
563 conn->want |= FUSE_CAP_POSIX_LOCKS;
564 } else {
565 fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling posix locks\n");
566 conn->want &= ~FUSE_CAP_POSIX_LOCKS;
567 }
568 }
569
230e777b 570 if ((lo->cache == CACHE_NONE && !lo->readdirplus_set) ||
59aef494 571 lo->readdirplus_clear) {
ddcbabcb
MS
572 fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling readdirplus\n");
573 conn->want &= ~FUSE_CAP_READDIRPLUS;
574 }
7c6b6602
DDAG
575}
576
577static void lo_getattr(fuse_req_t req, fuse_ino_t ino,
7387863d 578 struct fuse_file_info *fi)
7c6b6602 579{
7387863d
DDAG
580 int res;
581 struct stat buf;
582 struct lo_data *lo = lo_data(req);
7c6b6602 583
7387863d 584 (void)fi;
7c6b6602 585
7387863d
DDAG
586 res =
587 fstatat(lo_fd(req, ino), "", &buf, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
588 if (res == -1) {
589 return (void)fuse_reply_err(req, errno);
590 }
7c6b6602 591
7387863d 592 fuse_reply_attr(req, &buf, lo->timeout);
7c6b6602
DDAG
593}
594
c241aa94
SH
595/*
596 * Increments parent->nlookup and caller must release refcount using
597 * lo_inode_put(&parent).
598 */
5fe319a7
MS
599static int lo_parent_and_name(struct lo_data *lo, struct lo_inode *inode,
600 char path[PATH_MAX], struct lo_inode **parent)
7c6b6602 601{
7387863d 602 char procname[64];
5fe319a7
MS
603 char *last;
604 struct stat stat;
605 struct lo_inode *p;
606 int retries = 2;
607 int res;
608
609retry:
9f59d175 610 sprintf(procname, "%i", inode->fd);
5fe319a7 611
9f59d175 612 res = readlinkat(lo->proc_self_fd, procname, path, PATH_MAX);
5fe319a7
MS
613 if (res < 0) {
614 fuse_log(FUSE_LOG_WARNING, "%s: readlink failed: %m\n", __func__);
615 goto fail_noretry;
616 }
617
618 if (res >= PATH_MAX) {
619 fuse_log(FUSE_LOG_WARNING, "%s: readlink overflowed\n", __func__);
620 goto fail_noretry;
621 }
622 path[res] = '\0';
623
624 last = strrchr(path, '/');
625 if (last == NULL) {
626 /* Shouldn't happen */
627 fuse_log(
628 FUSE_LOG_WARNING,
629 "%s: INTERNAL ERROR: bad path read from proc\n", __func__);
630 goto fail_noretry;
631 }
632 if (last == path) {
633 p = &lo->root;
634 pthread_mutex_lock(&lo->mutex);
1222f015 635 p->nlookup++;
c241aa94 636 g_atomic_int_inc(&p->refcount);
5fe319a7
MS
637 pthread_mutex_unlock(&lo->mutex);
638 } else {
639 *last = '\0';
640 res = fstatat(AT_FDCWD, last == path ? "/" : path, &stat, 0);
641 if (res == -1) {
642 if (!retries) {
643 fuse_log(FUSE_LOG_WARNING,
644 "%s: failed to stat parent: %m\n", __func__);
645 }
646 goto fail;
647 }
648 p = lo_find(lo, &stat);
649 if (p == NULL) {
650 if (!retries) {
651 fuse_log(FUSE_LOG_WARNING,
652 "%s: failed to find parent\n", __func__);
653 }
654 goto fail;
655 }
656 }
657 last++;
658 res = fstatat(p->fd, last, &stat, AT_SYMLINK_NOFOLLOW);
659 if (res == -1) {
660 if (!retries) {
661 fuse_log(FUSE_LOG_WARNING,
662 "%s: failed to stat last\n", __func__);
663 }
664 goto fail_unref;
665 }
bfc50a6e 666 if (stat.st_dev != inode->key.dev || stat.st_ino != inode->key.ino) {
5fe319a7
MS
667 if (!retries) {
668 fuse_log(FUSE_LOG_WARNING,
669 "%s: failed to match last\n", __func__);
670 }
671 goto fail_unref;
672 }
673 *parent = p;
674 memmove(path, last, strlen(last) + 1);
675
676 return 0;
677
678fail_unref:
95d27157 679 unref_inode_lolocked(lo, p, 1);
c241aa94 680 lo_inode_put(lo, &p);
5fe319a7
MS
681fail:
682 if (retries) {
683 retries--;
684 goto retry;
685 }
686fail_noretry:
687 errno = EIO;
688 return -1;
689}
690
691static int utimensat_empty(struct lo_data *lo, struct lo_inode *inode,
692 const struct timespec *tv)
693{
694 int res;
695 struct lo_inode *parent;
696 char path[PATH_MAX];
7387863d
DDAG
697
698 if (inode->is_symlink) {
5fe319a7 699 res = utimensat(inode->fd, "", tv, AT_EMPTY_PATH);
7387863d
DDAG
700 if (res == -1 && errno == EINVAL) {
701 /* Sorry, no race free way to set times on symlink. */
5fe319a7
MS
702 if (lo->norace) {
703 errno = EPERM;
704 } else {
705 goto fallback;
706 }
7387863d
DDAG
707 }
708 return res;
709 }
9f59d175 710 sprintf(path, "%i", inode->fd);
5fe319a7 711
9f59d175 712 return utimensat(lo->proc_self_fd, path, tv, 0);
7387863d 713
5fe319a7
MS
714fallback:
715 res = lo_parent_and_name(lo, inode, path, &parent);
716 if (res != -1) {
717 res = utimensat(parent->fd, path, tv, AT_SYMLINK_NOFOLLOW);
95d27157 718 unref_inode_lolocked(lo, parent, 1);
c241aa94 719 lo_inode_put(lo, &parent);
5fe319a7
MS
720 }
721
722 return res;
7c6b6602
DDAG
723}
724
73b4d19d
SH
725static int lo_fi_fd(fuse_req_t req, struct fuse_file_info *fi)
726{
727 struct lo_data *lo = lo_data(req);
728 struct lo_map_elem *elem;
729
730 pthread_mutex_lock(&lo->mutex);
731 elem = lo_map_get(&lo->fd_map, fi->fh);
732 pthread_mutex_unlock(&lo->mutex);
733
734 if (!elem) {
735 return -1;
736 }
737
738 return elem->fd;
739}
740
7c6b6602 741static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr,
7387863d 742 int valid, struct fuse_file_info *fi)
7c6b6602 743{
7387863d
DDAG
744 int saverr;
745 char procname[64];
5fe319a7 746 struct lo_data *lo = lo_data(req);
92fb57b8
SH
747 struct lo_inode *inode;
748 int ifd;
7387863d 749 int res;
73b4d19d 750 int fd;
7387863d 751
92fb57b8
SH
752 inode = lo_inode(req, ino);
753 if (!inode) {
754 fuse_reply_err(req, EBADF);
755 return;
756 }
757
758 ifd = inode->fd;
759
73b4d19d
SH
760 /* If fi->fh is invalid we'll report EBADF later */
761 if (fi) {
762 fd = lo_fi_fd(req, fi);
763 }
764
7387863d
DDAG
765 if (valid & FUSE_SET_ATTR_MODE) {
766 if (fi) {
73b4d19d 767 res = fchmod(fd, attr->st_mode);
7387863d 768 } else {
9f59d175
SH
769 sprintf(procname, "%i", ifd);
770 res = fchmodat(lo->proc_self_fd, procname, attr->st_mode, 0);
7387863d
DDAG
771 }
772 if (res == -1) {
773 goto out_err;
774 }
775 }
776 if (valid & (FUSE_SET_ATTR_UID | FUSE_SET_ATTR_GID)) {
777 uid_t uid = (valid & FUSE_SET_ATTR_UID) ? attr->st_uid : (uid_t)-1;
778 gid_t gid = (valid & FUSE_SET_ATTR_GID) ? attr->st_gid : (gid_t)-1;
779
780 res = fchownat(ifd, "", uid, gid, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
781 if (res == -1) {
782 goto out_err;
783 }
784 }
785 if (valid & FUSE_SET_ATTR_SIZE) {
9f59d175
SH
786 int truncfd;
787
7387863d 788 if (fi) {
9f59d175 789 truncfd = fd;
7387863d 790 } else {
9f59d175
SH
791 sprintf(procname, "%i", ifd);
792 truncfd = openat(lo->proc_self_fd, procname, O_RDWR);
793 if (truncfd < 0) {
794 goto out_err;
795 }
796 }
797
798 res = ftruncate(truncfd, attr->st_size);
799 if (!fi) {
800 saverr = errno;
801 close(truncfd);
802 errno = saverr;
7387863d
DDAG
803 }
804 if (res == -1) {
805 goto out_err;
806 }
807 }
808 if (valid & (FUSE_SET_ATTR_ATIME | FUSE_SET_ATTR_MTIME)) {
809 struct timespec tv[2];
810
811 tv[0].tv_sec = 0;
812 tv[1].tv_sec = 0;
813 tv[0].tv_nsec = UTIME_OMIT;
814 tv[1].tv_nsec = UTIME_OMIT;
815
816 if (valid & FUSE_SET_ATTR_ATIME_NOW) {
817 tv[0].tv_nsec = UTIME_NOW;
818 } else if (valid & FUSE_SET_ATTR_ATIME) {
819 tv[0] = attr->st_atim;
820 }
821
822 if (valid & FUSE_SET_ATTR_MTIME_NOW) {
823 tv[1].tv_nsec = UTIME_NOW;
824 } else if (valid & FUSE_SET_ATTR_MTIME) {
825 tv[1] = attr->st_mtim;
826 }
827
828 if (fi) {
73b4d19d 829 res = futimens(fd, tv);
7387863d 830 } else {
5fe319a7 831 res = utimensat_empty(lo, inode, tv);
7387863d
DDAG
832 }
833 if (res == -1) {
834 goto out_err;
835 }
836 }
c241aa94 837 lo_inode_put(lo, &inode);
7387863d
DDAG
838
839 return lo_getattr(req, ino, fi);
7c6b6602
DDAG
840
841out_err:
7387863d 842 saverr = errno;
c241aa94 843 lo_inode_put(lo, &inode);
7387863d 844 fuse_reply_err(req, saverr);
7c6b6602
DDAG
845}
846
847static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st)
848{
7387863d 849 struct lo_inode *p;
bfc50a6e
MS
850 struct lo_key key = {
851 .ino = st->st_ino,
852 .dev = st->st_dev,
853 };
7387863d
DDAG
854
855 pthread_mutex_lock(&lo->mutex);
bfc50a6e
MS
856 p = g_hash_table_lookup(lo->inodes, &key);
857 if (p) {
1222f015
SH
858 assert(p->nlookup > 0);
859 p->nlookup++;
c241aa94 860 g_atomic_int_inc(&p->refcount);
7387863d
DDAG
861 }
862 pthread_mutex_unlock(&lo->mutex);
bfc50a6e
MS
863
864 return p;
7c6b6602
DDAG
865}
866
0e81414c
VG
867/* value_destroy_func for posix_locks GHashTable */
868static void posix_locks_value_destroy(gpointer data)
869{
870 struct lo_inode_plock *plock = data;
871
872 /*
873 * We had used open() for locks and had only one fd. So
874 * closing this fd should release all OFD locks.
875 */
876 close(plock->fd);
877 free(plock);
878}
879
c241aa94
SH
880/*
881 * Increments nlookup and caller must release refcount using
882 * lo_inode_put(&parent).
883 */
7c6b6602 884static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name,
7387863d 885 struct fuse_entry_param *e)
7c6b6602 886{
7387863d
DDAG
887 int newfd;
888 int res;
889 int saverr;
890 struct lo_data *lo = lo_data(req);
c241aa94
SH
891 struct lo_inode *inode = NULL;
892 struct lo_inode *dir = lo_inode(req, parent);
7387863d 893
9de4fab5
MS
894 /*
895 * name_to_handle_at() and open_by_handle_at() can reach here with fuse
896 * mount point in guest, but we don't have its inode info in the
897 * ino_map.
898 */
899 if (!dir) {
900 return ENOENT;
901 }
902
7387863d
DDAG
903 memset(e, 0, sizeof(*e));
904 e->attr_timeout = lo->timeout;
905 e->entry_timeout = lo->timeout;
906
854684bc
SH
907 /* Do not allow escaping root directory */
908 if (dir == &lo->root && strcmp(name, "..") == 0) {
909 name = ".";
910 }
911
9de4fab5 912 newfd = openat(dir->fd, name, O_PATH | O_NOFOLLOW);
7387863d
DDAG
913 if (newfd == -1) {
914 goto out_err;
915 }
916
917 res = fstatat(newfd, "", &e->attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
918 if (res == -1) {
919 goto out_err;
920 }
921
9de4fab5 922 inode = lo_find(lo, &e->attr);
7387863d
DDAG
923 if (inode) {
924 close(newfd);
925 newfd = -1;
926 } else {
7387863d
DDAG
927 inode = calloc(1, sizeof(struct lo_inode));
928 if (!inode) {
929 goto out_err;
930 }
931
932 inode->is_symlink = S_ISLNK(e->attr.st_mode);
c241aa94
SH
933
934 /*
935 * One for the caller and one for nlookup (released in
936 * unref_inode_lolocked())
937 */
938 g_atomic_int_set(&inode->refcount, 2);
939
1222f015 940 inode->nlookup = 1;
7387863d 941 inode->fd = newfd;
9de4fab5 942 newfd = -1;
bfc50a6e
MS
943 inode->key.ino = e->attr.st_ino;
944 inode->key.dev = e->attr.st_dev;
0e81414c
VG
945 pthread_mutex_init(&inode->plock_mutex, NULL);
946 inode->posix_locks = g_hash_table_new_full(
947 g_direct_hash, g_direct_equal, NULL, posix_locks_value_destroy);
7387863d
DDAG
948
949 pthread_mutex_lock(&lo->mutex);
92fb57b8 950 inode->fuse_ino = lo_add_inode_mapping(req, inode);
bfc50a6e 951 g_hash_table_insert(lo->inodes, &inode->key, inode);
7387863d
DDAG
952 pthread_mutex_unlock(&lo->mutex);
953 }
92fb57b8 954 e->ino = inode->fuse_ino;
c241aa94
SH
955 lo_inode_put(lo, &inode);
956 lo_inode_put(lo, &dir);
7387863d 957
d240314a
EG
958 fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent,
959 name, (unsigned long long)e->ino);
7387863d
DDAG
960
961 return 0;
7c6b6602
DDAG
962
963out_err:
7387863d
DDAG
964 saverr = errno;
965 if (newfd != -1) {
966 close(newfd);
967 }
c241aa94
SH
968 lo_inode_put(lo, &inode);
969 lo_inode_put(lo, &dir);
7387863d 970 return saverr;
7c6b6602
DDAG
971}
972
973static void lo_lookup(fuse_req_t req, fuse_ino_t parent, const char *name)
974{
7387863d
DDAG
975 struct fuse_entry_param e;
976 int err;
977
d240314a
EG
978 fuse_log(FUSE_LOG_DEBUG, "lo_lookup(parent=%" PRIu64 ", name=%s)\n", parent,
979 name);
7387863d 980
25dae28c
SH
981 /*
982 * Don't use is_safe_path_component(), allow "." and ".." for NFS export
983 * support.
984 */
985 if (strchr(name, '/')) {
986 fuse_reply_err(req, EINVAL);
987 return;
988 }
989
7387863d
DDAG
990 err = lo_do_lookup(req, parent, name, &e);
991 if (err) {
992 fuse_reply_err(req, err);
993 } else {
994 fuse_reply_entry(req, &e);
995 }
7c6b6602
DDAG
996}
997
929cfb7a
VG
/*
 * System call numbers used for changing credentials.
 *
 * On some archs, setres*id is limited to 2^16 but they provide
 * setres*id32 variants that allow 2^32; prefer those when they exist.
 * Others just let setres*id do 2^32 anyway.
 */
#if defined(SYS_setresgid32)
#define OURSYS_setresgid SYS_setresgid32
#else
#define OURSYS_setresgid SYS_setresgid
#endif

#if defined(SYS_setresuid32)
#define OURSYS_setresuid SYS_setresuid32
#else
#define OURSYS_setresuid SYS_setresuid
#endif
1014
1015/*
1016 * Change to uid/gid of caller so that file is created with
1017 * ownership of caller.
1018 * TODO: What about selinux context?
1019 */
1020static int lo_change_cred(fuse_req_t req, struct lo_cred *old)
1021{
1022 int res;
1023
1024 old->euid = geteuid();
1025 old->egid = getegid();
1026
1027 res = syscall(OURSYS_setresgid, -1, fuse_req_ctx(req)->gid, -1);
1028 if (res == -1) {
1029 return errno;
1030 }
1031
1032 res = syscall(OURSYS_setresuid, -1, fuse_req_ctx(req)->uid, -1);
1033 if (res == -1) {
1034 int errno_save = errno;
1035
1036 syscall(OURSYS_setresgid, -1, old->egid, -1);
1037 return errno_save;
1038 }
1039
1040 return 0;
1041}
1042
1043/* Regain Privileges */
1044static void lo_restore_cred(struct lo_cred *old)
1045{
1046 int res;
1047
1048 res = syscall(OURSYS_setresuid, -1, old->euid, -1);
1049 if (res == -1) {
1050 fuse_log(FUSE_LOG_ERR, "seteuid(%u): %m\n", old->euid);
1051 exit(1);
1052 }
1053
1054 res = syscall(OURSYS_setresgid, -1, old->egid, -1);
1055 if (res == -1) {
1056 fuse_log(FUSE_LOG_ERR, "setegid(%u): %m\n", old->egid);
1057 exit(1);
1058 }
1059}
1060
7c6b6602 1061static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent,
7387863d
DDAG
1062 const char *name, mode_t mode, dev_t rdev,
1063 const char *link)
7c6b6602 1064{
7387863d
DDAG
1065 int res;
1066 int saverr;
c241aa94 1067 struct lo_data *lo = lo_data(req);
92fb57b8 1068 struct lo_inode *dir;
7387863d 1069 struct fuse_entry_param e;
929cfb7a 1070 struct lo_cred old = {};
7c6b6602 1071
25dae28c
SH
1072 if (!is_safe_path_component(name)) {
1073 fuse_reply_err(req, EINVAL);
1074 return;
1075 }
1076
92fb57b8
SH
1077 dir = lo_inode(req, parent);
1078 if (!dir) {
1079 fuse_reply_err(req, EBADF);
1080 return;
1081 }
1082
7387863d 1083 saverr = ENOMEM;
7c6b6602 1084
929cfb7a
VG
1085 saverr = lo_change_cred(req, &old);
1086 if (saverr) {
1087 goto out;
1088 }
1089
7387863d 1090 res = mknod_wrapper(dir->fd, name, link, mode, rdev);
7c6b6602 1091
7387863d 1092 saverr = errno;
929cfb7a
VG
1093
1094 lo_restore_cred(&old);
1095
7387863d
DDAG
1096 if (res == -1) {
1097 goto out;
1098 }
7c6b6602 1099
7387863d
DDAG
1100 saverr = lo_do_lookup(req, parent, name, &e);
1101 if (saverr) {
1102 goto out;
1103 }
7c6b6602 1104
d240314a
EG
1105 fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent,
1106 name, (unsigned long long)e.ino);
7c6b6602 1107
7387863d 1108 fuse_reply_entry(req, &e);
c241aa94 1109 lo_inode_put(lo, &dir);
7387863d 1110 return;
7c6b6602
DDAG
1111
1112out:
c241aa94 1113 lo_inode_put(lo, &dir);
7387863d 1114 fuse_reply_err(req, saverr);
7c6b6602
DDAG
1115}
1116
7387863d
DDAG
1117static void lo_mknod(fuse_req_t req, fuse_ino_t parent, const char *name,
1118 mode_t mode, dev_t rdev)
7c6b6602 1119{
7387863d 1120 lo_mknod_symlink(req, parent, name, mode, rdev, NULL);
7c6b6602
DDAG
1121}
1122
1123static void lo_mkdir(fuse_req_t req, fuse_ino_t parent, const char *name,
7387863d 1124 mode_t mode)
7c6b6602 1125{
7387863d 1126 lo_mknod_symlink(req, parent, name, S_IFDIR | mode, 0, NULL);
7c6b6602
DDAG
1127}
1128
7387863d
DDAG
1129static void lo_symlink(fuse_req_t req, const char *link, fuse_ino_t parent,
1130 const char *name)
7c6b6602 1131{
7387863d 1132 lo_mknod_symlink(req, parent, name, S_IFLNK, 0, link);
7c6b6602
DDAG
1133}
1134
5fe319a7
MS
1135static int linkat_empty_nofollow(struct lo_data *lo, struct lo_inode *inode,
1136 int dfd, const char *name)
7c6b6602 1137{
7387863d 1138 int res;
5fe319a7
MS
1139 struct lo_inode *parent;
1140 char path[PATH_MAX];
7c6b6602 1141
7387863d
DDAG
1142 if (inode->is_symlink) {
1143 res = linkat(inode->fd, "", dfd, name, AT_EMPTY_PATH);
1144 if (res == -1 && (errno == ENOENT || errno == EINVAL)) {
1145 /* Sorry, no race free way to hard-link a symlink. */
5fe319a7
MS
1146 if (lo->norace) {
1147 errno = EPERM;
1148 } else {
1149 goto fallback;
1150 }
7387863d
DDAG
1151 }
1152 return res;
1153 }
7c6b6602 1154
9f59d175 1155 sprintf(path, "%i", inode->fd);
5fe319a7 1156
9f59d175 1157 return linkat(lo->proc_self_fd, path, dfd, name, AT_SYMLINK_FOLLOW);
5fe319a7
MS
1158
1159fallback:
1160 res = lo_parent_and_name(lo, inode, path, &parent);
1161 if (res != -1) {
1162 res = linkat(parent->fd, path, dfd, name, 0);
95d27157 1163 unref_inode_lolocked(lo, parent, 1);
c241aa94 1164 lo_inode_put(lo, &parent);
5fe319a7 1165 }
7c6b6602 1166
5fe319a7 1167 return res;
7c6b6602
DDAG
1168}
1169
1170static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent,
7387863d 1171 const char *name)
7c6b6602 1172{
7387863d
DDAG
1173 int res;
1174 struct lo_data *lo = lo_data(req);
c241aa94 1175 struct lo_inode *parent_inode;
92fb57b8 1176 struct lo_inode *inode;
7387863d
DDAG
1177 struct fuse_entry_param e;
1178 int saverr;
1179
25dae28c
SH
1180 if (!is_safe_path_component(name)) {
1181 fuse_reply_err(req, EINVAL);
1182 return;
1183 }
1184
c241aa94 1185 parent_inode = lo_inode(req, parent);
92fb57b8 1186 inode = lo_inode(req, ino);
c241aa94
SH
1187 if (!parent_inode || !inode) {
1188 errno = EBADF;
1189 goto out_err;
92fb57b8
SH
1190 }
1191
7387863d
DDAG
1192 memset(&e, 0, sizeof(struct fuse_entry_param));
1193 e.attr_timeout = lo->timeout;
1194 e.entry_timeout = lo->timeout;
1195
c241aa94 1196 res = linkat_empty_nofollow(lo, inode, parent_inode->fd, name);
7387863d
DDAG
1197 if (res == -1) {
1198 goto out_err;
1199 }
1200
1201 res = fstatat(inode->fd, "", &e.attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
1202 if (res == -1) {
1203 goto out_err;
1204 }
1205
1206 pthread_mutex_lock(&lo->mutex);
1222f015 1207 inode->nlookup++;
7387863d 1208 pthread_mutex_unlock(&lo->mutex);
92fb57b8 1209 e.ino = inode->fuse_ino;
7387863d 1210
d240314a
EG
1211 fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent,
1212 name, (unsigned long long)e.ino);
7387863d
DDAG
1213
1214 fuse_reply_entry(req, &e);
c241aa94
SH
1215 lo_inode_put(lo, &parent_inode);
1216 lo_inode_put(lo, &inode);
7387863d 1217 return;
7c6b6602
DDAG
1218
1219out_err:
7387863d 1220 saverr = errno;
c241aa94
SH
1221 lo_inode_put(lo, &parent_inode);
1222 lo_inode_put(lo, &inode);
7387863d 1223 fuse_reply_err(req, saverr);
7c6b6602
DDAG
1224}
1225
c241aa94 1226/* Increments nlookup and caller must release refcount using lo_inode_put() */
9257e514
MS
1227static struct lo_inode *lookup_name(fuse_req_t req, fuse_ino_t parent,
1228 const char *name)
1229{
1230 int res;
1231 struct stat attr;
1232
1233 res = fstatat(lo_fd(req, parent), name, &attr,
1234 AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
1235 if (res == -1) {
1236 return NULL;
1237 }
1238
1239 return lo_find(lo_data(req), &attr);
1240}
1241
7c6b6602
DDAG
1242static void lo_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name)
1243{
7387863d 1244 int res;
9257e514
MS
1245 struct lo_inode *inode;
1246 struct lo_data *lo = lo_data(req);
1247
25dae28c
SH
1248 if (!is_safe_path_component(name)) {
1249 fuse_reply_err(req, EINVAL);
1250 return;
1251 }
7c6b6602 1252
9257e514
MS
1253 inode = lookup_name(req, parent, name);
1254 if (!inode) {
1255 fuse_reply_err(req, EIO);
1256 return;
1257 }
1258
7387863d 1259 res = unlinkat(lo_fd(req, parent), name, AT_REMOVEDIR);
7c6b6602 1260
7387863d 1261 fuse_reply_err(req, res == -1 ? errno : 0);
9257e514 1262 unref_inode_lolocked(lo, inode, 1);
c241aa94 1263 lo_inode_put(lo, &inode);
7c6b6602
DDAG
1264}
1265
1266static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name,
7387863d
DDAG
1267 fuse_ino_t newparent, const char *newname,
1268 unsigned int flags)
7c6b6602 1269{
7387863d 1270 int res;
c241aa94
SH
1271 struct lo_inode *parent_inode;
1272 struct lo_inode *newparent_inode;
1273 struct lo_inode *oldinode = NULL;
1274 struct lo_inode *newinode = NULL;
9257e514 1275 struct lo_data *lo = lo_data(req);
7c6b6602 1276
25dae28c
SH
1277 if (!is_safe_path_component(name) || !is_safe_path_component(newname)) {
1278 fuse_reply_err(req, EINVAL);
1279 return;
1280 }
1281
c241aa94
SH
1282 parent_inode = lo_inode(req, parent);
1283 newparent_inode = lo_inode(req, newparent);
1284 if (!parent_inode || !newparent_inode) {
1285 fuse_reply_err(req, EBADF);
1286 goto out;
1287 }
1288
9257e514
MS
1289 oldinode = lookup_name(req, parent, name);
1290 newinode = lookup_name(req, newparent, newname);
1291
1292 if (!oldinode) {
1293 fuse_reply_err(req, EIO);
1294 goto out;
1295 }
1296
7387863d 1297 if (flags) {
f0ab7d6f 1298#ifndef SYS_renameat2
7387863d 1299 fuse_reply_err(req, EINVAL);
f0ab7d6f 1300#else
c241aa94
SH
1301 res = syscall(SYS_renameat2, parent_inode->fd, name,
1302 newparent_inode->fd, newname, flags);
f0ab7d6f
MS
1303 if (res == -1 && errno == ENOSYS) {
1304 fuse_reply_err(req, EINVAL);
1305 } else {
1306 fuse_reply_err(req, res == -1 ? errno : 0);
1307 }
1308#endif
9257e514 1309 goto out;
7387863d 1310 }
7c6b6602 1311
c241aa94 1312 res = renameat(parent_inode->fd, name, newparent_inode->fd, newname);
7c6b6602 1313
7387863d 1314 fuse_reply_err(req, res == -1 ? errno : 0);
9257e514
MS
1315out:
1316 unref_inode_lolocked(lo, oldinode, 1);
1317 unref_inode_lolocked(lo, newinode, 1);
c241aa94
SH
1318 lo_inode_put(lo, &oldinode);
1319 lo_inode_put(lo, &newinode);
1320 lo_inode_put(lo, &parent_inode);
1321 lo_inode_put(lo, &newparent_inode);
7c6b6602
DDAG
1322}
1323
1324static void lo_unlink(fuse_req_t req, fuse_ino_t parent, const char *name)
1325{
7387863d 1326 int res;
9257e514
MS
1327 struct lo_inode *inode;
1328 struct lo_data *lo = lo_data(req);
7c6b6602 1329
25dae28c
SH
1330 if (!is_safe_path_component(name)) {
1331 fuse_reply_err(req, EINVAL);
1332 return;
1333 }
1334
9257e514
MS
1335 inode = lookup_name(req, parent, name);
1336 if (!inode) {
1337 fuse_reply_err(req, EIO);
1338 return;
1339 }
1340
7387863d 1341 res = unlinkat(lo_fd(req, parent), name, 0);
7c6b6602 1342
7387863d 1343 fuse_reply_err(req, res == -1 ? errno : 0);
9257e514 1344 unref_inode_lolocked(lo, inode, 1);
c241aa94 1345 lo_inode_put(lo, &inode);
7c6b6602
DDAG
1346}
1347
fe4c1579
DDAG
1348/* To be called with lo->mutex held */
1349static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n)
7c6b6602 1350{
7387863d
DDAG
1351 if (!inode) {
1352 return;
1353 }
1354
1222f015
SH
1355 assert(inode->nlookup >= n);
1356 inode->nlookup -= n;
1357 if (!inode->nlookup) {
92fb57b8 1358 lo_map_remove(&lo->ino_map, inode->fuse_ino);
bfc50a6e 1359 g_hash_table_remove(lo->inodes, &inode->key);
0e81414c
VG
1360 if (g_hash_table_size(inode->posix_locks)) {
1361 fuse_log(FUSE_LOG_WARNING, "Hash table is not empty\n");
1362 }
1363 g_hash_table_destroy(inode->posix_locks);
1364 pthread_mutex_destroy(&inode->plock_mutex);
c241aa94
SH
1365
1366 /* Drop our refcount from lo_do_lookup() */
1367 lo_inode_put(lo, &inode);
7387863d 1368 }
7c6b6602
DDAG
1369}
1370
fe4c1579
DDAG
1371static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode,
1372 uint64_t n)
1373{
1374 if (!inode) {
1375 return;
1376 }
1377
1378 pthread_mutex_lock(&lo->mutex);
1379 unref_inode(lo, inode, n);
1380 pthread_mutex_unlock(&lo->mutex);
1381}
1382
7c6b6602
DDAG
1383static void lo_forget_one(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup)
1384{
7387863d 1385 struct lo_data *lo = lo_data(req);
92fb57b8
SH
1386 struct lo_inode *inode;
1387
1388 inode = lo_inode(req, ino);
1389 if (!inode) {
1390 return;
1391 }
7c6b6602 1392
d240314a 1393 fuse_log(FUSE_LOG_DEBUG, " forget %lli %lli -%lli\n",
1222f015 1394 (unsigned long long)ino, (unsigned long long)inode->nlookup,
d240314a 1395 (unsigned long long)nlookup);
7c6b6602 1396
95d27157 1397 unref_inode_lolocked(lo, inode, nlookup);
c241aa94 1398 lo_inode_put(lo, &inode);
7c6b6602
DDAG
1399}
1400
1401static void lo_forget(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup)
1402{
7387863d
DDAG
1403 lo_forget_one(req, ino, nlookup);
1404 fuse_reply_none(req);
7c6b6602
DDAG
1405}
1406
1407static void lo_forget_multi(fuse_req_t req, size_t count,
7387863d 1408 struct fuse_forget_data *forgets)
7c6b6602 1409{
7387863d 1410 int i;
7c6b6602 1411
7387863d
DDAG
1412 for (i = 0; i < count; i++) {
1413 lo_forget_one(req, forgets[i].ino, forgets[i].nlookup);
1414 }
1415 fuse_reply_none(req);
7c6b6602
DDAG
1416}
1417
1418static void lo_readlink(fuse_req_t req, fuse_ino_t ino)
1419{
7387863d
DDAG
1420 char buf[PATH_MAX + 1];
1421 int res;
7c6b6602 1422
7387863d
DDAG
1423 res = readlinkat(lo_fd(req, ino), "", buf, sizeof(buf));
1424 if (res == -1) {
1425 return (void)fuse_reply_err(req, errno);
1426 }
7c6b6602 1427
7387863d
DDAG
1428 if (res == sizeof(buf)) {
1429 return (void)fuse_reply_err(req, ENAMETOOLONG);
1430 }
7c6b6602 1431
7387863d 1432 buf[res] = '\0';
7c6b6602 1433
7387863d 1434 fuse_reply_readlink(req, buf);
7c6b6602
DDAG
1435}
1436
1437struct lo_dirp {
acefdde7 1438 gint refcount;
7387863d
DDAG
1439 DIR *dp;
1440 struct dirent *entry;
1441 off_t offset;
7c6b6602
DDAG
1442};
1443
acefdde7
SH
1444static void lo_dirp_put(struct lo_dirp **dp)
1445{
1446 struct lo_dirp *d = *dp;
1447
1448 if (!d) {
1449 return;
1450 }
1451 *dp = NULL;
1452
1453 if (g_atomic_int_dec_and_test(&d->refcount)) {
1454 closedir(d->dp);
1455 free(d);
1456 }
1457}
1458
1459/* Call lo_dirp_put() on the return value when no longer needed */
b39bce12 1460static struct lo_dirp *lo_dirp(fuse_req_t req, struct fuse_file_info *fi)
7c6b6602 1461{
b39bce12
SH
1462 struct lo_data *lo = lo_data(req);
1463 struct lo_map_elem *elem;
1464
1465 pthread_mutex_lock(&lo->mutex);
1466 elem = lo_map_get(&lo->dirp_map, fi->fh);
acefdde7
SH
1467 if (elem) {
1468 g_atomic_int_inc(&elem->dirp->refcount);
1469 }
b39bce12
SH
1470 pthread_mutex_unlock(&lo->mutex);
1471 if (!elem) {
1472 return NULL;
1473 }
1474
1475 return elem->dirp;
7c6b6602
DDAG
1476}
1477
7387863d
DDAG
1478static void lo_opendir(fuse_req_t req, fuse_ino_t ino,
1479 struct fuse_file_info *fi)
7c6b6602 1480{
7387863d
DDAG
1481 int error = ENOMEM;
1482 struct lo_data *lo = lo_data(req);
1483 struct lo_dirp *d;
1484 int fd;
b39bce12 1485 ssize_t fh;
7387863d
DDAG
1486
1487 d = calloc(1, sizeof(struct lo_dirp));
1488 if (d == NULL) {
1489 goto out_err;
1490 }
1491
1492 fd = openat(lo_fd(req, ino), ".", O_RDONLY);
1493 if (fd == -1) {
1494 goto out_errno;
1495 }
1496
1497 d->dp = fdopendir(fd);
1498 if (d->dp == NULL) {
1499 goto out_errno;
1500 }
1501
1502 d->offset = 0;
1503 d->entry = NULL;
1504
acefdde7 1505 g_atomic_int_set(&d->refcount, 1); /* paired with lo_releasedir() */
b39bce12
SH
1506 pthread_mutex_lock(&lo->mutex);
1507 fh = lo_add_dirp_mapping(req, d);
1508 pthread_mutex_unlock(&lo->mutex);
1509 if (fh == -1) {
1510 goto out_err;
1511 }
1512
1513 fi->fh = fh;
7387863d 1514 if (lo->cache == CACHE_ALWAYS) {
9b610b09 1515 fi->cache_readdir = 1;
7387863d
DDAG
1516 }
1517 fuse_reply_open(req, fi);
1518 return;
7c6b6602
DDAG
1519
1520out_errno:
7387863d 1521 error = errno;
7c6b6602 1522out_err:
7387863d 1523 if (d) {
b39bce12
SH
1524 if (d->dp) {
1525 closedir(d->dp);
1526 }
7387863d
DDAG
1527 if (fd != -1) {
1528 close(fd);
1529 }
1530 free(d);
1531 }
1532 fuse_reply_err(req, error);
7c6b6602
DDAG
1533}
1534
7c6b6602 1535static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size,
7387863d 1536 off_t offset, struct fuse_file_info *fi, int plus)
7c6b6602 1537{
752272da 1538 struct lo_data *lo = lo_data(req);
acefdde7 1539 struct lo_dirp *d = NULL;
752272da 1540 struct lo_inode *dinode;
b39bce12 1541 char *buf = NULL;
7387863d
DDAG
1542 char *p;
1543 size_t rem = size;
752272da 1544 int err = EBADF;
7387863d 1545
752272da
SH
1546 dinode = lo_inode(req, ino);
1547 if (!dinode) {
1548 goto error;
1549 }
7387863d 1550
b39bce12
SH
1551 d = lo_dirp(req, fi);
1552 if (!d) {
1553 goto error;
1554 }
1555
752272da 1556 err = ENOMEM;
7387863d
DDAG
1557 buf = calloc(1, size);
1558 if (!buf) {
7387863d
DDAG
1559 goto error;
1560 }
1561 p = buf;
1562
1563 if (offset != d->offset) {
1564 seekdir(d->dp, offset);
1565 d->entry = NULL;
1566 d->offset = offset;
1567 }
1568 while (1) {
1569 size_t entsize;
1570 off_t nextoff;
1571 const char *name;
1572
1573 if (!d->entry) {
1574 errno = 0;
1575 d->entry = readdir(d->dp);
1576 if (!d->entry) {
1577 if (errno) { /* Error */
1578 err = errno;
1579 goto error;
1580 } else { /* End of stream */
1581 break;
1582 }
1583 }
1584 }
1585 nextoff = d->entry->d_off;
1586 name = d->entry->d_name;
752272da 1587
7387863d 1588 fuse_ino_t entry_ino = 0;
752272da
SH
1589 struct fuse_entry_param e = (struct fuse_entry_param){
1590 .attr.st_ino = d->entry->d_ino,
1591 .attr.st_mode = d->entry->d_type << 12,
1592 };
1593
1594 /* Hide root's parent directory */
1595 if (dinode == &lo->root && strcmp(name, "..") == 0) {
bfc50a6e 1596 e.attr.st_ino = lo->root.key.ino;
752272da
SH
1597 e.attr.st_mode = DT_DIR << 12;
1598 }
1599
7387863d 1600 if (plus) {
752272da 1601 if (!is_dot_or_dotdot(name)) {
7387863d
DDAG
1602 err = lo_do_lookup(req, ino, name, &e);
1603 if (err) {
1604 goto error;
1605 }
1606 entry_ino = e.ino;
1607 }
1608
1609 entsize = fuse_add_direntry_plus(req, p, rem, name, &e, nextoff);
1610 } else {
752272da 1611 entsize = fuse_add_direntry(req, p, rem, name, &e.attr, nextoff);
7387863d
DDAG
1612 }
1613 if (entsize > rem) {
1614 if (entry_ino != 0) {
1615 lo_forget_one(req, entry_ino, 1);
1616 }
1617 break;
1618 }
1619
1620 p += entsize;
1621 rem -= entsize;
1622
1623 d->entry = NULL;
1624 d->offset = nextoff;
1625 }
7c6b6602
DDAG
1626
1627 err = 0;
1628error:
acefdde7 1629 lo_dirp_put(&d);
c241aa94 1630 lo_inode_put(lo, &dinode);
acefdde7 1631
7387863d
DDAG
1632 /*
1633 * If there's an error, we can only signal it if we haven't stored
1634 * any entries yet - otherwise we'd end up with wrong lookup
1635 * counts for the entries that are already in the buffer. So we
1636 * return what we've collected until that point.
1637 */
1638 if (err && rem == size) {
1639 fuse_reply_err(req, err);
1640 } else {
1641 fuse_reply_buf(req, buf, size - rem);
1642 }
7c6b6602
DDAG
1643 free(buf);
1644}
1645
1646static void lo_readdir(fuse_req_t req, fuse_ino_t ino, size_t size,
7387863d 1647 off_t offset, struct fuse_file_info *fi)
7c6b6602 1648{
7387863d 1649 lo_do_readdir(req, ino, size, offset, fi, 0);
7c6b6602
DDAG
1650}
1651
1652static void lo_readdirplus(fuse_req_t req, fuse_ino_t ino, size_t size,
7387863d 1653 off_t offset, struct fuse_file_info *fi)
7c6b6602 1654{
7387863d 1655 lo_do_readdir(req, ino, size, offset, fi, 1);
7c6b6602
DDAG
1656}
1657
7387863d
DDAG
1658static void lo_releasedir(fuse_req_t req, fuse_ino_t ino,
1659 struct fuse_file_info *fi)
7c6b6602 1660{
b39bce12 1661 struct lo_data *lo = lo_data(req);
acefdde7 1662 struct lo_map_elem *elem;
b39bce12
SH
1663 struct lo_dirp *d;
1664
7387863d 1665 (void)ino;
b39bce12 1666
acefdde7
SH
1667 pthread_mutex_lock(&lo->mutex);
1668 elem = lo_map_get(&lo->dirp_map, fi->fh);
1669 if (!elem) {
1670 pthread_mutex_unlock(&lo->mutex);
b39bce12
SH
1671 fuse_reply_err(req, EBADF);
1672 return;
1673 }
1674
acefdde7 1675 d = elem->dirp;
b39bce12
SH
1676 lo_map_remove(&lo->dirp_map, fi->fh);
1677 pthread_mutex_unlock(&lo->mutex);
1678
acefdde7
SH
1679 lo_dirp_put(&d); /* paired with lo_opendir() */
1680
7387863d 1681 fuse_reply_err(req, 0);
7c6b6602
DDAG
1682}
1683
8e4e41e3
MT
1684static void update_open_flags(int writeback, struct fuse_file_info *fi)
1685{
1686 /*
1687 * With writeback cache, kernel may send read requests even
1688 * when userspace opened write-only
1689 */
1690 if (writeback && (fi->flags & O_ACCMODE) == O_WRONLY) {
1691 fi->flags &= ~O_ACCMODE;
1692 fi->flags |= O_RDWR;
1693 }
1694
1695 /*
1696 * With writeback cache, O_APPEND is handled by the kernel.
1697 * This breaks atomicity (since the file may change in the
1698 * underlying filesystem, so that the kernel's idea of the
1699 * end of the file isn't accurate anymore). In this example,
1700 * we just accept that. A more rigorous filesystem may want
1701 * to return an error here
1702 */
1703 if (writeback && (fi->flags & O_APPEND)) {
1704 fi->flags &= ~O_APPEND;
1705 }
1706
1707 /*
1708 * O_DIRECT in guest should not necessarily mean bypassing page
1709 * cache on host as well. If somebody needs that behavior, it
1710 * probably should be a configuration knob in daemon.
1711 */
1712 fi->flags &= ~O_DIRECT;
1713}
1714
7c6b6602 1715static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name,
7387863d 1716 mode_t mode, struct fuse_file_info *fi)
7c6b6602 1717{
7387863d
DDAG
1718 int fd;
1719 struct lo_data *lo = lo_data(req);
c241aa94 1720 struct lo_inode *parent_inode;
7387863d
DDAG
1721 struct fuse_entry_param e;
1722 int err;
929cfb7a 1723 struct lo_cred old = {};
7387863d 1724
d240314a
EG
1725 fuse_log(FUSE_LOG_DEBUG, "lo_create(parent=%" PRIu64 ", name=%s)\n", parent,
1726 name);
7387863d 1727
25dae28c
SH
1728 if (!is_safe_path_component(name)) {
1729 fuse_reply_err(req, EINVAL);
1730 return;
1731 }
1732
c241aa94
SH
1733 parent_inode = lo_inode(req, parent);
1734 if (!parent_inode) {
1735 fuse_reply_err(req, EBADF);
1736 return;
1737 }
1738
929cfb7a
VG
1739 err = lo_change_cred(req, &old);
1740 if (err) {
1741 goto out;
1742 }
1743
8e4e41e3 1744 update_open_flags(lo->writeback, fi);
65da4539 1745
c241aa94 1746 fd = openat(parent_inode->fd, name, (fi->flags | O_CREAT) & ~O_NOFOLLOW,
7387863d 1747 mode);
929cfb7a
VG
1748 err = fd == -1 ? errno : 0;
1749 lo_restore_cred(&old);
7387863d 1750
929cfb7a 1751 if (!err) {
73b4d19d
SH
1752 ssize_t fh;
1753
1754 pthread_mutex_lock(&lo->mutex);
1755 fh = lo_add_fd_mapping(req, fd);
1756 pthread_mutex_unlock(&lo->mutex);
1757 if (fh == -1) {
1758 close(fd);
c241aa94
SH
1759 err = ENOMEM;
1760 goto out;
73b4d19d
SH
1761 }
1762
1763 fi->fh = fh;
929cfb7a
VG
1764 err = lo_do_lookup(req, parent, name, &e);
1765 }
230e777b 1766 if (lo->cache == CACHE_NONE) {
7387863d
DDAG
1767 fi->direct_io = 1;
1768 } else if (lo->cache == CACHE_ALWAYS) {
1769 fi->keep_cache = 1;
1770 }
1771
929cfb7a 1772out:
c241aa94
SH
1773 lo_inode_put(lo, &parent_inode);
1774
7387863d
DDAG
1775 if (err) {
1776 fuse_reply_err(req, err);
1777 } else {
1778 fuse_reply_create(req, &e, fi);
1779 }
7c6b6602
DDAG
1780}
1781
0e81414c
VG
1782/* Should be called with inode->plock_mutex held */
1783static struct lo_inode_plock *lookup_create_plock_ctx(struct lo_data *lo,
1784 struct lo_inode *inode,
1785 uint64_t lock_owner,
1786 pid_t pid, int *err)
1787{
1788 struct lo_inode_plock *plock;
1789 char procname[64];
1790 int fd;
1791
1792 plock =
1793 g_hash_table_lookup(inode->posix_locks, GUINT_TO_POINTER(lock_owner));
1794
1795 if (plock) {
1796 return plock;
1797 }
1798
1799 plock = malloc(sizeof(struct lo_inode_plock));
1800 if (!plock) {
1801 *err = ENOMEM;
1802 return NULL;
1803 }
1804
1805 /* Open another instance of file which can be used for ofd locks. */
1806 sprintf(procname, "%i", inode->fd);
1807
1808 /* TODO: What if file is not writable? */
1809 fd = openat(lo->proc_self_fd, procname, O_RDWR);
1810 if (fd == -1) {
1811 *err = errno;
1812 free(plock);
1813 return NULL;
1814 }
1815
1816 plock->lock_owner = lock_owner;
1817 plock->fd = fd;
1818 g_hash_table_insert(inode->posix_locks, GUINT_TO_POINTER(plock->lock_owner),
1819 plock);
1820 return plock;
1821}
1822
1823static void lo_getlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
1824 struct flock *lock)
1825{
1826 struct lo_data *lo = lo_data(req);
1827 struct lo_inode *inode;
1828 struct lo_inode_plock *plock;
1829 int ret, saverr = 0;
1830
1831 fuse_log(FUSE_LOG_DEBUG,
1832 "lo_getlk(ino=%" PRIu64 ", flags=%d)"
1833 " owner=0x%lx, l_type=%d l_start=0x%lx"
1834 " l_len=0x%lx\n",
1835 ino, fi->flags, fi->lock_owner, lock->l_type, lock->l_start,
1836 lock->l_len);
1837
1838 inode = lo_inode(req, ino);
1839 if (!inode) {
1840 fuse_reply_err(req, EBADF);
1841 return;
1842 }
1843
1844 pthread_mutex_lock(&inode->plock_mutex);
1845 plock =
1846 lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret);
1847 if (!plock) {
c241aa94
SH
1848 saverr = ret;
1849 goto out;
0e81414c
VG
1850 }
1851
1852 ret = fcntl(plock->fd, F_OFD_GETLK, lock);
1853 if (ret == -1) {
1854 saverr = errno;
1855 }
c241aa94
SH
1856
1857out:
0e81414c 1858 pthread_mutex_unlock(&inode->plock_mutex);
c241aa94 1859 lo_inode_put(lo, &inode);
0e81414c
VG
1860
1861 if (saverr) {
1862 fuse_reply_err(req, saverr);
1863 } else {
1864 fuse_reply_lock(req, lock);
1865 }
1866}
1867
1868static void lo_setlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
1869 struct flock *lock, int sleep)
1870{
1871 struct lo_data *lo = lo_data(req);
1872 struct lo_inode *inode;
1873 struct lo_inode_plock *plock;
1874 int ret, saverr = 0;
1875
1876 fuse_log(FUSE_LOG_DEBUG,
1877 "lo_setlk(ino=%" PRIu64 ", flags=%d)"
1878 " cmd=%d pid=%d owner=0x%lx sleep=%d l_whence=%d"
1879 " l_start=0x%lx l_len=0x%lx\n",
1880 ino, fi->flags, lock->l_type, lock->l_pid, fi->lock_owner, sleep,
1881 lock->l_whence, lock->l_start, lock->l_len);
1882
1883 if (sleep) {
1884 fuse_reply_err(req, EOPNOTSUPP);
1885 return;
1886 }
1887
1888 inode = lo_inode(req, ino);
1889 if (!inode) {
1890 fuse_reply_err(req, EBADF);
1891 return;
1892 }
1893
1894 pthread_mutex_lock(&inode->plock_mutex);
1895 plock =
1896 lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret);
1897
1898 if (!plock) {
c241aa94
SH
1899 saverr = ret;
1900 goto out;
0e81414c
VG
1901 }
1902
1903 /* TODO: Is it alright to modify flock? */
1904 lock->l_pid = 0;
1905 ret = fcntl(plock->fd, F_OFD_SETLK, lock);
1906 if (ret == -1) {
1907 saverr = errno;
1908 }
c241aa94
SH
1909
1910out:
0e81414c 1911 pthread_mutex_unlock(&inode->plock_mutex);
c241aa94
SH
1912 lo_inode_put(lo, &inode);
1913
0e81414c
VG
1914 fuse_reply_err(req, saverr);
1915}
1916
7c6b6602 1917static void lo_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync,
7387863d 1918 struct fuse_file_info *fi)
7c6b6602 1919{
7387863d 1920 int res;
b39bce12
SH
1921 struct lo_dirp *d;
1922 int fd;
1923
7387863d 1924 (void)ino;
b39bce12
SH
1925
1926 d = lo_dirp(req, fi);
1927 if (!d) {
1928 fuse_reply_err(req, EBADF);
1929 return;
1930 }
1931
1932 fd = dirfd(d->dp);
7387863d
DDAG
1933 if (datasync) {
1934 res = fdatasync(fd);
1935 } else {
1936 res = fsync(fd);
1937 }
acefdde7
SH
1938
1939 lo_dirp_put(&d);
1940
7387863d 1941 fuse_reply_err(req, res == -1 ? errno : 0);
7c6b6602
DDAG
1942}
1943
1944static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
1945{
7387863d 1946 int fd;
73b4d19d 1947 ssize_t fh;
7387863d
DDAG
1948 char buf[64];
1949 struct lo_data *lo = lo_data(req);
1950
d240314a
EG
1951 fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d)\n", ino,
1952 fi->flags);
7387863d 1953
8e4e41e3 1954 update_open_flags(lo->writeback, fi);
65da4539 1955
9f59d175
SH
1956 sprintf(buf, "%i", lo_fd(req, ino));
1957 fd = openat(lo->proc_self_fd, buf, fi->flags & ~O_NOFOLLOW);
7387863d
DDAG
1958 if (fd == -1) {
1959 return (void)fuse_reply_err(req, errno);
1960 }
1961
73b4d19d
SH
1962 pthread_mutex_lock(&lo->mutex);
1963 fh = lo_add_fd_mapping(req, fd);
1964 pthread_mutex_unlock(&lo->mutex);
1965 if (fh == -1) {
1966 close(fd);
1967 fuse_reply_err(req, ENOMEM);
1968 return;
1969 }
1970
1971 fi->fh = fh;
230e777b 1972 if (lo->cache == CACHE_NONE) {
7387863d
DDAG
1973 fi->direct_io = 1;
1974 } else if (lo->cache == CACHE_ALWAYS) {
1975 fi->keep_cache = 1;
1976 }
1977 fuse_reply_open(req, fi);
7c6b6602
DDAG
1978}
1979
7387863d
DDAG
1980static void lo_release(fuse_req_t req, fuse_ino_t ino,
1981 struct fuse_file_info *fi)
7c6b6602 1982{
73b4d19d 1983 struct lo_data *lo = lo_data(req);
baed65c0
SH
1984 struct lo_map_elem *elem;
1985 int fd = -1;
73b4d19d 1986
7387863d 1987 (void)ino;
7c6b6602 1988
73b4d19d 1989 pthread_mutex_lock(&lo->mutex);
baed65c0
SH
1990 elem = lo_map_get(&lo->fd_map, fi->fh);
1991 if (elem) {
1992 fd = elem->fd;
1993 elem = NULL;
1994 lo_map_remove(&lo->fd_map, fi->fh);
1995 }
73b4d19d
SH
1996 pthread_mutex_unlock(&lo->mutex);
1997
1998 close(fd);
7387863d 1999 fuse_reply_err(req, 0);
7c6b6602
DDAG
2000}
2001
2002static void lo_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
2003{
7387863d
DDAG
2004 int res;
2005 (void)ino;
0e81414c
VG
2006 struct lo_inode *inode;
2007
2008 inode = lo_inode(req, ino);
2009 if (!inode) {
2010 fuse_reply_err(req, EBADF);
2011 return;
2012 }
2013
2014 /* An fd is going away. Cleanup associated posix locks */
2015 pthread_mutex_lock(&inode->plock_mutex);
2016 g_hash_table_remove(inode->posix_locks, GUINT_TO_POINTER(fi->lock_owner));
2017 pthread_mutex_unlock(&inode->plock_mutex);
2018
73b4d19d 2019 res = close(dup(lo_fi_fd(req, fi)));
c241aa94 2020 lo_inode_put(lo_data(req), &inode);
7387863d 2021 fuse_reply_err(req, res == -1 ? errno : 0);
7c6b6602
DDAG
2022}
2023
2024static void lo_fsync(fuse_req_t req, fuse_ino_t ino, int datasync,
7387863d 2025 struct fuse_file_info *fi)
7c6b6602 2026{
7387863d 2027 int res;
1b209805
VG
2028 int fd;
2029 char *buf;
2030
2031 fuse_log(FUSE_LOG_DEBUG, "lo_fsync(ino=%" PRIu64 ", fi=0x%p)\n", ino,
2032 (void *)fi);
2033
2034 if (!fi) {
9f59d175
SH
2035 struct lo_data *lo = lo_data(req);
2036
2037 res = asprintf(&buf, "%i", lo_fd(req, ino));
1b209805
VG
2038 if (res == -1) {
2039 return (void)fuse_reply_err(req, errno);
2040 }
2041
9f59d175 2042 fd = openat(lo->proc_self_fd, buf, O_RDWR);
1b209805
VG
2043 free(buf);
2044 if (fd == -1) {
2045 return (void)fuse_reply_err(req, errno);
2046 }
2047 } else {
73b4d19d 2048 fd = lo_fi_fd(req, fi);
1b209805
VG
2049 }
2050
7387863d 2051 if (datasync) {
1b209805 2052 res = fdatasync(fd);
7387863d 2053 } else {
1b209805
VG
2054 res = fsync(fd);
2055 }
2056 if (!fi) {
2057 close(fd);
7387863d
DDAG
2058 }
2059 fuse_reply_err(req, res == -1 ? errno : 0);
7c6b6602
DDAG
2060}
2061
7387863d
DDAG
2062static void lo_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t offset,
2063 struct fuse_file_info *fi)
7c6b6602 2064{
7387863d 2065 struct fuse_bufvec buf = FUSE_BUFVEC_INIT(size);
7c6b6602 2066
d240314a
EG
2067 fuse_log(FUSE_LOG_DEBUG,
2068 "lo_read(ino=%" PRIu64 ", size=%zd, "
2069 "off=%lu)\n",
2070 ino, size, (unsigned long)offset);
7c6b6602 2071
7387863d 2072 buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK;
73b4d19d 2073 buf.buf[0].fd = lo_fi_fd(req, fi);
7387863d 2074 buf.buf[0].pos = offset;
7c6b6602 2075
8c3fe75e 2076 fuse_reply_data(req, &buf);
7c6b6602
DDAG
2077}
2078
2079static void lo_write_buf(fuse_req_t req, fuse_ino_t ino,
7387863d
DDAG
2080 struct fuse_bufvec *in_buf, off_t off,
2081 struct fuse_file_info *fi)
7c6b6602 2082{
7387863d
DDAG
2083 (void)ino;
2084 ssize_t res;
2085 struct fuse_bufvec out_buf = FUSE_BUFVEC_INIT(fuse_buf_size(in_buf));
ee884652 2086 bool cap_fsetid_dropped = false;
7387863d
DDAG
2087
2088 out_buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK;
73b4d19d 2089 out_buf.buf[0].fd = lo_fi_fd(req, fi);
7387863d
DDAG
2090 out_buf.buf[0].pos = off;
2091
d240314a
EG
2092 fuse_log(FUSE_LOG_DEBUG,
2093 "lo_write_buf(ino=%" PRIu64 ", size=%zd, off=%lu)\n", ino,
2094 out_buf.buf[0].size, (unsigned long)off);
7387863d 2095
ee884652
VG
2096 /*
2097 * If kill_priv is set, drop CAP_FSETID which should lead to kernel
2098 * clearing setuid/setgid on file.
2099 */
2100 if (fi->kill_priv) {
2101 res = drop_effective_cap("FSETID", &cap_fsetid_dropped);
2102 if (res != 0) {
2103 fuse_reply_err(req, res);
2104 return;
2105 }
2106 }
2107
8c3fe75e 2108 res = fuse_buf_copy(&out_buf, in_buf);
7387863d
DDAG
2109 if (res < 0) {
2110 fuse_reply_err(req, -res);
2111 } else {
2112 fuse_reply_write(req, (size_t)res);
2113 }
ee884652
VG
2114
2115 if (cap_fsetid_dropped) {
2116 res = gain_effective_cap("FSETID");
2117 if (res) {
2118 fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n");
2119 }
2120 }
7c6b6602
DDAG
2121}
2122
2123static void lo_statfs(fuse_req_t req, fuse_ino_t ino)
2124{
7387863d
DDAG
2125 int res;
2126 struct statvfs stbuf;
2127
2128 res = fstatvfs(lo_fd(req, ino), &stbuf);
2129 if (res == -1) {
2130 fuse_reply_err(req, errno);
2131 } else {
2132 fuse_reply_statfs(req, &stbuf);
2133 }
7c6b6602
DDAG
2134}
2135
7387863d
DDAG
2136static void lo_fallocate(fuse_req_t req, fuse_ino_t ino, int mode, off_t offset,
2137 off_t length, struct fuse_file_info *fi)
7c6b6602 2138{
7387863d
DDAG
2139 int err = EOPNOTSUPP;
2140 (void)ino;
7c6b6602 2141
9776457c 2142#ifdef CONFIG_FALLOCATE
73b4d19d 2143 err = fallocate(lo_fi_fd(req, fi), mode, offset, length);
7387863d
DDAG
2144 if (err < 0) {
2145 err = errno;
2146 }
7c6b6602 2147
9776457c 2148#elif defined(CONFIG_POSIX_FALLOCATE)
7387863d
DDAG
2149 if (mode) {
2150 fuse_reply_err(req, EOPNOTSUPP);
2151 return;
2152 }
7c6b6602 2153
73b4d19d 2154 err = posix_fallocate(lo_fi_fd(req, fi), offset, length);
7c6b6602
DDAG
2155#endif
2156
7387863d 2157 fuse_reply_err(req, err);
7c6b6602
DDAG
2158}
2159
2160static void lo_flock(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
7387863d 2161 int op)
7c6b6602 2162{
7387863d
DDAG
2163 int res;
2164 (void)ino;
7c6b6602 2165
73b4d19d 2166 res = flock(lo_fi_fd(req, fi), op);
7c6b6602 2167
7387863d 2168 fuse_reply_err(req, res == -1 ? errno : 0);
7c6b6602
DDAG
2169}
2170
2171static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *name,
7387863d 2172 size_t size)
7c6b6602 2173{
9f59d175 2174 struct lo_data *lo = lo_data(req);
7387863d
DDAG
2175 char *value = NULL;
2176 char procname[64];
92fb57b8 2177 struct lo_inode *inode;
7387863d
DDAG
2178 ssize_t ret;
2179 int saverr;
9f59d175 2180 int fd = -1;
7387863d 2181
92fb57b8
SH
2182 inode = lo_inode(req, ino);
2183 if (!inode) {
2184 fuse_reply_err(req, EBADF);
2185 return;
2186 }
2187
7387863d
DDAG
2188 saverr = ENOSYS;
2189 if (!lo_data(req)->xattr) {
2190 goto out;
2191 }
2192
d240314a
EG
2193 fuse_log(FUSE_LOG_DEBUG, "lo_getxattr(ino=%" PRIu64 ", name=%s size=%zd)\n",
2194 ino, name, size);
7387863d
DDAG
2195
2196 if (inode->is_symlink) {
2197 /* Sorry, no race free way to getxattr on symlink. */
2198 saverr = EPERM;
2199 goto out;
2200 }
2201
9f59d175
SH
2202 sprintf(procname, "%i", inode->fd);
2203 fd = openat(lo->proc_self_fd, procname, O_RDONLY);
2204 if (fd < 0) {
2205 goto out_err;
2206 }
7387863d
DDAG
2207
2208 if (size) {
2209 value = malloc(size);
2210 if (!value) {
2211 goto out_err;
2212 }
2213
9f59d175 2214 ret = fgetxattr(fd, name, value, size);
7387863d
DDAG
2215 if (ret == -1) {
2216 goto out_err;
2217 }
2218 saverr = 0;
2219 if (ret == 0) {
2220 goto out;
2221 }
2222
2223 fuse_reply_buf(req, value, ret);
2224 } else {
9f59d175 2225 ret = fgetxattr(fd, name, NULL, 0);
7387863d
DDAG
2226 if (ret == -1) {
2227 goto out_err;
2228 }
2229
2230 fuse_reply_xattr(req, ret);
2231 }
7c6b6602 2232out_free:
7387863d 2233 free(value);
9f59d175
SH
2234
2235 if (fd >= 0) {
2236 close(fd);
2237 }
c241aa94
SH
2238
2239 lo_inode_put(lo, &inode);
7387863d 2240 return;
7c6b6602
DDAG
2241
2242out_err:
7387863d 2243 saverr = errno;
7c6b6602 2244out:
c241aa94 2245 lo_inode_put(lo, &inode);
7387863d
DDAG
2246 fuse_reply_err(req, saverr);
2247 goto out_free;
7c6b6602
DDAG
2248}
2249
2250static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size)
2251{
9f59d175 2252 struct lo_data *lo = lo_data(req);
7387863d
DDAG
2253 char *value = NULL;
2254 char procname[64];
92fb57b8 2255 struct lo_inode *inode;
7387863d
DDAG
2256 ssize_t ret;
2257 int saverr;
9f59d175 2258 int fd = -1;
7387863d 2259
92fb57b8
SH
2260 inode = lo_inode(req, ino);
2261 if (!inode) {
2262 fuse_reply_err(req, EBADF);
2263 return;
2264 }
2265
7387863d
DDAG
2266 saverr = ENOSYS;
2267 if (!lo_data(req)->xattr) {
2268 goto out;
2269 }
2270
d240314a
EG
2271 fuse_log(FUSE_LOG_DEBUG, "lo_listxattr(ino=%" PRIu64 ", size=%zd)\n", ino,
2272 size);
7387863d
DDAG
2273
2274 if (inode->is_symlink) {
2275 /* Sorry, no race free way to listxattr on symlink. */
2276 saverr = EPERM;
2277 goto out;
2278 }
2279
9f59d175
SH
2280 sprintf(procname, "%i", inode->fd);
2281 fd = openat(lo->proc_self_fd, procname, O_RDONLY);
2282 if (fd < 0) {
2283 goto out_err;
2284 }
7387863d
DDAG
2285
2286 if (size) {
2287 value = malloc(size);
2288 if (!value) {
2289 goto out_err;
2290 }
2291
9f59d175 2292 ret = flistxattr(fd, value, size);
7387863d
DDAG
2293 if (ret == -1) {
2294 goto out_err;
2295 }
2296 saverr = 0;
2297 if (ret == 0) {
2298 goto out;
2299 }
2300
2301 fuse_reply_buf(req, value, ret);
2302 } else {
9f59d175 2303 ret = flistxattr(fd, NULL, 0);
7387863d
DDAG
2304 if (ret == -1) {
2305 goto out_err;
2306 }
2307
2308 fuse_reply_xattr(req, ret);
2309 }
7c6b6602 2310out_free:
7387863d 2311 free(value);
9f59d175
SH
2312
2313 if (fd >= 0) {
2314 close(fd);
2315 }
c241aa94
SH
2316
2317 lo_inode_put(lo, &inode);
7387863d 2318 return;
7c6b6602
DDAG
2319
2320out_err:
7387863d 2321 saverr = errno;
7c6b6602 2322out:
c241aa94 2323 lo_inode_put(lo, &inode);
7387863d
DDAG
2324 fuse_reply_err(req, saverr);
2325 goto out_free;
7c6b6602
DDAG
2326}
2327
2328static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *name,
7387863d 2329 const char *value, size_t size, int flags)
7c6b6602 2330{
7387863d 2331 char procname[64];
9f59d175 2332 struct lo_data *lo = lo_data(req);
92fb57b8 2333 struct lo_inode *inode;
7387863d
DDAG
2334 ssize_t ret;
2335 int saverr;
9f59d175 2336 int fd = -1;
7c6b6602 2337
92fb57b8
SH
2338 inode = lo_inode(req, ino);
2339 if (!inode) {
2340 fuse_reply_err(req, EBADF);
2341 return;
2342 }
2343
7387863d
DDAG
2344 saverr = ENOSYS;
2345 if (!lo_data(req)->xattr) {
2346 goto out;
2347 }
7c6b6602 2348
d240314a
EG
2349 fuse_log(FUSE_LOG_DEBUG, "lo_setxattr(ino=%" PRIu64
2350 ", name=%s value=%s size=%zd)\n", ino, name, value, size);
7c6b6602 2351
7387863d
DDAG
2352 if (inode->is_symlink) {
2353 /* Sorry, no race free way to setxattr on symlink. */
2354 saverr = EPERM;
2355 goto out;
2356 }
7c6b6602 2357
9f59d175
SH
2358 sprintf(procname, "%i", inode->fd);
2359 fd = openat(lo->proc_self_fd, procname, O_RDWR);
2360 if (fd < 0) {
2361 saverr = errno;
2362 goto out;
2363 }
7c6b6602 2364
9f59d175 2365 ret = fsetxattr(fd, name, value, size, flags);
7387863d 2366 saverr = ret == -1 ? errno : 0;
7c6b6602
DDAG
2367
2368out:
9f59d175
SH
2369 if (fd >= 0) {
2370 close(fd);
2371 }
c241aa94
SH
2372
2373 lo_inode_put(lo, &inode);
7387863d 2374 fuse_reply_err(req, saverr);
7c6b6602
DDAG
2375}
2376
2377static void lo_removexattr(fuse_req_t req, fuse_ino_t ino, const char *name)
2378{
7387863d 2379 char procname[64];
9f59d175 2380 struct lo_data *lo = lo_data(req);
92fb57b8 2381 struct lo_inode *inode;
7387863d
DDAG
2382 ssize_t ret;
2383 int saverr;
9f59d175 2384 int fd = -1;
7c6b6602 2385
92fb57b8
SH
2386 inode = lo_inode(req, ino);
2387 if (!inode) {
2388 fuse_reply_err(req, EBADF);
2389 return;
2390 }
2391
7387863d
DDAG
2392 saverr = ENOSYS;
2393 if (!lo_data(req)->xattr) {
2394 goto out;
2395 }
7c6b6602 2396
d240314a
EG
2397 fuse_log(FUSE_LOG_DEBUG, "lo_removexattr(ino=%" PRIu64 ", name=%s)\n", ino,
2398 name);
7c6b6602 2399
7387863d
DDAG
2400 if (inode->is_symlink) {
2401 /* Sorry, no race free way to setxattr on symlink. */
2402 saverr = EPERM;
2403 goto out;
2404 }
7c6b6602 2405
9f59d175
SH
2406 sprintf(procname, "%i", inode->fd);
2407 fd = openat(lo->proc_self_fd, procname, O_RDWR);
2408 if (fd < 0) {
2409 saverr = errno;
2410 goto out;
2411 }
7c6b6602 2412
9f59d175 2413 ret = fremovexattr(fd, name);
7387863d 2414 saverr = ret == -1 ? errno : 0;
7c6b6602
DDAG
2415
2416out:
9f59d175
SH
2417 if (fd >= 0) {
2418 close(fd);
2419 }
c241aa94
SH
2420
2421 lo_inode_put(lo, &inode);
7387863d 2422 fuse_reply_err(req, saverr);
7c6b6602
DDAG
2423}
2424
#ifdef HAVE_COPY_FILE_RANGE
/*
 * FUSE copy_file_range handler.
 *
 * Translates both FUSE file handles into host file descriptors with
 * lo_fi_fd() and forwards the request to copy_file_range(2). Replies with
 * the number of bytes copied, or with errno on failure.
 */
static void lo_copy_file_range(fuse_req_t req, fuse_ino_t ino_in, off_t off_in,
                               struct fuse_file_info *fi_in, fuse_ino_t ino_out,
                               off_t off_out, struct fuse_file_info *fi_out,
                               size_t len, int flags)
{
    int in_fd, out_fd;
    ssize_t res;

    in_fd = lo_fi_fd(req, fi_in);
    out_fd = lo_fi_fd(req, fi_out);

    /*
     * off_t is signed and its width depends on the build configuration, so
     * print it through intmax_t instead of assuming unsigned long; size_t
     * takes %zu.
     */
    fuse_log(FUSE_LOG_DEBUG,
             "lo_copy_file_range(ino=%" PRIu64 "/fd=%d, "
             "off=%jd, ino=%" PRIu64 "/fd=%d, "
             "off=%jd, size=%zu, flags=0x%x)\n",
             ino_in, in_fd, (intmax_t)off_in, ino_out, out_fd,
             (intmax_t)off_out, len, flags);

    res = copy_file_range(in_fd, &off_in, out_fd, &off_out, len, flags);
    if (res < 0) {
        fuse_reply_err(req, errno);
    } else {
        fuse_reply_write(req, res);
    }
}
#endif
2451
2452static void lo_lseek(fuse_req_t req, fuse_ino_t ino, off_t off, int whence,
7387863d 2453 struct fuse_file_info *fi)
7c6b6602 2454{
7387863d
DDAG
2455 off_t res;
2456
2457 (void)ino;
73b4d19d 2458 res = lseek(lo_fi_fd(req, fi), off, whence);
7387863d
DDAG
2459 if (res != -1) {
2460 fuse_reply_lseek(req, res);
2461 } else {
2462 fuse_reply_err(req, errno);
2463 }
7c6b6602
DDAG
2464}
2465
771b01eb
DDAG
2466static void lo_destroy(void *userdata)
2467{
2468 struct lo_data *lo = (struct lo_data *)userdata;
28f7a3b0 2469
fe4c1579 2470 pthread_mutex_lock(&lo->mutex);
28f7a3b0
SH
2471 while (true) {
2472 GHashTableIter iter;
2473 gpointer key, value;
2474
2475 g_hash_table_iter_init(&iter, lo->inodes);
2476 if (!g_hash_table_iter_next(&iter, &key, &value)) {
2477 break;
2478 }
2479
2480 struct lo_inode *inode = value;
fe4c1579 2481 unref_inode(lo, inode, inode->nlookup);
28f7a3b0 2482 }
fe4c1579 2483 pthread_mutex_unlock(&lo->mutex);
771b01eb
DDAG
2484}
2485
7c6b6602 2486static struct fuse_lowlevel_ops lo_oper = {
7387863d
DDAG
2487 .init = lo_init,
2488 .lookup = lo_lookup,
2489 .mkdir = lo_mkdir,
2490 .mknod = lo_mknod,
2491 .symlink = lo_symlink,
2492 .link = lo_link,
2493 .unlink = lo_unlink,
2494 .rmdir = lo_rmdir,
2495 .rename = lo_rename,
2496 .forget = lo_forget,
2497 .forget_multi = lo_forget_multi,
2498 .getattr = lo_getattr,
2499 .setattr = lo_setattr,
2500 .readlink = lo_readlink,
2501 .opendir = lo_opendir,
2502 .readdir = lo_readdir,
2503 .readdirplus = lo_readdirplus,
2504 .releasedir = lo_releasedir,
2505 .fsyncdir = lo_fsyncdir,
2506 .create = lo_create,
0e81414c
VG
2507 .getlk = lo_getlk,
2508 .setlk = lo_setlk,
7387863d
DDAG
2509 .open = lo_open,
2510 .release = lo_release,
2511 .flush = lo_flush,
2512 .fsync = lo_fsync,
2513 .read = lo_read,
2514 .write_buf = lo_write_buf,
2515 .statfs = lo_statfs,
2516 .fallocate = lo_fallocate,
2517 .flock = lo_flock,
2518 .getxattr = lo_getxattr,
2519 .listxattr = lo_listxattr,
2520 .setxattr = lo_setxattr,
2521 .removexattr = lo_removexattr,
7c6b6602 2522#ifdef HAVE_COPY_FILE_RANGE
7387863d 2523 .copy_file_range = lo_copy_file_range,
7c6b6602 2524#endif
7387863d 2525 .lseek = lo_lseek,
771b01eb 2526 .destroy = lo_destroy,
7c6b6602
DDAG
2527};
2528
45018fbb
SH
/*
 * Print the vhost-user.json backend program capabilities to stdout:
 * a JSON object announcing backend type "fs".
 */
static void print_capabilities(void)
{
    fputs("{\n"
          " \"type\": \"fs\"\n"
          "}\n",
          stdout);
}
2536
d74830d1 2537/*
8e1d4ef2 2538 * Move to a new mount, net, and pid namespaces to isolate this process.
d74830d1 2539 */
8e1d4ef2 2540static void setup_namespaces(struct lo_data *lo, struct fuse_session *se)
d74830d1 2541{
8e1d4ef2
SH
2542 pid_t child;
2543
2544 /*
2545 * Create a new pid namespace for *child* processes. We'll have to
2546 * fork in order to enter the new pid namespace. A new mount namespace
2547 * is also needed so that we can remount /proc for the new pid
2548 * namespace.
2549 *
2550 * Our UNIX domain sockets have been created. Now we can move to
2551 * an empty network namespace to prevent TCP/IP and other network
2552 * activity in case this process is compromised.
2553 */
2554 if (unshare(CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWNET) != 0) {
2555 fuse_log(FUSE_LOG_ERR, "unshare(CLONE_NEWPID | CLONE_NEWNS): %m\n");
2556 exit(1);
2557 }
2558
2559 child = fork();
2560 if (child < 0) {
2561 fuse_log(FUSE_LOG_ERR, "fork() failed: %m\n");
2562 exit(1);
2563 }
2564 if (child > 0) {
2565 pid_t waited;
2566 int wstatus;
2567
2568 /* The parent waits for the child */
2569 do {
2570 waited = waitpid(child, &wstatus, 0);
2571 } while (waited < 0 && errno == EINTR && !se->exited);
2572
2573 /* We were terminated by a signal, see fuse_signals.c */
2574 if (se->exited) {
2575 exit(0);
2576 }
2577
2578 if (WIFEXITED(wstatus)) {
2579 exit(WEXITSTATUS(wstatus));
2580 }
2581
2582 exit(1);
2583 }
2584
2585 /* Send us SIGTERM when the parent thread terminates, see prctl(2) */
2586 prctl(PR_SET_PDEATHSIG, SIGTERM);
2587
2588 /*
2589 * If the mounts have shared propagation then we want to opt out so our
2590 * mount changes don't affect the parent mount namespace.
2591 */
2592 if (mount(NULL, "/", NULL, MS_REC | MS_SLAVE, NULL) < 0) {
2593 fuse_log(FUSE_LOG_ERR, "mount(/, MS_REC|MS_SLAVE): %m\n");
2594 exit(1);
2595 }
2596
2597 /* The child must remount /proc to use the new pid namespace */
2598 if (mount("proc", "/proc", "proc",
2599 MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RELATIME, NULL) < 0) {
2600 fuse_log(FUSE_LOG_ERR, "mount(/proc): %m\n");
2601 exit(1);
2602 }
2603
2604 /* Now we can get our /proc/self/fd directory file descriptor */
2605 lo->proc_self_fd = open("/proc/self/fd", O_PATH);
2606 if (lo->proc_self_fd == -1) {
2607 fuse_log(FUSE_LOG_ERR, "open(/proc/self/fd, O_PATH): %m\n");
d74830d1
SH
2608 exit(1);
2609 }
2610}
2611
2405f3c0
DDAG
2612/*
2613 * Capture the capability state, we'll need to restore this for individual
2614 * threads later; see load_capng.
2615 */
2616static void setup_capng(void)
2617{
2618 /* Note this accesses /proc so has to happen before the sandbox */
2619 if (capng_get_caps_process()) {
2620 fuse_log(FUSE_LOG_ERR, "capng_get_caps_process\n");
2621 exit(1);
2622 }
2623 pthread_mutex_init(&cap.mutex, NULL);
2624 pthread_mutex_lock(&cap.mutex);
2625 cap.saved = capng_save_state();
2626 if (!cap.saved) {
2627 fuse_log(FUSE_LOG_ERR, "capng_save_state\n");
2628 exit(1);
2629 }
2630 pthread_mutex_unlock(&cap.mutex);
2631}
2632
2633static void cleanup_capng(void)
2634{
2635 free(cap.saved);
2636 cap.saved = NULL;
2637 pthread_mutex_destroy(&cap.mutex);
2638}
2639
2640
8e1d4ef2
SH
2641/*
2642 * Make the source directory our root so symlinks cannot escape and no other
2643 * files are accessible. Assumes unshare(CLONE_NEWNS) was already called.
2644 */
2645static void setup_mounts(const char *source)
5baa3b8e
SH
2646{
2647 int oldroot;
2648 int newroot;
2649
8e1d4ef2
SH
2650 if (mount(source, source, NULL, MS_BIND, NULL) < 0) {
2651 fuse_log(FUSE_LOG_ERR, "mount(%s, %s, MS_BIND): %m\n", source, source);
2652 exit(1);
2653 }
2654
2655 /* This magic is based on lxc's lxc_pivot_root() */
5baa3b8e
SH
2656 oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
2657 if (oldroot < 0) {
2658 fuse_log(FUSE_LOG_ERR, "open(/): %m\n");
2659 exit(1);
2660 }
2661
2662 newroot = open(source, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
2663 if (newroot < 0) {
2664 fuse_log(FUSE_LOG_ERR, "open(%s): %m\n", source);
2665 exit(1);
2666 }
2667
2668 if (fchdir(newroot) < 0) {
2669 fuse_log(FUSE_LOG_ERR, "fchdir(newroot): %m\n");
2670 exit(1);
2671 }
2672
2673 if (syscall(__NR_pivot_root, ".", ".") < 0) {
2674 fuse_log(FUSE_LOG_ERR, "pivot_root(., .): %m\n");
2675 exit(1);
2676 }
2677
2678 if (fchdir(oldroot) < 0) {
2679 fuse_log(FUSE_LOG_ERR, "fchdir(oldroot): %m\n");
2680 exit(1);
2681 }
2682
2683 if (mount("", ".", "", MS_SLAVE | MS_REC, NULL) < 0) {
2684 fuse_log(FUSE_LOG_ERR, "mount(., MS_SLAVE | MS_REC): %m\n");
2685 exit(1);
2686 }
2687
2688 if (umount2(".", MNT_DETACH) < 0) {
2689 fuse_log(FUSE_LOG_ERR, "umount2(., MNT_DETACH): %m\n");
2690 exit(1);
2691 }
2692
2693 if (fchdir(newroot) < 0) {
2694 fuse_log(FUSE_LOG_ERR, "fchdir(newroot): %m\n");
2695 exit(1);
2696 }
2697
2698 close(newroot);
2699 close(oldroot);
2700}
2701
5baa3b8e
SH
2702/*
2703 * Lock down this process to prevent access to other processes or files outside
2704 * source directory. This reduces the impact of arbitrary code execution bugs.
2705 */
f185621d
SH
2706static void setup_sandbox(struct lo_data *lo, struct fuse_session *se,
2707 bool enable_syslog)
5baa3b8e 2708{
8e1d4ef2
SH
2709 setup_namespaces(lo, se);
2710 setup_mounts(lo->source);
f185621d 2711 setup_seccomp(enable_syslog);
5baa3b8e
SH
2712}
2713
01a6dc95
SH
2714/* Raise the maximum number of open file descriptors */
2715static void setup_nofile_rlimit(void)
2716{
2717 const rlim_t max_fds = 1000000;
2718 struct rlimit rlim;
2719
2720 if (getrlimit(RLIMIT_NOFILE, &rlim) < 0) {
2721 fuse_log(FUSE_LOG_ERR, "getrlimit(RLIMIT_NOFILE): %m\n");
2722 exit(1);
2723 }
2724
2725 if (rlim.rlim_cur >= max_fds) {
2726 return; /* nothing to do */
2727 }
2728
2729 rlim.rlim_cur = max_fds;
2730 rlim.rlim_max = max_fds;
2731
2732 if (setrlimit(RLIMIT_NOFILE, &rlim) < 0) {
2733 /* Ignore SELinux denials */
2734 if (errno == EPERM) {
2735 return;
2736 }
2737
2738 fuse_log(FUSE_LOG_ERR, "setrlimit(RLIMIT_NOFILE): %m\n");
2739 exit(1);
2740 }
2741}
2742
f185621d
SH
2743static void log_func(enum fuse_log_level level, const char *fmt, va_list ap)
2744{
36f38469
MM
2745 g_autofree char *localfmt = NULL;
2746
d240314a
EG
2747 if (current_log_level < level) {
2748 return;
2749 }
2750
36f38469 2751 if (current_log_level == FUSE_LOG_DEBUG) {
50fb955a
MM
2752 if (!use_syslog) {
2753 localfmt = g_strdup_printf("[%" PRId64 "] [ID: %08ld] %s",
2754 get_clock(), syscall(__NR_gettid), fmt);
2755 } else {
2756 localfmt = g_strdup_printf("[ID: %08ld] %s", syscall(__NR_gettid),
2757 fmt);
2758 }
36f38469
MM
2759 fmt = localfmt;
2760 }
2761
f185621d
SH
2762 if (use_syslog) {
2763 int priority = LOG_ERR;
2764 switch (level) {
2765 case FUSE_LOG_EMERG:
2766 priority = LOG_EMERG;
2767 break;
2768 case FUSE_LOG_ALERT:
2769 priority = LOG_ALERT;
2770 break;
2771 case FUSE_LOG_CRIT:
2772 priority = LOG_CRIT;
2773 break;
2774 case FUSE_LOG_ERR:
2775 priority = LOG_ERR;
2776 break;
2777 case FUSE_LOG_WARNING:
2778 priority = LOG_WARNING;
2779 break;
2780 case FUSE_LOG_NOTICE:
2781 priority = LOG_NOTICE;
2782 break;
2783 case FUSE_LOG_INFO:
2784 priority = LOG_INFO;
2785 break;
2786 case FUSE_LOG_DEBUG:
2787 priority = LOG_DEBUG;
2788 break;
2789 }
2790 vsyslog(priority, fmt, ap);
2791 } else {
2792 vfprintf(stderr, fmt, ap);
2793 }
2794}
2795
3ca8a2b1
MS
2796static void setup_root(struct lo_data *lo, struct lo_inode *root)
2797{
2798 int fd, res;
2799 struct stat stat;
2800
2801 fd = open("/", O_PATH);
2802 if (fd == -1) {
2803 fuse_log(FUSE_LOG_ERR, "open(%s, O_PATH): %m\n", lo->source);
2804 exit(1);
2805 }
2806
2807 res = fstatat(fd, "", &stat, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
2808 if (res == -1) {
2809 fuse_log(FUSE_LOG_ERR, "fstatat(%s): %m\n", lo->source);
2810 exit(1);
2811 }
2812
2813 root->is_symlink = false;
2814 root->fd = fd;
bfc50a6e
MS
2815 root->key.ino = stat.st_ino;
2816 root->key.dev = stat.st_dev;
1222f015 2817 root->nlookup = 2;
c241aa94 2818 g_atomic_int_set(&root->refcount, 2);
3ca8a2b1
MS
2819}
2820
bfc50a6e
MS
2821static guint lo_key_hash(gconstpointer key)
2822{
2823 const struct lo_key *lkey = key;
2824
2825 return (guint)lkey->ino + (guint)lkey->dev;
2826}
2827
2828static gboolean lo_key_equal(gconstpointer a, gconstpointer b)
2829{
2830 const struct lo_key *la = a;
2831 const struct lo_key *lb = b;
2832
2833 return la->ino == lb->ino && la->dev == lb->dev;
2834}
2835
18a69cbb
LB
2836static void fuse_lo_data_cleanup(struct lo_data *lo)
2837{
2838 if (lo->inodes) {
2839 g_hash_table_destroy(lo->inodes);
2840 }
2841 lo_map_destroy(&lo->fd_map);
2842 lo_map_destroy(&lo->dirp_map);
2843 lo_map_destroy(&lo->ino_map);
2844
2845 if (lo->proc_self_fd >= 0) {
2846 close(lo->proc_self_fd);
2847 }
2848
2849 if (lo->root.fd >= 0) {
2850 close(lo->root.fd);
2851 }
2852
2853 free(lo->source);
2854}
2855
7c6b6602
DDAG
2856int main(int argc, char *argv[])
2857{
7387863d
DDAG
2858 struct fuse_args args = FUSE_ARGS_INIT(argc, argv);
2859 struct fuse_session *se;
2860 struct fuse_cmdline_opts opts;
9f59d175
SH
2861 struct lo_data lo = {
2862 .debug = 0,
2863 .writeback = 0,
0e81414c 2864 .posix_lock = 1,
9f59d175
SH
2865 .proc_self_fd = -1,
2866 };
92fb57b8 2867 struct lo_map_elem *root_elem;
7387863d
DDAG
2868 int ret = -1;
2869
2870 /* Don't mask creation mode, kernel already did that */
2871 umask(0);
2872
2873 pthread_mutex_init(&lo.mutex, NULL);
bfc50a6e 2874 lo.inodes = g_hash_table_new(lo_key_hash, lo_key_equal);
7387863d 2875 lo.root.fd = -1;
92fb57b8 2876 lo.root.fuse_ino = FUSE_ROOT_ID;
230e777b 2877 lo.cache = CACHE_AUTO;
7387863d 2878
92fb57b8
SH
2879 /*
2880 * Set up the ino map like this:
2881 * [0] Reserved (will not be used)
2882 * [1] Root inode
2883 */
2884 lo_map_init(&lo.ino_map);
2885 lo_map_reserve(&lo.ino_map, 0)->in_use = false;
2886 root_elem = lo_map_reserve(&lo.ino_map, lo.root.fuse_ino);
2887 root_elem->inode = &lo.root;
2888
b39bce12 2889 lo_map_init(&lo.dirp_map);
73b4d19d 2890 lo_map_init(&lo.fd_map);
b39bce12 2891
7387863d 2892 if (fuse_parse_cmdline(&args, &opts) != 0) {
c6de8046 2893 goto err_out1;
7387863d 2894 }
f185621d
SH
2895 fuse_set_log_func(log_func);
2896 use_syslog = opts.syslog;
2897 if (use_syslog) {
2898 openlog("virtiofsd", LOG_PID, LOG_DAEMON);
2899 }
c6de8046 2900
7387863d 2901 if (opts.show_help) {
67aab022 2902 printf("usage: %s [options]\n\n", argv[0]);
7387863d 2903 fuse_cmdline_help();
4ff075f7 2904 printf(" -o source=PATH shared directory tree\n");
7387863d
DDAG
2905 fuse_lowlevel_help();
2906 ret = 0;
2907 goto err_out1;
2908 } else if (opts.show_version) {
2909 fuse_lowlevel_version();
2910 ret = 0;
2911 goto err_out1;
45018fbb
SH
2912 } else if (opts.print_capabilities) {
2913 print_capabilities();
2914 ret = 0;
2915 goto err_out1;
7387863d
DDAG
2916 }
2917
7387863d 2918 if (fuse_opt_parse(&args, &lo, lo_opts, NULL) == -1) {
c6de8046 2919 goto err_out1;
7387863d
DDAG
2920 }
2921
d240314a
EG
2922 /*
2923 * log_level is 0 if not configured via cmd options (0 is LOG_EMERG,
2924 * and we don't use this log level).
2925 */
2926 if (opts.log_level != 0) {
2927 current_log_level = opts.log_level;
2928 }
7387863d 2929 lo.debug = opts.debug;
d240314a
EG
2930 if (lo.debug) {
2931 current_log_level = FUSE_LOG_DEBUG;
2932 }
7387863d
DDAG
2933 if (lo.source) {
2934 struct stat stat;
2935 int res;
2936
2937 res = lstat(lo.source, &stat);
2938 if (res == -1) {
2939 fuse_log(FUSE_LOG_ERR, "failed to stat source (\"%s\"): %m\n",
2940 lo.source);
2941 exit(1);
2942 }
2943 if (!S_ISDIR(stat.st_mode)) {
2944 fuse_log(FUSE_LOG_ERR, "source is not a directory\n");
2945 exit(1);
2946 }
7387863d 2947 } else {
eb68a33b 2948 lo.source = strdup("/");
7387863d 2949 }
7387863d
DDAG
2950 if (!lo.timeout_set) {
2951 switch (lo.cache) {
230e777b 2952 case CACHE_NONE:
7387863d
DDAG
2953 lo.timeout = 0.0;
2954 break;
2955
230e777b 2956 case CACHE_AUTO:
7387863d
DDAG
2957 lo.timeout = 1.0;
2958 break;
2959
2960 case CACHE_ALWAYS:
2961 lo.timeout = 86400.0;
2962 break;
2963 }
2964 } else if (lo.timeout < 0) {
2965 fuse_log(FUSE_LOG_ERR, "timeout is negative (%lf)\n", lo.timeout);
2966 exit(1);
2967 }
2968
7387863d
DDAG
2969 se = fuse_session_new(&args, &lo_oper, sizeof(lo_oper), &lo);
2970 if (se == NULL) {
2971 goto err_out1;
2972 }
2973
2974 if (fuse_set_signal_handlers(se) != 0) {
2975 goto err_out2;
2976 }
2977
67aab022 2978 if (fuse_session_mount(se) != 0) {
7387863d
DDAG
2979 goto err_out3;
2980 }
2981
2982 fuse_daemonize(opts.foreground);
2983
01a6dc95
SH
2984 setup_nofile_rlimit();
2985
2405f3c0
DDAG
2986 /* Must be before sandbox since it wants /proc */
2987 setup_capng();
2988
f185621d 2989 setup_sandbox(&lo, se, opts.syslog);
5baa3b8e 2990
3ca8a2b1 2991 setup_root(&lo, &lo.root);
7387863d 2992 /* Block until ctrl+c or fusermount -u */
f6f3573c 2993 ret = virtio_loop(se);
7387863d
DDAG
2994
2995 fuse_session_unmount(se);
2405f3c0 2996 cleanup_capng();
7c6b6602 2997err_out3:
7387863d 2998 fuse_remove_signal_handlers(se);
7c6b6602 2999err_out2:
7387863d 3000 fuse_session_destroy(se);
7c6b6602 3001err_out1:
7387863d 3002 fuse_opt_free_args(&args);
7c6b6602 3003
18a69cbb 3004 fuse_lo_data_cleanup(&lo);
eb68a33b 3005
7387863d 3006 return ret ? 1 : 0;
7c6b6602 3007}