]> git.proxmox.com Git - mirror_qemu.git/blame - tools/virtiofsd/passthrough_ll.c
virtiofsd: Reset O_DIRECT flag during file open
[mirror_qemu.git] / tools / virtiofsd / passthrough_ll.c
CommitLineData
7c6b6602 1/*
7387863d
DDAG
2 * FUSE: Filesystem in Userspace
3 * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu>
4 *
5 * This program can be distributed under the terms of the GNU GPLv2.
6 * See the file COPYING.
7 */
7c6b6602 8
7387863d 9/*
7c6b6602
DDAG
10 *
11 * This file system mirrors the existing file system hierarchy of the
12 * system, starting at the root file system. This is implemented by
13 * just "passing through" all requests to the corresponding user-space
14 * libc functions. In contrast to passthrough.c and passthrough_fh.c,
15 * this implementation uses the low-level API. Its performance should
16 * be the least bad among the three, but many operations are not
17 * implemented. In particular, it is not possible to remove files (or
18 * directories) because the code necessary to defer actual removal
19 * until the file is not opened anymore would make the example much
20 * more complicated.
21 *
22 * When writeback caching is enabled (-o writeback mount option), it
23 * is only possible to write to files for which the mounting user has
24 * read permissions. This is because the writeback cache requires the
25 * kernel to be able to issue read requests for all files (which the
26 * passthrough filesystem cannot satisfy if it can't read the file in
27 * the underlying filesystem).
28 *
29 * Compile with:
30 *
7387863d
DDAG
31 * gcc -Wall passthrough_ll.c `pkg-config fuse3 --cflags --libs` -o
32 * passthrough_ll
7c6b6602
DDAG
33 *
34 * ## Source code ##
35 * \include passthrough_ll.c
36 */
37
09863ebc 38#include "qemu/osdep.h"
50fb955a 39#include "qemu/timer.h"
f6f3573c 40#include "fuse_virtio.h"
d240314a 41#include "fuse_log.h"
09863ebc 42#include "fuse_lowlevel.h"
7c6b6602 43#include <assert.h>
2405f3c0 44#include <cap-ng.h>
7387863d 45#include <dirent.h>
7c6b6602 46#include <errno.h>
36f38469 47#include <glib.h>
7c6b6602 48#include <inttypes.h>
7387863d 49#include <limits.h>
7c6b6602 50#include <pthread.h>
7387863d
DDAG
51#include <stdbool.h>
52#include <stddef.h>
53#include <stdio.h>
54#include <stdlib.h>
55#include <string.h>
7c6b6602 56#include <sys/file.h>
5baa3b8e 57#include <sys/mount.h>
8e1d4ef2 58#include <sys/prctl.h>
01a6dc95 59#include <sys/resource.h>
929cfb7a 60#include <sys/syscall.h>
8e1d4ef2
SH
61#include <sys/types.h>
62#include <sys/wait.h>
7c6b6602 63#include <sys/xattr.h>
f185621d 64#include <syslog.h>
7387863d 65#include <unistd.h>
7c6b6602
DDAG
66
67#include "passthrough_helpers.h"
4f8bde99 68#include "seccomp.h"
7c6b6602 69
0e81414c
VG
/*
 * POSIX lock bookkeeping for an inode: one of these exists per lock owner.
 */
struct lo_inode_plock {
    uint64_t lock_owner; /* owner this entry is tracked for */
    int fd;              /* fd for OFD locks */
};
75
25c13572
SH
/*
 * One slot of a struct lo_map.  While the slot is in use the union holds the
 * mapped object (which member is valid depends on the map the slot belongs
 * to); while it is free the union holds the index of the next free slot.
 */
struct lo_map_elem {
    union {
        struct lo_inode *inode;
        struct lo_dirp *dirp;
        int fd;
        ssize_t freelist; /* next free slot, or -1 at the end of the list */
    };
    bool in_use;
};
85
/*
 * Table translating FUSE fh or ino values (used as array indices) to
 * internal objects.
 */
struct lo_map {
    struct lo_map_elem *elems; /* slot array, NULL while empty */
    size_t nelems;             /* number of slots in elems */
    ssize_t freelist;          /* index of the first free slot, or -1 */
};
92
bfc50a6e
MS
/*
 * Identity of a host file (inode number plus device number).  Used as the
 * key when inodes are stored in and looked up from lo_data.inodes.
 */
struct lo_key {
    ino_t ino;
    dev_t dev;
};
97
7c6b6602 98struct lo_inode {
7387863d 99 int fd;
c241aa94
SH
100
101 /*
102 * Atomic reference count for this object. The nlookup field holds a
103 * reference and release it when nlookup reaches 0.
104 */
105 gint refcount;
106
bfc50a6e 107 struct lo_key key;
1222f015
SH
108
109 /*
110 * This counter keeps the inode alive during the FUSE session.
111 * Incremented when the FUSE inode number is sent in a reply
112 * (FUSE_LOOKUP, FUSE_READDIRPLUS, etc). Decremented when an inode is
113 * released by requests like FUSE_FORGET, FUSE_RMDIR, FUSE_RENAME, etc.
114 *
115 * Note that this value is untrusted because the client can manipulate
116 * it arbitrarily using FUSE_FORGET requests.
117 *
118 * Protected by lo->mutex.
119 */
120 uint64_t nlookup;
121
92fb57b8 122 fuse_ino_t fuse_ino;
0e81414c
VG
123 pthread_mutex_t plock_mutex;
124 GHashTable *posix_locks; /* protected by lo_inode->plock_mutex */
c241aa94
SH
125
126 bool is_symlink;
7c6b6602
DDAG
127};
128
929cfb7a
VG
/*
 * Effective credentials saved by lo_change_cred() so that
 * lo_restore_cred() can switch back to them.
 */
struct lo_cred {
    uid_t euid;
    gid_t egid;
};
133
/* Values of lo_data.cache, selected with the cache=none|auto|always option */
enum {
    CACHE_NONE,
    CACHE_AUTO,
    CACHE_ALWAYS,
};
139
140struct lo_data {
7387863d
DDAG
141 pthread_mutex_t mutex;
142 int debug;
5fe319a7 143 int norace;
7387863d
DDAG
144 int writeback;
145 int flock;
0e81414c 146 int posix_lock;
7387863d 147 int xattr;
eb68a33b 148 char *source;
7387863d
DDAG
149 double timeout;
150 int cache;
151 int timeout_set;
59aef494
MS
152 int readdirplus_set;
153 int readdirplus_clear;
bfc50a6e
MS
154 struct lo_inode root;
155 GHashTable *inodes; /* protected by lo->mutex */
92fb57b8 156 struct lo_map ino_map; /* protected by lo->mutex */
b39bce12 157 struct lo_map dirp_map; /* protected by lo->mutex */
73b4d19d 158 struct lo_map fd_map; /* protected by lo->mutex */
9f59d175
SH
159
160 /* An O_PATH file descriptor to /proc/self/fd/ */
161 int proc_self_fd;
7c6b6602
DDAG
162};
163
164static const struct fuse_opt lo_opts[] = {
7387863d
DDAG
165 { "writeback", offsetof(struct lo_data, writeback), 1 },
166 { "no_writeback", offsetof(struct lo_data, writeback), 0 },
167 { "source=%s", offsetof(struct lo_data, source), 0 },
168 { "flock", offsetof(struct lo_data, flock), 1 },
169 { "no_flock", offsetof(struct lo_data, flock), 0 },
0e81414c
VG
170 { "posix_lock", offsetof(struct lo_data, posix_lock), 1 },
171 { "no_posix_lock", offsetof(struct lo_data, posix_lock), 0 },
7387863d
DDAG
172 { "xattr", offsetof(struct lo_data, xattr), 1 },
173 { "no_xattr", offsetof(struct lo_data, xattr), 0 },
174 { "timeout=%lf", offsetof(struct lo_data, timeout), 0 },
175 { "timeout=", offsetof(struct lo_data, timeout_set), 1 },
230e777b
MS
176 { "cache=none", offsetof(struct lo_data, cache), CACHE_NONE },
177 { "cache=auto", offsetof(struct lo_data, cache), CACHE_AUTO },
7387863d 178 { "cache=always", offsetof(struct lo_data, cache), CACHE_ALWAYS },
5fe319a7 179 { "norace", offsetof(struct lo_data, norace), 1 },
59aef494
MS
180 { "readdirplus", offsetof(struct lo_data, readdirplus_set), 1 },
181 { "no_readdirplus", offsetof(struct lo_data, readdirplus_clear), 1 },
7387863d 182 FUSE_OPT_END
7c6b6602 183};
static bool use_syslog = false;
static int current_log_level;

/* Forward declarations for helpers used before their definition */
static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode,
                                 uint64_t n);
static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st);

/* Saved libcap-ng state; load_capng() accesses `saved` under `mutex` */
static struct {
    pthread_mutex_t mutex;
    void *saved;
} cap;
/* True once the current thread has loaded cap-ng state from cap.saved */
static __thread bool cap_loaded = false;
197
25dae28c
SH
/* Return non-zero when `name` is exactly "." or "..", zero otherwise. */
static int is_dot_or_dotdot(const char *name)
{
    if (name[0] != '.') {
        return 0;
    }
    if (name[1] == '\0') {
        return 1;
    }
    return name[1] == '.' && name[2] == '\0';
}
203
/*
 * Return non-zero when `path` is a single path component (contains no '/')
 * and is neither "." nor "..".
 */
static int is_safe_path_component(const char *path)
{
    return strchr(path, '/') == NULL && !is_dot_or_dotdot(path);
}
5fe319a7 213
7c6b6602
DDAG
214static struct lo_data *lo_data(fuse_req_t req)
215{
7387863d 216 return (struct lo_data *)fuse_req_userdata(req);
7c6b6602
DDAG
217}
218
2405f3c0
DDAG
219/*
220 * Load capng's state from our saved state if the current thread
221 * hadn't previously been loaded.
222 * returns 0 on success
223 */
224static int load_capng(void)
225{
226 if (!cap_loaded) {
227 pthread_mutex_lock(&cap.mutex);
228 capng_restore_state(&cap.saved);
229 /*
230 * restore_state free's the saved copy
231 * so make another.
232 */
233 cap.saved = capng_save_state();
234 if (!cap.saved) {
235 fuse_log(FUSE_LOG_ERR, "capng_save_state (thread)\n");
236 return -EINVAL;
237 }
238 pthread_mutex_unlock(&cap.mutex);
239
240 /*
241 * We want to use the loaded state for our pid,
242 * not the original
243 */
244 capng_setpid(syscall(SYS_gettid));
245 cap_loaded = true;
246 }
247 return 0;
248}
249
ee884652
VG
250/*
251 * Helpers for dropping and regaining effective capabilities. Returns 0
252 * on success, error otherwise
253 */
254static int drop_effective_cap(const char *cap_name, bool *cap_dropped)
255{
256 int cap, ret;
257
258 cap = capng_name_to_capability(cap_name);
259 if (cap < 0) {
260 ret = errno;
261 fuse_log(FUSE_LOG_ERR, "capng_name_to_capability(%s) failed:%s\n",
262 cap_name, strerror(errno));
263 goto out;
264 }
265
266 if (load_capng()) {
267 ret = errno;
268 fuse_log(FUSE_LOG_ERR, "load_capng() failed\n");
269 goto out;
270 }
271
272 /* We dont have this capability in effective set already. */
273 if (!capng_have_capability(CAPNG_EFFECTIVE, cap)) {
274 ret = 0;
275 goto out;
276 }
277
278 if (capng_update(CAPNG_DROP, CAPNG_EFFECTIVE, cap)) {
279 ret = errno;
280 fuse_log(FUSE_LOG_ERR, "capng_update(DROP,) failed\n");
281 goto out;
282 }
283
284 if (capng_apply(CAPNG_SELECT_CAPS)) {
285 ret = errno;
286 fuse_log(FUSE_LOG_ERR, "drop:capng_apply() failed\n");
287 goto out;
288 }
289
290 ret = 0;
291 if (cap_dropped) {
292 *cap_dropped = true;
293 }
294
295out:
296 return ret;
297}
298
299static int gain_effective_cap(const char *cap_name)
300{
301 int cap;
302 int ret = 0;
303
304 cap = capng_name_to_capability(cap_name);
305 if (cap < 0) {
306 ret = errno;
307 fuse_log(FUSE_LOG_ERR, "capng_name_to_capability(%s) failed:%s\n",
308 cap_name, strerror(errno));
309 goto out;
310 }
311
312 if (load_capng()) {
313 ret = errno;
314 fuse_log(FUSE_LOG_ERR, "load_capng() failed\n");
315 goto out;
316 }
317
318 if (capng_update(CAPNG_ADD, CAPNG_EFFECTIVE, cap)) {
319 ret = errno;
320 fuse_log(FUSE_LOG_ERR, "capng_update(ADD,) failed\n");
321 goto out;
322 }
323
324 if (capng_apply(CAPNG_SELECT_CAPS)) {
325 ret = errno;
326 fuse_log(FUSE_LOG_ERR, "gain:capng_apply() failed\n");
327 goto out;
328 }
329 ret = 0;
330
331out:
332 return ret;
333}
334
92fb57b8 335static void lo_map_init(struct lo_map *map)
25c13572
SH
336{
337 map->elems = NULL;
338 map->nelems = 0;
339 map->freelist = -1;
340}
341
92fb57b8 342static void lo_map_destroy(struct lo_map *map)
25c13572
SH
343{
344 free(map->elems);
345}
346
347static int lo_map_grow(struct lo_map *map, size_t new_nelems)
348{
349 struct lo_map_elem *new_elems;
350 size_t i;
351
352 if (new_nelems <= map->nelems) {
353 return 1;
354 }
355
356 new_elems = realloc(map->elems, sizeof(map->elems[0]) * new_nelems);
357 if (!new_elems) {
358 return 0;
359 }
360
361 for (i = map->nelems; i < new_nelems; i++) {
362 new_elems[i].freelist = i + 1;
363 new_elems[i].in_use = false;
364 }
365 new_elems[new_nelems - 1].freelist = -1;
366
367 map->elems = new_elems;
368 map->freelist = map->nelems;
369 map->nelems = new_nelems;
370 return 1;
371}
372
92fb57b8 373static struct lo_map_elem *lo_map_alloc_elem(struct lo_map *map)
25c13572
SH
374{
375 struct lo_map_elem *elem;
376
377 if (map->freelist == -1 && !lo_map_grow(map, map->nelems + 256)) {
378 return NULL;
379 }
380
381 elem = &map->elems[map->freelist];
382 map->freelist = elem->freelist;
383
384 elem->in_use = true;
385
386 return elem;
387}
388
92fb57b8 389static struct lo_map_elem *lo_map_reserve(struct lo_map *map, size_t key)
25c13572
SH
390{
391 ssize_t *prev;
392
393 if (!lo_map_grow(map, key + 1)) {
394 return NULL;
395 }
396
397 for (prev = &map->freelist; *prev != -1;
398 prev = &map->elems[*prev].freelist) {
399 if (*prev == key) {
400 struct lo_map_elem *elem = &map->elems[key];
401
402 *prev = elem->freelist;
403 elem->in_use = true;
404 return elem;
405 }
406 }
407 return NULL;
408}
409
92fb57b8 410static struct lo_map_elem *lo_map_get(struct lo_map *map, size_t key)
25c13572
SH
411{
412 if (key >= map->nelems) {
413 return NULL;
414 }
415 if (!map->elems[key].in_use) {
416 return NULL;
417 }
418 return &map->elems[key];
419}
420
92fb57b8 421static void lo_map_remove(struct lo_map *map, size_t key)
25c13572
SH
422{
423 struct lo_map_elem *elem;
424
425 if (key >= map->nelems) {
426 return;
427 }
428
429 elem = &map->elems[key];
430 if (!elem->in_use) {
431 return;
432 }
433
434 elem->in_use = false;
435
436 elem->freelist = map->freelist;
437 map->freelist = key;
438}
439
73b4d19d
SH
440/* Assumes lo->mutex is held */
441static ssize_t lo_add_fd_mapping(fuse_req_t req, int fd)
442{
443 struct lo_map_elem *elem;
444
445 elem = lo_map_alloc_elem(&lo_data(req)->fd_map);
446 if (!elem) {
447 return -1;
448 }
449
450 elem->fd = fd;
451 return elem - lo_data(req)->fd_map.elems;
452}
453
b39bce12
SH
454/* Assumes lo->mutex is held */
455static ssize_t lo_add_dirp_mapping(fuse_req_t req, struct lo_dirp *dirp)
456{
457 struct lo_map_elem *elem;
458
459 elem = lo_map_alloc_elem(&lo_data(req)->dirp_map);
460 if (!elem) {
461 return -1;
462 }
463
464 elem->dirp = dirp;
465 return elem - lo_data(req)->dirp_map.elems;
466}
467
92fb57b8
SH
468/* Assumes lo->mutex is held */
469static ssize_t lo_add_inode_mapping(fuse_req_t req, struct lo_inode *inode)
470{
471 struct lo_map_elem *elem;
472
473 elem = lo_map_alloc_elem(&lo_data(req)->ino_map);
474 if (!elem) {
475 return -1;
476 }
477
478 elem->inode = inode;
479 return elem - lo_data(req)->ino_map.elems;
480}
481
c241aa94
SH
482static void lo_inode_put(struct lo_data *lo, struct lo_inode **inodep)
483{
484 struct lo_inode *inode = *inodep;
485
486 if (!inode) {
487 return;
488 }
489
490 *inodep = NULL;
491
492 if (g_atomic_int_dec_and_test(&inode->refcount)) {
493 close(inode->fd);
494 free(inode);
495 }
496}
497
498/* Caller must release refcount using lo_inode_put() */
7c6b6602
DDAG
499static struct lo_inode *lo_inode(fuse_req_t req, fuse_ino_t ino)
500{
92fb57b8
SH
501 struct lo_data *lo = lo_data(req);
502 struct lo_map_elem *elem;
503
504 pthread_mutex_lock(&lo->mutex);
505 elem = lo_map_get(&lo->ino_map, ino);
c241aa94
SH
506 if (elem) {
507 g_atomic_int_inc(&elem->inode->refcount);
508 }
92fb57b8
SH
509 pthread_mutex_unlock(&lo->mutex);
510
511 if (!elem) {
512 return NULL;
7387863d 513 }
92fb57b8
SH
514
515 return elem->inode;
7c6b6602
DDAG
516}
517
c241aa94
SH
518/*
519 * TODO Remove this helper and force callers to hold an inode refcount until
520 * they are done with the fd. This will be done in a later patch to make
521 * review easier.
522 */
7c6b6602
DDAG
523static int lo_fd(fuse_req_t req, fuse_ino_t ino)
524{
92fb57b8 525 struct lo_inode *inode = lo_inode(req, ino);
c241aa94
SH
526 int fd;
527
528 if (!inode) {
529 return -1;
530 }
531
532 fd = inode->fd;
533 lo_inode_put(lo_data(req), &inode);
534 return fd;
7c6b6602
DDAG
535}
536
7387863d 537static void lo_init(void *userdata, struct fuse_conn_info *conn)
7c6b6602 538{
7387863d
DDAG
539 struct lo_data *lo = (struct lo_data *)userdata;
540
541 if (conn->capable & FUSE_CAP_EXPORT_SUPPORT) {
542 conn->want |= FUSE_CAP_EXPORT_SUPPORT;
543 }
544
545 if (lo->writeback && conn->capable & FUSE_CAP_WRITEBACK_CACHE) {
d240314a 546 fuse_log(FUSE_LOG_DEBUG, "lo_init: activating writeback\n");
7387863d
DDAG
547 conn->want |= FUSE_CAP_WRITEBACK_CACHE;
548 }
e468d4af
PT
549 if (conn->capable & FUSE_CAP_FLOCK_LOCKS) {
550 if (lo->flock) {
551 fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n");
552 conn->want |= FUSE_CAP_FLOCK_LOCKS;
553 } else {
554 fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling flock locks\n");
555 conn->want &= ~FUSE_CAP_FLOCK_LOCKS;
556 }
7387863d 557 }
0e81414c
VG
558
559 if (conn->capable & FUSE_CAP_POSIX_LOCKS) {
560 if (lo->posix_lock) {
561 fuse_log(FUSE_LOG_DEBUG, "lo_init: activating posix locks\n");
562 conn->want |= FUSE_CAP_POSIX_LOCKS;
563 } else {
564 fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling posix locks\n");
565 conn->want &= ~FUSE_CAP_POSIX_LOCKS;
566 }
567 }
568
230e777b 569 if ((lo->cache == CACHE_NONE && !lo->readdirplus_set) ||
59aef494 570 lo->readdirplus_clear) {
ddcbabcb
MS
571 fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling readdirplus\n");
572 conn->want &= ~FUSE_CAP_READDIRPLUS;
573 }
7c6b6602
DDAG
574}
575
576static void lo_getattr(fuse_req_t req, fuse_ino_t ino,
7387863d 577 struct fuse_file_info *fi)
7c6b6602 578{
7387863d
DDAG
579 int res;
580 struct stat buf;
581 struct lo_data *lo = lo_data(req);
7c6b6602 582
7387863d 583 (void)fi;
7c6b6602 584
7387863d
DDAG
585 res =
586 fstatat(lo_fd(req, ino), "", &buf, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
587 if (res == -1) {
588 return (void)fuse_reply_err(req, errno);
589 }
7c6b6602 590
7387863d 591 fuse_reply_attr(req, &buf, lo->timeout);
7c6b6602
DDAG
592}
593
c241aa94
SH
594/*
595 * Increments parent->nlookup and caller must release refcount using
596 * lo_inode_put(&parent).
597 */
5fe319a7
MS
598static int lo_parent_and_name(struct lo_data *lo, struct lo_inode *inode,
599 char path[PATH_MAX], struct lo_inode **parent)
7c6b6602 600{
7387863d 601 char procname[64];
5fe319a7
MS
602 char *last;
603 struct stat stat;
604 struct lo_inode *p;
605 int retries = 2;
606 int res;
607
608retry:
9f59d175 609 sprintf(procname, "%i", inode->fd);
5fe319a7 610
9f59d175 611 res = readlinkat(lo->proc_self_fd, procname, path, PATH_MAX);
5fe319a7
MS
612 if (res < 0) {
613 fuse_log(FUSE_LOG_WARNING, "%s: readlink failed: %m\n", __func__);
614 goto fail_noretry;
615 }
616
617 if (res >= PATH_MAX) {
618 fuse_log(FUSE_LOG_WARNING, "%s: readlink overflowed\n", __func__);
619 goto fail_noretry;
620 }
621 path[res] = '\0';
622
623 last = strrchr(path, '/');
624 if (last == NULL) {
625 /* Shouldn't happen */
626 fuse_log(
627 FUSE_LOG_WARNING,
628 "%s: INTERNAL ERROR: bad path read from proc\n", __func__);
629 goto fail_noretry;
630 }
631 if (last == path) {
632 p = &lo->root;
633 pthread_mutex_lock(&lo->mutex);
1222f015 634 p->nlookup++;
c241aa94 635 g_atomic_int_inc(&p->refcount);
5fe319a7
MS
636 pthread_mutex_unlock(&lo->mutex);
637 } else {
638 *last = '\0';
639 res = fstatat(AT_FDCWD, last == path ? "/" : path, &stat, 0);
640 if (res == -1) {
641 if (!retries) {
642 fuse_log(FUSE_LOG_WARNING,
643 "%s: failed to stat parent: %m\n", __func__);
644 }
645 goto fail;
646 }
647 p = lo_find(lo, &stat);
648 if (p == NULL) {
649 if (!retries) {
650 fuse_log(FUSE_LOG_WARNING,
651 "%s: failed to find parent\n", __func__);
652 }
653 goto fail;
654 }
655 }
656 last++;
657 res = fstatat(p->fd, last, &stat, AT_SYMLINK_NOFOLLOW);
658 if (res == -1) {
659 if (!retries) {
660 fuse_log(FUSE_LOG_WARNING,
661 "%s: failed to stat last\n", __func__);
662 }
663 goto fail_unref;
664 }
bfc50a6e 665 if (stat.st_dev != inode->key.dev || stat.st_ino != inode->key.ino) {
5fe319a7
MS
666 if (!retries) {
667 fuse_log(FUSE_LOG_WARNING,
668 "%s: failed to match last\n", __func__);
669 }
670 goto fail_unref;
671 }
672 *parent = p;
673 memmove(path, last, strlen(last) + 1);
674
675 return 0;
676
677fail_unref:
95d27157 678 unref_inode_lolocked(lo, p, 1);
c241aa94 679 lo_inode_put(lo, &p);
5fe319a7
MS
680fail:
681 if (retries) {
682 retries--;
683 goto retry;
684 }
685fail_noretry:
686 errno = EIO;
687 return -1;
688}
689
690static int utimensat_empty(struct lo_data *lo, struct lo_inode *inode,
691 const struct timespec *tv)
692{
693 int res;
694 struct lo_inode *parent;
695 char path[PATH_MAX];
7387863d
DDAG
696
697 if (inode->is_symlink) {
5fe319a7 698 res = utimensat(inode->fd, "", tv, AT_EMPTY_PATH);
7387863d
DDAG
699 if (res == -1 && errno == EINVAL) {
700 /* Sorry, no race free way to set times on symlink. */
5fe319a7
MS
701 if (lo->norace) {
702 errno = EPERM;
703 } else {
704 goto fallback;
705 }
7387863d
DDAG
706 }
707 return res;
708 }
9f59d175 709 sprintf(path, "%i", inode->fd);
5fe319a7 710
9f59d175 711 return utimensat(lo->proc_self_fd, path, tv, 0);
7387863d 712
5fe319a7
MS
713fallback:
714 res = lo_parent_and_name(lo, inode, path, &parent);
715 if (res != -1) {
716 res = utimensat(parent->fd, path, tv, AT_SYMLINK_NOFOLLOW);
95d27157 717 unref_inode_lolocked(lo, parent, 1);
c241aa94 718 lo_inode_put(lo, &parent);
5fe319a7
MS
719 }
720
721 return res;
7c6b6602
DDAG
722}
723
73b4d19d
SH
724static int lo_fi_fd(fuse_req_t req, struct fuse_file_info *fi)
725{
726 struct lo_data *lo = lo_data(req);
727 struct lo_map_elem *elem;
728
729 pthread_mutex_lock(&lo->mutex);
730 elem = lo_map_get(&lo->fd_map, fi->fh);
731 pthread_mutex_unlock(&lo->mutex);
732
733 if (!elem) {
734 return -1;
735 }
736
737 return elem->fd;
738}
739
7c6b6602 740static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr,
7387863d 741 int valid, struct fuse_file_info *fi)
7c6b6602 742{
7387863d
DDAG
743 int saverr;
744 char procname[64];
5fe319a7 745 struct lo_data *lo = lo_data(req);
92fb57b8
SH
746 struct lo_inode *inode;
747 int ifd;
7387863d 748 int res;
73b4d19d 749 int fd;
7387863d 750
92fb57b8
SH
751 inode = lo_inode(req, ino);
752 if (!inode) {
753 fuse_reply_err(req, EBADF);
754 return;
755 }
756
757 ifd = inode->fd;
758
73b4d19d
SH
759 /* If fi->fh is invalid we'll report EBADF later */
760 if (fi) {
761 fd = lo_fi_fd(req, fi);
762 }
763
7387863d
DDAG
764 if (valid & FUSE_SET_ATTR_MODE) {
765 if (fi) {
73b4d19d 766 res = fchmod(fd, attr->st_mode);
7387863d 767 } else {
9f59d175
SH
768 sprintf(procname, "%i", ifd);
769 res = fchmodat(lo->proc_self_fd, procname, attr->st_mode, 0);
7387863d
DDAG
770 }
771 if (res == -1) {
772 goto out_err;
773 }
774 }
775 if (valid & (FUSE_SET_ATTR_UID | FUSE_SET_ATTR_GID)) {
776 uid_t uid = (valid & FUSE_SET_ATTR_UID) ? attr->st_uid : (uid_t)-1;
777 gid_t gid = (valid & FUSE_SET_ATTR_GID) ? attr->st_gid : (gid_t)-1;
778
779 res = fchownat(ifd, "", uid, gid, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
780 if (res == -1) {
781 goto out_err;
782 }
783 }
784 if (valid & FUSE_SET_ATTR_SIZE) {
9f59d175
SH
785 int truncfd;
786
7387863d 787 if (fi) {
9f59d175 788 truncfd = fd;
7387863d 789 } else {
9f59d175
SH
790 sprintf(procname, "%i", ifd);
791 truncfd = openat(lo->proc_self_fd, procname, O_RDWR);
792 if (truncfd < 0) {
793 goto out_err;
794 }
795 }
796
797 res = ftruncate(truncfd, attr->st_size);
798 if (!fi) {
799 saverr = errno;
800 close(truncfd);
801 errno = saverr;
7387863d
DDAG
802 }
803 if (res == -1) {
804 goto out_err;
805 }
806 }
807 if (valid & (FUSE_SET_ATTR_ATIME | FUSE_SET_ATTR_MTIME)) {
808 struct timespec tv[2];
809
810 tv[0].tv_sec = 0;
811 tv[1].tv_sec = 0;
812 tv[0].tv_nsec = UTIME_OMIT;
813 tv[1].tv_nsec = UTIME_OMIT;
814
815 if (valid & FUSE_SET_ATTR_ATIME_NOW) {
816 tv[0].tv_nsec = UTIME_NOW;
817 } else if (valid & FUSE_SET_ATTR_ATIME) {
818 tv[0] = attr->st_atim;
819 }
820
821 if (valid & FUSE_SET_ATTR_MTIME_NOW) {
822 tv[1].tv_nsec = UTIME_NOW;
823 } else if (valid & FUSE_SET_ATTR_MTIME) {
824 tv[1] = attr->st_mtim;
825 }
826
827 if (fi) {
73b4d19d 828 res = futimens(fd, tv);
7387863d 829 } else {
5fe319a7 830 res = utimensat_empty(lo, inode, tv);
7387863d
DDAG
831 }
832 if (res == -1) {
833 goto out_err;
834 }
835 }
c241aa94 836 lo_inode_put(lo, &inode);
7387863d
DDAG
837
838 return lo_getattr(req, ino, fi);
7c6b6602
DDAG
839
840out_err:
7387863d 841 saverr = errno;
c241aa94 842 lo_inode_put(lo, &inode);
7387863d 843 fuse_reply_err(req, saverr);
7c6b6602
DDAG
844}
845
846static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st)
847{
7387863d 848 struct lo_inode *p;
bfc50a6e
MS
849 struct lo_key key = {
850 .ino = st->st_ino,
851 .dev = st->st_dev,
852 };
7387863d
DDAG
853
854 pthread_mutex_lock(&lo->mutex);
bfc50a6e
MS
855 p = g_hash_table_lookup(lo->inodes, &key);
856 if (p) {
1222f015
SH
857 assert(p->nlookup > 0);
858 p->nlookup++;
c241aa94 859 g_atomic_int_inc(&p->refcount);
7387863d
DDAG
860 }
861 pthread_mutex_unlock(&lo->mutex);
bfc50a6e
MS
862
863 return p;
7c6b6602
DDAG
864}
865
0e81414c
VG
866/* value_destroy_func for posix_locks GHashTable */
867static void posix_locks_value_destroy(gpointer data)
868{
869 struct lo_inode_plock *plock = data;
870
871 /*
872 * We had used open() for locks and had only one fd. So
873 * closing this fd should release all OFD locks.
874 */
875 close(plock->fd);
876 free(plock);
877}
878
c241aa94
SH
879/*
880 * Increments nlookup and caller must release refcount using
881 * lo_inode_put(&parent).
882 */
7c6b6602 883static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name,
7387863d 884 struct fuse_entry_param *e)
7c6b6602 885{
7387863d
DDAG
886 int newfd;
887 int res;
888 int saverr;
889 struct lo_data *lo = lo_data(req);
c241aa94
SH
890 struct lo_inode *inode = NULL;
891 struct lo_inode *dir = lo_inode(req, parent);
7387863d 892
9de4fab5
MS
893 /*
894 * name_to_handle_at() and open_by_handle_at() can reach here with fuse
895 * mount point in guest, but we don't have its inode info in the
896 * ino_map.
897 */
898 if (!dir) {
899 return ENOENT;
900 }
901
7387863d
DDAG
902 memset(e, 0, sizeof(*e));
903 e->attr_timeout = lo->timeout;
904 e->entry_timeout = lo->timeout;
905
854684bc
SH
906 /* Do not allow escaping root directory */
907 if (dir == &lo->root && strcmp(name, "..") == 0) {
908 name = ".";
909 }
910
9de4fab5 911 newfd = openat(dir->fd, name, O_PATH | O_NOFOLLOW);
7387863d
DDAG
912 if (newfd == -1) {
913 goto out_err;
914 }
915
916 res = fstatat(newfd, "", &e->attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
917 if (res == -1) {
918 goto out_err;
919 }
920
9de4fab5 921 inode = lo_find(lo, &e->attr);
7387863d
DDAG
922 if (inode) {
923 close(newfd);
924 newfd = -1;
925 } else {
7387863d
DDAG
926 inode = calloc(1, sizeof(struct lo_inode));
927 if (!inode) {
928 goto out_err;
929 }
930
931 inode->is_symlink = S_ISLNK(e->attr.st_mode);
c241aa94
SH
932
933 /*
934 * One for the caller and one for nlookup (released in
935 * unref_inode_lolocked())
936 */
937 g_atomic_int_set(&inode->refcount, 2);
938
1222f015 939 inode->nlookup = 1;
7387863d 940 inode->fd = newfd;
9de4fab5 941 newfd = -1;
bfc50a6e
MS
942 inode->key.ino = e->attr.st_ino;
943 inode->key.dev = e->attr.st_dev;
0e81414c
VG
944 pthread_mutex_init(&inode->plock_mutex, NULL);
945 inode->posix_locks = g_hash_table_new_full(
946 g_direct_hash, g_direct_equal, NULL, posix_locks_value_destroy);
7387863d
DDAG
947
948 pthread_mutex_lock(&lo->mutex);
92fb57b8 949 inode->fuse_ino = lo_add_inode_mapping(req, inode);
bfc50a6e 950 g_hash_table_insert(lo->inodes, &inode->key, inode);
7387863d
DDAG
951 pthread_mutex_unlock(&lo->mutex);
952 }
92fb57b8 953 e->ino = inode->fuse_ino;
c241aa94
SH
954 lo_inode_put(lo, &inode);
955 lo_inode_put(lo, &dir);
7387863d 956
d240314a
EG
957 fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent,
958 name, (unsigned long long)e->ino);
7387863d
DDAG
959
960 return 0;
7c6b6602
DDAG
961
962out_err:
7387863d
DDAG
963 saverr = errno;
964 if (newfd != -1) {
965 close(newfd);
966 }
c241aa94
SH
967 lo_inode_put(lo, &inode);
968 lo_inode_put(lo, &dir);
7387863d 969 return saverr;
7c6b6602
DDAG
970}
971
972static void lo_lookup(fuse_req_t req, fuse_ino_t parent, const char *name)
973{
7387863d
DDAG
974 struct fuse_entry_param e;
975 int err;
976
d240314a
EG
977 fuse_log(FUSE_LOG_DEBUG, "lo_lookup(parent=%" PRIu64 ", name=%s)\n", parent,
978 name);
7387863d 979
25dae28c
SH
980 /*
981 * Don't use is_safe_path_component(), allow "." and ".." for NFS export
982 * support.
983 */
984 if (strchr(name, '/')) {
985 fuse_reply_err(req, EINVAL);
986 return;
987 }
988
7387863d
DDAG
989 err = lo_do_lookup(req, parent, name, &e);
990 if (err) {
991 fuse_reply_err(req, err);
992 } else {
993 fuse_reply_entry(req, &e);
994 }
7c6b6602
DDAG
995}
996
929cfb7a
VG
/*
 * On some archs, setres*id is limited to 2^16 but they
 * provide setres*id32 variants that allow 2^32.
 * Others just let setres*id do 2^32 anyway.
 * Pick the 32-bit variant whenever the architecture defines one.
 */
#if defined(SYS_setresgid32)
#define OURSYS_setresgid SYS_setresgid32
#else
#define OURSYS_setresgid SYS_setresgid
#endif

#if defined(SYS_setresuid32)
#define OURSYS_setresuid SYS_setresuid32
#else
#define OURSYS_setresuid SYS_setresuid
#endif
1013
1014/*
1015 * Change to uid/gid of caller so that file is created with
1016 * ownership of caller.
1017 * TODO: What about selinux context?
1018 */
1019static int lo_change_cred(fuse_req_t req, struct lo_cred *old)
1020{
1021 int res;
1022
1023 old->euid = geteuid();
1024 old->egid = getegid();
1025
1026 res = syscall(OURSYS_setresgid, -1, fuse_req_ctx(req)->gid, -1);
1027 if (res == -1) {
1028 return errno;
1029 }
1030
1031 res = syscall(OURSYS_setresuid, -1, fuse_req_ctx(req)->uid, -1);
1032 if (res == -1) {
1033 int errno_save = errno;
1034
1035 syscall(OURSYS_setresgid, -1, old->egid, -1);
1036 return errno_save;
1037 }
1038
1039 return 0;
1040}
1041
1042/* Regain Privileges */
1043static void lo_restore_cred(struct lo_cred *old)
1044{
1045 int res;
1046
1047 res = syscall(OURSYS_setresuid, -1, old->euid, -1);
1048 if (res == -1) {
1049 fuse_log(FUSE_LOG_ERR, "seteuid(%u): %m\n", old->euid);
1050 exit(1);
1051 }
1052
1053 res = syscall(OURSYS_setresgid, -1, old->egid, -1);
1054 if (res == -1) {
1055 fuse_log(FUSE_LOG_ERR, "setegid(%u): %m\n", old->egid);
1056 exit(1);
1057 }
1058}
1059
7c6b6602 1060static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent,
7387863d
DDAG
1061 const char *name, mode_t mode, dev_t rdev,
1062 const char *link)
7c6b6602 1063{
7387863d
DDAG
1064 int res;
1065 int saverr;
c241aa94 1066 struct lo_data *lo = lo_data(req);
92fb57b8 1067 struct lo_inode *dir;
7387863d 1068 struct fuse_entry_param e;
929cfb7a 1069 struct lo_cred old = {};
7c6b6602 1070
25dae28c
SH
1071 if (!is_safe_path_component(name)) {
1072 fuse_reply_err(req, EINVAL);
1073 return;
1074 }
1075
92fb57b8
SH
1076 dir = lo_inode(req, parent);
1077 if (!dir) {
1078 fuse_reply_err(req, EBADF);
1079 return;
1080 }
1081
7387863d 1082 saverr = ENOMEM;
7c6b6602 1083
929cfb7a
VG
1084 saverr = lo_change_cred(req, &old);
1085 if (saverr) {
1086 goto out;
1087 }
1088
7387863d 1089 res = mknod_wrapper(dir->fd, name, link, mode, rdev);
7c6b6602 1090
7387863d 1091 saverr = errno;
929cfb7a
VG
1092
1093 lo_restore_cred(&old);
1094
7387863d
DDAG
1095 if (res == -1) {
1096 goto out;
1097 }
7c6b6602 1098
7387863d
DDAG
1099 saverr = lo_do_lookup(req, parent, name, &e);
1100 if (saverr) {
1101 goto out;
1102 }
7c6b6602 1103
d240314a
EG
1104 fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent,
1105 name, (unsigned long long)e.ino);
7c6b6602 1106
7387863d 1107 fuse_reply_entry(req, &e);
c241aa94 1108 lo_inode_put(lo, &dir);
7387863d 1109 return;
7c6b6602
DDAG
1110
1111out:
c241aa94 1112 lo_inode_put(lo, &dir);
7387863d 1113 fuse_reply_err(req, saverr);
7c6b6602
DDAG
1114}
1115
7387863d
DDAG
1116static void lo_mknod(fuse_req_t req, fuse_ino_t parent, const char *name,
1117 mode_t mode, dev_t rdev)
7c6b6602 1118{
7387863d 1119 lo_mknod_symlink(req, parent, name, mode, rdev, NULL);
7c6b6602
DDAG
1120}
1121
1122static void lo_mkdir(fuse_req_t req, fuse_ino_t parent, const char *name,
7387863d 1123 mode_t mode)
7c6b6602 1124{
7387863d 1125 lo_mknod_symlink(req, parent, name, S_IFDIR | mode, 0, NULL);
7c6b6602
DDAG
1126}
1127
7387863d
DDAG
1128static void lo_symlink(fuse_req_t req, const char *link, fuse_ino_t parent,
1129 const char *name)
7c6b6602 1130{
7387863d 1131 lo_mknod_symlink(req, parent, name, S_IFLNK, 0, link);
7c6b6602
DDAG
1132}
1133
5fe319a7
MS
1134static int linkat_empty_nofollow(struct lo_data *lo, struct lo_inode *inode,
1135 int dfd, const char *name)
7c6b6602 1136{
7387863d 1137 int res;
5fe319a7
MS
1138 struct lo_inode *parent;
1139 char path[PATH_MAX];
7c6b6602 1140
7387863d
DDAG
1141 if (inode->is_symlink) {
1142 res = linkat(inode->fd, "", dfd, name, AT_EMPTY_PATH);
1143 if (res == -1 && (errno == ENOENT || errno == EINVAL)) {
1144 /* Sorry, no race free way to hard-link a symlink. */
5fe319a7
MS
1145 if (lo->norace) {
1146 errno = EPERM;
1147 } else {
1148 goto fallback;
1149 }
7387863d
DDAG
1150 }
1151 return res;
1152 }
7c6b6602 1153
9f59d175 1154 sprintf(path, "%i", inode->fd);
5fe319a7 1155
9f59d175 1156 return linkat(lo->proc_self_fd, path, dfd, name, AT_SYMLINK_FOLLOW);
5fe319a7
MS
1157
1158fallback:
1159 res = lo_parent_and_name(lo, inode, path, &parent);
1160 if (res != -1) {
1161 res = linkat(parent->fd, path, dfd, name, 0);
95d27157 1162 unref_inode_lolocked(lo, parent, 1);
c241aa94 1163 lo_inode_put(lo, &parent);
5fe319a7 1164 }
7c6b6602 1165
5fe319a7 1166 return res;
7c6b6602
DDAG
1167}
1168
1169static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent,
7387863d 1170 const char *name)
7c6b6602 1171{
7387863d
DDAG
1172 int res;
1173 struct lo_data *lo = lo_data(req);
c241aa94 1174 struct lo_inode *parent_inode;
92fb57b8 1175 struct lo_inode *inode;
7387863d
DDAG
1176 struct fuse_entry_param e;
1177 int saverr;
1178
25dae28c
SH
1179 if (!is_safe_path_component(name)) {
1180 fuse_reply_err(req, EINVAL);
1181 return;
1182 }
1183
c241aa94 1184 parent_inode = lo_inode(req, parent);
92fb57b8 1185 inode = lo_inode(req, ino);
c241aa94
SH
1186 if (!parent_inode || !inode) {
1187 errno = EBADF;
1188 goto out_err;
92fb57b8
SH
1189 }
1190
7387863d
DDAG
1191 memset(&e, 0, sizeof(struct fuse_entry_param));
1192 e.attr_timeout = lo->timeout;
1193 e.entry_timeout = lo->timeout;
1194
c241aa94 1195 res = linkat_empty_nofollow(lo, inode, parent_inode->fd, name);
7387863d
DDAG
1196 if (res == -1) {
1197 goto out_err;
1198 }
1199
1200 res = fstatat(inode->fd, "", &e.attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
1201 if (res == -1) {
1202 goto out_err;
1203 }
1204
1205 pthread_mutex_lock(&lo->mutex);
1222f015 1206 inode->nlookup++;
7387863d 1207 pthread_mutex_unlock(&lo->mutex);
92fb57b8 1208 e.ino = inode->fuse_ino;
7387863d 1209
d240314a
EG
1210 fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent,
1211 name, (unsigned long long)e.ino);
7387863d
DDAG
1212
1213 fuse_reply_entry(req, &e);
c241aa94
SH
1214 lo_inode_put(lo, &parent_inode);
1215 lo_inode_put(lo, &inode);
7387863d 1216 return;
7c6b6602
DDAG
1217
1218out_err:
7387863d 1219 saverr = errno;
c241aa94
SH
1220 lo_inode_put(lo, &parent_inode);
1221 lo_inode_put(lo, &inode);
7387863d 1222 fuse_reply_err(req, saverr);
7c6b6602
DDAG
1223}
1224
c241aa94 1225/* Increments nlookup and caller must release refcount using lo_inode_put() */
9257e514
MS
1226static struct lo_inode *lookup_name(fuse_req_t req, fuse_ino_t parent,
1227 const char *name)
1228{
1229 int res;
1230 struct stat attr;
1231
1232 res = fstatat(lo_fd(req, parent), name, &attr,
1233 AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
1234 if (res == -1) {
1235 return NULL;
1236 }
1237
1238 return lo_find(lo_data(req), &attr);
1239}
1240
7c6b6602
DDAG
1241static void lo_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name)
1242{
7387863d 1243 int res;
9257e514
MS
1244 struct lo_inode *inode;
1245 struct lo_data *lo = lo_data(req);
1246
25dae28c
SH
1247 if (!is_safe_path_component(name)) {
1248 fuse_reply_err(req, EINVAL);
1249 return;
1250 }
7c6b6602 1251
9257e514
MS
1252 inode = lookup_name(req, parent, name);
1253 if (!inode) {
1254 fuse_reply_err(req, EIO);
1255 return;
1256 }
1257
7387863d 1258 res = unlinkat(lo_fd(req, parent), name, AT_REMOVEDIR);
7c6b6602 1259
7387863d 1260 fuse_reply_err(req, res == -1 ? errno : 0);
9257e514 1261 unref_inode_lolocked(lo, inode, 1);
c241aa94 1262 lo_inode_put(lo, &inode);
7c6b6602
DDAG
1263}
1264
1265static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name,
7387863d
DDAG
1266 fuse_ino_t newparent, const char *newname,
1267 unsigned int flags)
7c6b6602 1268{
7387863d 1269 int res;
c241aa94
SH
1270 struct lo_inode *parent_inode;
1271 struct lo_inode *newparent_inode;
1272 struct lo_inode *oldinode = NULL;
1273 struct lo_inode *newinode = NULL;
9257e514 1274 struct lo_data *lo = lo_data(req);
7c6b6602 1275
25dae28c
SH
1276 if (!is_safe_path_component(name) || !is_safe_path_component(newname)) {
1277 fuse_reply_err(req, EINVAL);
1278 return;
1279 }
1280
c241aa94
SH
1281 parent_inode = lo_inode(req, parent);
1282 newparent_inode = lo_inode(req, newparent);
1283 if (!parent_inode || !newparent_inode) {
1284 fuse_reply_err(req, EBADF);
1285 goto out;
1286 }
1287
9257e514
MS
1288 oldinode = lookup_name(req, parent, name);
1289 newinode = lookup_name(req, newparent, newname);
1290
1291 if (!oldinode) {
1292 fuse_reply_err(req, EIO);
1293 goto out;
1294 }
1295
7387863d 1296 if (flags) {
f0ab7d6f 1297#ifndef SYS_renameat2
7387863d 1298 fuse_reply_err(req, EINVAL);
f0ab7d6f 1299#else
c241aa94
SH
1300 res = syscall(SYS_renameat2, parent_inode->fd, name,
1301 newparent_inode->fd, newname, flags);
f0ab7d6f
MS
1302 if (res == -1 && errno == ENOSYS) {
1303 fuse_reply_err(req, EINVAL);
1304 } else {
1305 fuse_reply_err(req, res == -1 ? errno : 0);
1306 }
1307#endif
9257e514 1308 goto out;
7387863d 1309 }
7c6b6602 1310
c241aa94 1311 res = renameat(parent_inode->fd, name, newparent_inode->fd, newname);
7c6b6602 1312
7387863d 1313 fuse_reply_err(req, res == -1 ? errno : 0);
9257e514
MS
1314out:
1315 unref_inode_lolocked(lo, oldinode, 1);
1316 unref_inode_lolocked(lo, newinode, 1);
c241aa94
SH
1317 lo_inode_put(lo, &oldinode);
1318 lo_inode_put(lo, &newinode);
1319 lo_inode_put(lo, &parent_inode);
1320 lo_inode_put(lo, &newparent_inode);
7c6b6602
DDAG
1321}
1322
1323static void lo_unlink(fuse_req_t req, fuse_ino_t parent, const char *name)
1324{
7387863d 1325 int res;
9257e514
MS
1326 struct lo_inode *inode;
1327 struct lo_data *lo = lo_data(req);
7c6b6602 1328
25dae28c
SH
1329 if (!is_safe_path_component(name)) {
1330 fuse_reply_err(req, EINVAL);
1331 return;
1332 }
1333
9257e514
MS
1334 inode = lookup_name(req, parent, name);
1335 if (!inode) {
1336 fuse_reply_err(req, EIO);
1337 return;
1338 }
1339
7387863d 1340 res = unlinkat(lo_fd(req, parent), name, 0);
7c6b6602 1341
7387863d 1342 fuse_reply_err(req, res == -1 ? errno : 0);
9257e514 1343 unref_inode_lolocked(lo, inode, 1);
c241aa94 1344 lo_inode_put(lo, &inode);
7c6b6602
DDAG
1345}
1346
95d27157
MS
1347static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode,
1348 uint64_t n)
7c6b6602 1349{
7387863d
DDAG
1350 if (!inode) {
1351 return;
1352 }
1353
1354 pthread_mutex_lock(&lo->mutex);
1222f015
SH
1355 assert(inode->nlookup >= n);
1356 inode->nlookup -= n;
1357 if (!inode->nlookup) {
92fb57b8 1358 lo_map_remove(&lo->ino_map, inode->fuse_ino);
bfc50a6e 1359 g_hash_table_remove(lo->inodes, &inode->key);
0e81414c
VG
1360 if (g_hash_table_size(inode->posix_locks)) {
1361 fuse_log(FUSE_LOG_WARNING, "Hash table is not empty\n");
1362 }
1363 g_hash_table_destroy(inode->posix_locks);
1364 pthread_mutex_destroy(&inode->plock_mutex);
7387863d 1365 pthread_mutex_unlock(&lo->mutex);
c241aa94
SH
1366
1367 /* Drop our refcount from lo_do_lookup() */
1368 lo_inode_put(lo, &inode);
7387863d
DDAG
1369 } else {
1370 pthread_mutex_unlock(&lo->mutex);
1371 }
7c6b6602
DDAG
1372}
1373
771b01eb
DDAG
1374static int unref_all_inodes_cb(gpointer key, gpointer value, gpointer user_data)
1375{
1376 struct lo_inode *inode = value;
1377 struct lo_data *lo = user_data;
1378
1222f015 1379 inode->nlookup = 0;
771b01eb
DDAG
1380 lo_map_remove(&lo->ino_map, inode->fuse_ino);
1381 close(inode->fd);
c241aa94 1382 lo_inode_put(lo, &inode); /* Drop our refcount from lo_do_lookup() */
771b01eb
DDAG
1383
1384 return TRUE;
1385}
1386
1387static void unref_all_inodes(struct lo_data *lo)
1388{
1389 pthread_mutex_lock(&lo->mutex);
1390 g_hash_table_foreach_remove(lo->inodes, unref_all_inodes_cb, lo);
1391 pthread_mutex_unlock(&lo->mutex);
1392}
1393
7c6b6602
DDAG
1394static void lo_forget_one(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup)
1395{
7387863d 1396 struct lo_data *lo = lo_data(req);
92fb57b8
SH
1397 struct lo_inode *inode;
1398
1399 inode = lo_inode(req, ino);
1400 if (!inode) {
1401 return;
1402 }
7c6b6602 1403
d240314a 1404 fuse_log(FUSE_LOG_DEBUG, " forget %lli %lli -%lli\n",
1222f015 1405 (unsigned long long)ino, (unsigned long long)inode->nlookup,
d240314a 1406 (unsigned long long)nlookup);
7c6b6602 1407
95d27157 1408 unref_inode_lolocked(lo, inode, nlookup);
c241aa94 1409 lo_inode_put(lo, &inode);
7c6b6602
DDAG
1410}
1411
1412static void lo_forget(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup)
1413{
7387863d
DDAG
1414 lo_forget_one(req, ino, nlookup);
1415 fuse_reply_none(req);
7c6b6602
DDAG
1416}
1417
1418static void lo_forget_multi(fuse_req_t req, size_t count,
7387863d 1419 struct fuse_forget_data *forgets)
7c6b6602 1420{
7387863d 1421 int i;
7c6b6602 1422
7387863d
DDAG
1423 for (i = 0; i < count; i++) {
1424 lo_forget_one(req, forgets[i].ino, forgets[i].nlookup);
1425 }
1426 fuse_reply_none(req);
7c6b6602
DDAG
1427}
1428
1429static void lo_readlink(fuse_req_t req, fuse_ino_t ino)
1430{
7387863d
DDAG
1431 char buf[PATH_MAX + 1];
1432 int res;
7c6b6602 1433
7387863d
DDAG
1434 res = readlinkat(lo_fd(req, ino), "", buf, sizeof(buf));
1435 if (res == -1) {
1436 return (void)fuse_reply_err(req, errno);
1437 }
7c6b6602 1438
7387863d
DDAG
1439 if (res == sizeof(buf)) {
1440 return (void)fuse_reply_err(req, ENAMETOOLONG);
1441 }
7c6b6602 1442
7387863d 1443 buf[res] = '\0';
7c6b6602 1444
7387863d 1445 fuse_reply_readlink(req, buf);
7c6b6602
DDAG
1446}
1447
1448struct lo_dirp {
acefdde7 1449 gint refcount;
7387863d
DDAG
1450 DIR *dp;
1451 struct dirent *entry;
1452 off_t offset;
7c6b6602
DDAG
1453};
1454
acefdde7
SH
1455static void lo_dirp_put(struct lo_dirp **dp)
1456{
1457 struct lo_dirp *d = *dp;
1458
1459 if (!d) {
1460 return;
1461 }
1462 *dp = NULL;
1463
1464 if (g_atomic_int_dec_and_test(&d->refcount)) {
1465 closedir(d->dp);
1466 free(d);
1467 }
1468}
1469
1470/* Call lo_dirp_put() on the return value when no longer needed */
b39bce12 1471static struct lo_dirp *lo_dirp(fuse_req_t req, struct fuse_file_info *fi)
7c6b6602 1472{
b39bce12
SH
1473 struct lo_data *lo = lo_data(req);
1474 struct lo_map_elem *elem;
1475
1476 pthread_mutex_lock(&lo->mutex);
1477 elem = lo_map_get(&lo->dirp_map, fi->fh);
acefdde7
SH
1478 if (elem) {
1479 g_atomic_int_inc(&elem->dirp->refcount);
1480 }
b39bce12
SH
1481 pthread_mutex_unlock(&lo->mutex);
1482 if (!elem) {
1483 return NULL;
1484 }
1485
1486 return elem->dirp;
7c6b6602
DDAG
1487}
1488
7387863d
DDAG
1489static void lo_opendir(fuse_req_t req, fuse_ino_t ino,
1490 struct fuse_file_info *fi)
7c6b6602 1491{
7387863d
DDAG
1492 int error = ENOMEM;
1493 struct lo_data *lo = lo_data(req);
1494 struct lo_dirp *d;
1495 int fd;
b39bce12 1496 ssize_t fh;
7387863d
DDAG
1497
1498 d = calloc(1, sizeof(struct lo_dirp));
1499 if (d == NULL) {
1500 goto out_err;
1501 }
1502
1503 fd = openat(lo_fd(req, ino), ".", O_RDONLY);
1504 if (fd == -1) {
1505 goto out_errno;
1506 }
1507
1508 d->dp = fdopendir(fd);
1509 if (d->dp == NULL) {
1510 goto out_errno;
1511 }
1512
1513 d->offset = 0;
1514 d->entry = NULL;
1515
acefdde7 1516 g_atomic_int_set(&d->refcount, 1); /* paired with lo_releasedir() */
b39bce12
SH
1517 pthread_mutex_lock(&lo->mutex);
1518 fh = lo_add_dirp_mapping(req, d);
1519 pthread_mutex_unlock(&lo->mutex);
1520 if (fh == -1) {
1521 goto out_err;
1522 }
1523
1524 fi->fh = fh;
7387863d
DDAG
1525 if (lo->cache == CACHE_ALWAYS) {
1526 fi->keep_cache = 1;
1527 }
1528 fuse_reply_open(req, fi);
1529 return;
7c6b6602
DDAG
1530
1531out_errno:
7387863d 1532 error = errno;
7c6b6602 1533out_err:
7387863d 1534 if (d) {
b39bce12
SH
1535 if (d->dp) {
1536 closedir(d->dp);
1537 }
7387863d
DDAG
1538 if (fd != -1) {
1539 close(fd);
1540 }
1541 free(d);
1542 }
1543 fuse_reply_err(req, error);
7c6b6602
DDAG
1544}
1545
7c6b6602 1546static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size,
7387863d 1547 off_t offset, struct fuse_file_info *fi, int plus)
7c6b6602 1548{
752272da 1549 struct lo_data *lo = lo_data(req);
acefdde7 1550 struct lo_dirp *d = NULL;
752272da 1551 struct lo_inode *dinode;
b39bce12 1552 char *buf = NULL;
7387863d
DDAG
1553 char *p;
1554 size_t rem = size;
752272da 1555 int err = EBADF;
7387863d 1556
752272da
SH
1557 dinode = lo_inode(req, ino);
1558 if (!dinode) {
1559 goto error;
1560 }
7387863d 1561
b39bce12
SH
1562 d = lo_dirp(req, fi);
1563 if (!d) {
1564 goto error;
1565 }
1566
752272da 1567 err = ENOMEM;
7387863d
DDAG
1568 buf = calloc(1, size);
1569 if (!buf) {
7387863d
DDAG
1570 goto error;
1571 }
1572 p = buf;
1573
1574 if (offset != d->offset) {
1575 seekdir(d->dp, offset);
1576 d->entry = NULL;
1577 d->offset = offset;
1578 }
1579 while (1) {
1580 size_t entsize;
1581 off_t nextoff;
1582 const char *name;
1583
1584 if (!d->entry) {
1585 errno = 0;
1586 d->entry = readdir(d->dp);
1587 if (!d->entry) {
1588 if (errno) { /* Error */
1589 err = errno;
1590 goto error;
1591 } else { /* End of stream */
1592 break;
1593 }
1594 }
1595 }
1596 nextoff = d->entry->d_off;
1597 name = d->entry->d_name;
752272da 1598
7387863d 1599 fuse_ino_t entry_ino = 0;
752272da
SH
1600 struct fuse_entry_param e = (struct fuse_entry_param){
1601 .attr.st_ino = d->entry->d_ino,
1602 .attr.st_mode = d->entry->d_type << 12,
1603 };
1604
1605 /* Hide root's parent directory */
1606 if (dinode == &lo->root && strcmp(name, "..") == 0) {
bfc50a6e 1607 e.attr.st_ino = lo->root.key.ino;
752272da
SH
1608 e.attr.st_mode = DT_DIR << 12;
1609 }
1610
7387863d 1611 if (plus) {
752272da 1612 if (!is_dot_or_dotdot(name)) {
7387863d
DDAG
1613 err = lo_do_lookup(req, ino, name, &e);
1614 if (err) {
1615 goto error;
1616 }
1617 entry_ino = e.ino;
1618 }
1619
1620 entsize = fuse_add_direntry_plus(req, p, rem, name, &e, nextoff);
1621 } else {
752272da 1622 entsize = fuse_add_direntry(req, p, rem, name, &e.attr, nextoff);
7387863d
DDAG
1623 }
1624 if (entsize > rem) {
1625 if (entry_ino != 0) {
1626 lo_forget_one(req, entry_ino, 1);
1627 }
1628 break;
1629 }
1630
1631 p += entsize;
1632 rem -= entsize;
1633
1634 d->entry = NULL;
1635 d->offset = nextoff;
1636 }
7c6b6602
DDAG
1637
1638 err = 0;
1639error:
acefdde7 1640 lo_dirp_put(&d);
c241aa94 1641 lo_inode_put(lo, &dinode);
acefdde7 1642
7387863d
DDAG
1643 /*
1644 * If there's an error, we can only signal it if we haven't stored
1645 * any entries yet - otherwise we'd end up with wrong lookup
1646 * counts for the entries that are already in the buffer. So we
1647 * return what we've collected until that point.
1648 */
1649 if (err && rem == size) {
1650 fuse_reply_err(req, err);
1651 } else {
1652 fuse_reply_buf(req, buf, size - rem);
1653 }
7c6b6602
DDAG
1654 free(buf);
1655}
1656
1657static void lo_readdir(fuse_req_t req, fuse_ino_t ino, size_t size,
7387863d 1658 off_t offset, struct fuse_file_info *fi)
7c6b6602 1659{
7387863d 1660 lo_do_readdir(req, ino, size, offset, fi, 0);
7c6b6602
DDAG
1661}
1662
1663static void lo_readdirplus(fuse_req_t req, fuse_ino_t ino, size_t size,
7387863d 1664 off_t offset, struct fuse_file_info *fi)
7c6b6602 1665{
7387863d 1666 lo_do_readdir(req, ino, size, offset, fi, 1);
7c6b6602
DDAG
1667}
1668
7387863d
DDAG
1669static void lo_releasedir(fuse_req_t req, fuse_ino_t ino,
1670 struct fuse_file_info *fi)
7c6b6602 1671{
b39bce12 1672 struct lo_data *lo = lo_data(req);
acefdde7 1673 struct lo_map_elem *elem;
b39bce12
SH
1674 struct lo_dirp *d;
1675
7387863d 1676 (void)ino;
b39bce12 1677
acefdde7
SH
1678 pthread_mutex_lock(&lo->mutex);
1679 elem = lo_map_get(&lo->dirp_map, fi->fh);
1680 if (!elem) {
1681 pthread_mutex_unlock(&lo->mutex);
b39bce12
SH
1682 fuse_reply_err(req, EBADF);
1683 return;
1684 }
1685
acefdde7 1686 d = elem->dirp;
b39bce12
SH
1687 lo_map_remove(&lo->dirp_map, fi->fh);
1688 pthread_mutex_unlock(&lo->mutex);
1689
acefdde7
SH
1690 lo_dirp_put(&d); /* paired with lo_opendir() */
1691
7387863d 1692 fuse_reply_err(req, 0);
7c6b6602
DDAG
1693}
1694
1695static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name,
7387863d 1696 mode_t mode, struct fuse_file_info *fi)
7c6b6602 1697{
7387863d
DDAG
1698 int fd;
1699 struct lo_data *lo = lo_data(req);
c241aa94 1700 struct lo_inode *parent_inode;
7387863d
DDAG
1701 struct fuse_entry_param e;
1702 int err;
929cfb7a 1703 struct lo_cred old = {};
7387863d 1704
d240314a
EG
1705 fuse_log(FUSE_LOG_DEBUG, "lo_create(parent=%" PRIu64 ", name=%s)\n", parent,
1706 name);
7387863d 1707
25dae28c
SH
1708 if (!is_safe_path_component(name)) {
1709 fuse_reply_err(req, EINVAL);
1710 return;
1711 }
1712
c241aa94
SH
1713 parent_inode = lo_inode(req, parent);
1714 if (!parent_inode) {
1715 fuse_reply_err(req, EBADF);
1716 return;
1717 }
1718
929cfb7a
VG
1719 err = lo_change_cred(req, &old);
1720 if (err) {
1721 goto out;
1722 }
1723
65da4539
VG
1724 /*
1725 * O_DIRECT in guest should not necessarily mean bypassing page
1726 * cache on host as well. If somebody needs that behavior, it
1727 * probably should be a configuration knob in daemon.
1728 */
1729 fi->flags &= ~O_DIRECT;
1730
c241aa94 1731 fd = openat(parent_inode->fd, name, (fi->flags | O_CREAT) & ~O_NOFOLLOW,
7387863d 1732 mode);
929cfb7a
VG
1733 err = fd == -1 ? errno : 0;
1734 lo_restore_cred(&old);
7387863d 1735
929cfb7a 1736 if (!err) {
73b4d19d
SH
1737 ssize_t fh;
1738
1739 pthread_mutex_lock(&lo->mutex);
1740 fh = lo_add_fd_mapping(req, fd);
1741 pthread_mutex_unlock(&lo->mutex);
1742 if (fh == -1) {
1743 close(fd);
c241aa94
SH
1744 err = ENOMEM;
1745 goto out;
73b4d19d
SH
1746 }
1747
1748 fi->fh = fh;
929cfb7a
VG
1749 err = lo_do_lookup(req, parent, name, &e);
1750 }
230e777b 1751 if (lo->cache == CACHE_NONE) {
7387863d
DDAG
1752 fi->direct_io = 1;
1753 } else if (lo->cache == CACHE_ALWAYS) {
1754 fi->keep_cache = 1;
1755 }
1756
929cfb7a 1757out:
c241aa94
SH
1758 lo_inode_put(lo, &parent_inode);
1759
7387863d
DDAG
1760 if (err) {
1761 fuse_reply_err(req, err);
1762 } else {
1763 fuse_reply_create(req, &e, fi);
1764 }
7c6b6602
DDAG
1765}
1766
0e81414c
VG
1767/* Should be called with inode->plock_mutex held */
1768static struct lo_inode_plock *lookup_create_plock_ctx(struct lo_data *lo,
1769 struct lo_inode *inode,
1770 uint64_t lock_owner,
1771 pid_t pid, int *err)
1772{
1773 struct lo_inode_plock *plock;
1774 char procname[64];
1775 int fd;
1776
1777 plock =
1778 g_hash_table_lookup(inode->posix_locks, GUINT_TO_POINTER(lock_owner));
1779
1780 if (plock) {
1781 return plock;
1782 }
1783
1784 plock = malloc(sizeof(struct lo_inode_plock));
1785 if (!plock) {
1786 *err = ENOMEM;
1787 return NULL;
1788 }
1789
1790 /* Open another instance of file which can be used for ofd locks. */
1791 sprintf(procname, "%i", inode->fd);
1792
1793 /* TODO: What if file is not writable? */
1794 fd = openat(lo->proc_self_fd, procname, O_RDWR);
1795 if (fd == -1) {
1796 *err = errno;
1797 free(plock);
1798 return NULL;
1799 }
1800
1801 plock->lock_owner = lock_owner;
1802 plock->fd = fd;
1803 g_hash_table_insert(inode->posix_locks, GUINT_TO_POINTER(plock->lock_owner),
1804 plock);
1805 return plock;
1806}
1807
1808static void lo_getlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
1809 struct flock *lock)
1810{
1811 struct lo_data *lo = lo_data(req);
1812 struct lo_inode *inode;
1813 struct lo_inode_plock *plock;
1814 int ret, saverr = 0;
1815
1816 fuse_log(FUSE_LOG_DEBUG,
1817 "lo_getlk(ino=%" PRIu64 ", flags=%d)"
1818 " owner=0x%lx, l_type=%d l_start=0x%lx"
1819 " l_len=0x%lx\n",
1820 ino, fi->flags, fi->lock_owner, lock->l_type, lock->l_start,
1821 lock->l_len);
1822
1823 inode = lo_inode(req, ino);
1824 if (!inode) {
1825 fuse_reply_err(req, EBADF);
1826 return;
1827 }
1828
1829 pthread_mutex_lock(&inode->plock_mutex);
1830 plock =
1831 lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret);
1832 if (!plock) {
c241aa94
SH
1833 saverr = ret;
1834 goto out;
0e81414c
VG
1835 }
1836
1837 ret = fcntl(plock->fd, F_OFD_GETLK, lock);
1838 if (ret == -1) {
1839 saverr = errno;
1840 }
c241aa94
SH
1841
1842out:
0e81414c 1843 pthread_mutex_unlock(&inode->plock_mutex);
c241aa94 1844 lo_inode_put(lo, &inode);
0e81414c
VG
1845
1846 if (saverr) {
1847 fuse_reply_err(req, saverr);
1848 } else {
1849 fuse_reply_lock(req, lock);
1850 }
1851}
1852
1853static void lo_setlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
1854 struct flock *lock, int sleep)
1855{
1856 struct lo_data *lo = lo_data(req);
1857 struct lo_inode *inode;
1858 struct lo_inode_plock *plock;
1859 int ret, saverr = 0;
1860
1861 fuse_log(FUSE_LOG_DEBUG,
1862 "lo_setlk(ino=%" PRIu64 ", flags=%d)"
1863 " cmd=%d pid=%d owner=0x%lx sleep=%d l_whence=%d"
1864 " l_start=0x%lx l_len=0x%lx\n",
1865 ino, fi->flags, lock->l_type, lock->l_pid, fi->lock_owner, sleep,
1866 lock->l_whence, lock->l_start, lock->l_len);
1867
1868 if (sleep) {
1869 fuse_reply_err(req, EOPNOTSUPP);
1870 return;
1871 }
1872
1873 inode = lo_inode(req, ino);
1874 if (!inode) {
1875 fuse_reply_err(req, EBADF);
1876 return;
1877 }
1878
1879 pthread_mutex_lock(&inode->plock_mutex);
1880 plock =
1881 lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret);
1882
1883 if (!plock) {
c241aa94
SH
1884 saverr = ret;
1885 goto out;
0e81414c
VG
1886 }
1887
1888 /* TODO: Is it alright to modify flock? */
1889 lock->l_pid = 0;
1890 ret = fcntl(plock->fd, F_OFD_SETLK, lock);
1891 if (ret == -1) {
1892 saverr = errno;
1893 }
c241aa94
SH
1894
1895out:
0e81414c 1896 pthread_mutex_unlock(&inode->plock_mutex);
c241aa94
SH
1897 lo_inode_put(lo, &inode);
1898
0e81414c
VG
1899 fuse_reply_err(req, saverr);
1900}
1901
7c6b6602 1902static void lo_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync,
7387863d 1903 struct fuse_file_info *fi)
7c6b6602 1904{
7387863d 1905 int res;
b39bce12
SH
1906 struct lo_dirp *d;
1907 int fd;
1908
7387863d 1909 (void)ino;
b39bce12
SH
1910
1911 d = lo_dirp(req, fi);
1912 if (!d) {
1913 fuse_reply_err(req, EBADF);
1914 return;
1915 }
1916
1917 fd = dirfd(d->dp);
7387863d
DDAG
1918 if (datasync) {
1919 res = fdatasync(fd);
1920 } else {
1921 res = fsync(fd);
1922 }
acefdde7
SH
1923
1924 lo_dirp_put(&d);
1925
7387863d 1926 fuse_reply_err(req, res == -1 ? errno : 0);
7c6b6602
DDAG
1927}
1928
1929static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
1930{
7387863d 1931 int fd;
73b4d19d 1932 ssize_t fh;
7387863d
DDAG
1933 char buf[64];
1934 struct lo_data *lo = lo_data(req);
1935
d240314a
EG
1936 fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d)\n", ino,
1937 fi->flags);
7387863d
DDAG
1938
1939 /*
1940 * With writeback cache, kernel may send read requests even
1941 * when userspace opened write-only
1942 */
1943 if (lo->writeback && (fi->flags & O_ACCMODE) == O_WRONLY) {
1944 fi->flags &= ~O_ACCMODE;
1945 fi->flags |= O_RDWR;
1946 }
1947
1948 /*
1949 * With writeback cache, O_APPEND is handled by the kernel.
1950 * This breaks atomicity (since the file may change in the
1951 * underlying filesystem, so that the kernel's idea of the
1952 * end of the file isn't accurate anymore). In this example,
1953 * we just accept that. A more rigorous filesystem may want
1954 * to return an error here
1955 */
1956 if (lo->writeback && (fi->flags & O_APPEND)) {
1957 fi->flags &= ~O_APPEND;
1958 }
1959
65da4539
VG
1960 /*
1961 * O_DIRECT in guest should not necessarily mean bypassing page
1962 * cache on host as well. If somebody needs that behavior, it
1963 * probably should be a configuration knob in daemon.
1964 */
1965 fi->flags &= ~O_DIRECT;
1966
9f59d175
SH
1967 sprintf(buf, "%i", lo_fd(req, ino));
1968 fd = openat(lo->proc_self_fd, buf, fi->flags & ~O_NOFOLLOW);
7387863d
DDAG
1969 if (fd == -1) {
1970 return (void)fuse_reply_err(req, errno);
1971 }
1972
73b4d19d
SH
1973 pthread_mutex_lock(&lo->mutex);
1974 fh = lo_add_fd_mapping(req, fd);
1975 pthread_mutex_unlock(&lo->mutex);
1976 if (fh == -1) {
1977 close(fd);
1978 fuse_reply_err(req, ENOMEM);
1979 return;
1980 }
1981
1982 fi->fh = fh;
230e777b 1983 if (lo->cache == CACHE_NONE) {
7387863d
DDAG
1984 fi->direct_io = 1;
1985 } else if (lo->cache == CACHE_ALWAYS) {
1986 fi->keep_cache = 1;
1987 }
1988 fuse_reply_open(req, fi);
7c6b6602
DDAG
1989}
1990
7387863d
DDAG
1991static void lo_release(fuse_req_t req, fuse_ino_t ino,
1992 struct fuse_file_info *fi)
7c6b6602 1993{
73b4d19d 1994 struct lo_data *lo = lo_data(req);
baed65c0
SH
1995 struct lo_map_elem *elem;
1996 int fd = -1;
73b4d19d 1997
7387863d 1998 (void)ino;
7c6b6602 1999
73b4d19d 2000 pthread_mutex_lock(&lo->mutex);
baed65c0
SH
2001 elem = lo_map_get(&lo->fd_map, fi->fh);
2002 if (elem) {
2003 fd = elem->fd;
2004 elem = NULL;
2005 lo_map_remove(&lo->fd_map, fi->fh);
2006 }
73b4d19d
SH
2007 pthread_mutex_unlock(&lo->mutex);
2008
2009 close(fd);
7387863d 2010 fuse_reply_err(req, 0);
7c6b6602
DDAG
2011}
2012
2013static void lo_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
2014{
7387863d
DDAG
2015 int res;
2016 (void)ino;
0e81414c
VG
2017 struct lo_inode *inode;
2018
2019 inode = lo_inode(req, ino);
2020 if (!inode) {
2021 fuse_reply_err(req, EBADF);
2022 return;
2023 }
2024
2025 /* An fd is going away. Cleanup associated posix locks */
2026 pthread_mutex_lock(&inode->plock_mutex);
2027 g_hash_table_remove(inode->posix_locks, GUINT_TO_POINTER(fi->lock_owner));
2028 pthread_mutex_unlock(&inode->plock_mutex);
2029
73b4d19d 2030 res = close(dup(lo_fi_fd(req, fi)));
c241aa94 2031 lo_inode_put(lo_data(req), &inode);
7387863d 2032 fuse_reply_err(req, res == -1 ? errno : 0);
7c6b6602
DDAG
2033}
2034
2035static void lo_fsync(fuse_req_t req, fuse_ino_t ino, int datasync,
7387863d 2036 struct fuse_file_info *fi)
7c6b6602 2037{
7387863d 2038 int res;
1b209805
VG
2039 int fd;
2040 char *buf;
2041
2042 fuse_log(FUSE_LOG_DEBUG, "lo_fsync(ino=%" PRIu64 ", fi=0x%p)\n", ino,
2043 (void *)fi);
2044
2045 if (!fi) {
9f59d175
SH
2046 struct lo_data *lo = lo_data(req);
2047
2048 res = asprintf(&buf, "%i", lo_fd(req, ino));
1b209805
VG
2049 if (res == -1) {
2050 return (void)fuse_reply_err(req, errno);
2051 }
2052
9f59d175 2053 fd = openat(lo->proc_self_fd, buf, O_RDWR);
1b209805
VG
2054 free(buf);
2055 if (fd == -1) {
2056 return (void)fuse_reply_err(req, errno);
2057 }
2058 } else {
73b4d19d 2059 fd = lo_fi_fd(req, fi);
1b209805
VG
2060 }
2061
7387863d 2062 if (datasync) {
1b209805 2063 res = fdatasync(fd);
7387863d 2064 } else {
1b209805
VG
2065 res = fsync(fd);
2066 }
2067 if (!fi) {
2068 close(fd);
7387863d
DDAG
2069 }
2070 fuse_reply_err(req, res == -1 ? errno : 0);
7c6b6602
DDAG
2071}
2072
7387863d
DDAG
2073static void lo_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t offset,
2074 struct fuse_file_info *fi)
7c6b6602 2075{
7387863d 2076 struct fuse_bufvec buf = FUSE_BUFVEC_INIT(size);
7c6b6602 2077
d240314a
EG
2078 fuse_log(FUSE_LOG_DEBUG,
2079 "lo_read(ino=%" PRIu64 ", size=%zd, "
2080 "off=%lu)\n",
2081 ino, size, (unsigned long)offset);
7c6b6602 2082
7387863d 2083 buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK;
73b4d19d 2084 buf.buf[0].fd = lo_fi_fd(req, fi);
7387863d 2085 buf.buf[0].pos = offset;
7c6b6602 2086
8c3fe75e 2087 fuse_reply_data(req, &buf);
7c6b6602
DDAG
2088}
2089
2090static void lo_write_buf(fuse_req_t req, fuse_ino_t ino,
7387863d
DDAG
2091 struct fuse_bufvec *in_buf, off_t off,
2092 struct fuse_file_info *fi)
7c6b6602 2093{
7387863d
DDAG
2094 (void)ino;
2095 ssize_t res;
2096 struct fuse_bufvec out_buf = FUSE_BUFVEC_INIT(fuse_buf_size(in_buf));
ee884652 2097 bool cap_fsetid_dropped = false;
7387863d
DDAG
2098
2099 out_buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK;
73b4d19d 2100 out_buf.buf[0].fd = lo_fi_fd(req, fi);
7387863d
DDAG
2101 out_buf.buf[0].pos = off;
2102
d240314a
EG
2103 fuse_log(FUSE_LOG_DEBUG,
2104 "lo_write_buf(ino=%" PRIu64 ", size=%zd, off=%lu)\n", ino,
2105 out_buf.buf[0].size, (unsigned long)off);
7387863d 2106
ee884652
VG
2107 /*
2108 * If kill_priv is set, drop CAP_FSETID which should lead to kernel
2109 * clearing setuid/setgid on file.
2110 */
2111 if (fi->kill_priv) {
2112 res = drop_effective_cap("FSETID", &cap_fsetid_dropped);
2113 if (res != 0) {
2114 fuse_reply_err(req, res);
2115 return;
2116 }
2117 }
2118
8c3fe75e 2119 res = fuse_buf_copy(&out_buf, in_buf);
7387863d
DDAG
2120 if (res < 0) {
2121 fuse_reply_err(req, -res);
2122 } else {
2123 fuse_reply_write(req, (size_t)res);
2124 }
ee884652
VG
2125
2126 if (cap_fsetid_dropped) {
2127 res = gain_effective_cap("FSETID");
2128 if (res) {
2129 fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n");
2130 }
2131 }
7c6b6602
DDAG
2132}
2133
2134static void lo_statfs(fuse_req_t req, fuse_ino_t ino)
2135{
7387863d
DDAG
2136 int res;
2137 struct statvfs stbuf;
2138
2139 res = fstatvfs(lo_fd(req, ino), &stbuf);
2140 if (res == -1) {
2141 fuse_reply_err(req, errno);
2142 } else {
2143 fuse_reply_statfs(req, &stbuf);
2144 }
7c6b6602
DDAG
2145}
2146
7387863d
DDAG
2147static void lo_fallocate(fuse_req_t req, fuse_ino_t ino, int mode, off_t offset,
2148 off_t length, struct fuse_file_info *fi)
7c6b6602 2149{
7387863d
DDAG
2150 int err = EOPNOTSUPP;
2151 (void)ino;
7c6b6602 2152
9776457c 2153#ifdef CONFIG_FALLOCATE
73b4d19d 2154 err = fallocate(lo_fi_fd(req, fi), mode, offset, length);
7387863d
DDAG
2155 if (err < 0) {
2156 err = errno;
2157 }
7c6b6602 2158
9776457c 2159#elif defined(CONFIG_POSIX_FALLOCATE)
7387863d
DDAG
2160 if (mode) {
2161 fuse_reply_err(req, EOPNOTSUPP);
2162 return;
2163 }
7c6b6602 2164
73b4d19d 2165 err = posix_fallocate(lo_fi_fd(req, fi), offset, length);
7c6b6602
DDAG
2166#endif
2167
7387863d 2168 fuse_reply_err(req, err);
7c6b6602
DDAG
2169}
2170
2171static void lo_flock(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
7387863d 2172 int op)
7c6b6602 2173{
7387863d
DDAG
2174 int res;
2175 (void)ino;
7c6b6602 2176
73b4d19d 2177 res = flock(lo_fi_fd(req, fi), op);
7c6b6602 2178
7387863d 2179 fuse_reply_err(req, res == -1 ? errno : 0);
7c6b6602
DDAG
2180}
2181
2182static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *name,
7387863d 2183 size_t size)
7c6b6602 2184{
9f59d175 2185 struct lo_data *lo = lo_data(req);
7387863d
DDAG
2186 char *value = NULL;
2187 char procname[64];
92fb57b8 2188 struct lo_inode *inode;
7387863d
DDAG
2189 ssize_t ret;
2190 int saverr;
9f59d175 2191 int fd = -1;
7387863d 2192
92fb57b8
SH
2193 inode = lo_inode(req, ino);
2194 if (!inode) {
2195 fuse_reply_err(req, EBADF);
2196 return;
2197 }
2198
7387863d
DDAG
2199 saverr = ENOSYS;
2200 if (!lo_data(req)->xattr) {
2201 goto out;
2202 }
2203
d240314a
EG
2204 fuse_log(FUSE_LOG_DEBUG, "lo_getxattr(ino=%" PRIu64 ", name=%s size=%zd)\n",
2205 ino, name, size);
7387863d
DDAG
2206
2207 if (inode->is_symlink) {
2208 /* Sorry, no race free way to getxattr on symlink. */
2209 saverr = EPERM;
2210 goto out;
2211 }
2212
9f59d175
SH
2213 sprintf(procname, "%i", inode->fd);
2214 fd = openat(lo->proc_self_fd, procname, O_RDONLY);
2215 if (fd < 0) {
2216 goto out_err;
2217 }
7387863d
DDAG
2218
2219 if (size) {
2220 value = malloc(size);
2221 if (!value) {
2222 goto out_err;
2223 }
2224
9f59d175 2225 ret = fgetxattr(fd, name, value, size);
7387863d
DDAG
2226 if (ret == -1) {
2227 goto out_err;
2228 }
2229 saverr = 0;
2230 if (ret == 0) {
2231 goto out;
2232 }
2233
2234 fuse_reply_buf(req, value, ret);
2235 } else {
9f59d175 2236 ret = fgetxattr(fd, name, NULL, 0);
7387863d
DDAG
2237 if (ret == -1) {
2238 goto out_err;
2239 }
2240
2241 fuse_reply_xattr(req, ret);
2242 }
7c6b6602 2243out_free:
7387863d 2244 free(value);
9f59d175
SH
2245
2246 if (fd >= 0) {
2247 close(fd);
2248 }
c241aa94
SH
2249
2250 lo_inode_put(lo, &inode);
7387863d 2251 return;
7c6b6602
DDAG
2252
2253out_err:
7387863d 2254 saverr = errno;
7c6b6602 2255out:
c241aa94 2256 lo_inode_put(lo, &inode);
7387863d
DDAG
2257 fuse_reply_err(req, saverr);
2258 goto out_free;
7c6b6602
DDAG
2259}
2260
2261static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size)
2262{
9f59d175 2263 struct lo_data *lo = lo_data(req);
7387863d
DDAG
2264 char *value = NULL;
2265 char procname[64];
92fb57b8 2266 struct lo_inode *inode;
7387863d
DDAG
2267 ssize_t ret;
2268 int saverr;
9f59d175 2269 int fd = -1;
7387863d 2270
92fb57b8
SH
2271 inode = lo_inode(req, ino);
2272 if (!inode) {
2273 fuse_reply_err(req, EBADF);
2274 return;
2275 }
2276
7387863d
DDAG
2277 saverr = ENOSYS;
2278 if (!lo_data(req)->xattr) {
2279 goto out;
2280 }
2281
d240314a
EG
2282 fuse_log(FUSE_LOG_DEBUG, "lo_listxattr(ino=%" PRIu64 ", size=%zd)\n", ino,
2283 size);
7387863d
DDAG
2284
2285 if (inode->is_symlink) {
2286 /* Sorry, no race free way to listxattr on symlink. */
2287 saverr = EPERM;
2288 goto out;
2289 }
2290
9f59d175
SH
2291 sprintf(procname, "%i", inode->fd);
2292 fd = openat(lo->proc_self_fd, procname, O_RDONLY);
2293 if (fd < 0) {
2294 goto out_err;
2295 }
7387863d
DDAG
2296
2297 if (size) {
2298 value = malloc(size);
2299 if (!value) {
2300 goto out_err;
2301 }
2302
9f59d175 2303 ret = flistxattr(fd, value, size);
7387863d
DDAG
2304 if (ret == -1) {
2305 goto out_err;
2306 }
2307 saverr = 0;
2308 if (ret == 0) {
2309 goto out;
2310 }
2311
2312 fuse_reply_buf(req, value, ret);
2313 } else {
9f59d175 2314 ret = flistxattr(fd, NULL, 0);
7387863d
DDAG
2315 if (ret == -1) {
2316 goto out_err;
2317 }
2318
2319 fuse_reply_xattr(req, ret);
2320 }
7c6b6602 2321out_free:
7387863d 2322 free(value);
9f59d175
SH
2323
2324 if (fd >= 0) {
2325 close(fd);
2326 }
c241aa94
SH
2327
2328 lo_inode_put(lo, &inode);
7387863d 2329 return;
7c6b6602
DDAG
2330
2331out_err:
7387863d 2332 saverr = errno;
7c6b6602 2333out:
c241aa94 2334 lo_inode_put(lo, &inode);
7387863d
DDAG
2335 fuse_reply_err(req, saverr);
2336 goto out_free;
7c6b6602
DDAG
2337}
2338
2339static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *name,
7387863d 2340 const char *value, size_t size, int flags)
7c6b6602 2341{
7387863d 2342 char procname[64];
9f59d175 2343 struct lo_data *lo = lo_data(req);
92fb57b8 2344 struct lo_inode *inode;
7387863d
DDAG
2345 ssize_t ret;
2346 int saverr;
9f59d175 2347 int fd = -1;
7c6b6602 2348
92fb57b8
SH
2349 inode = lo_inode(req, ino);
2350 if (!inode) {
2351 fuse_reply_err(req, EBADF);
2352 return;
2353 }
2354
7387863d
DDAG
2355 saverr = ENOSYS;
2356 if (!lo_data(req)->xattr) {
2357 goto out;
2358 }
7c6b6602 2359
d240314a
EG
2360 fuse_log(FUSE_LOG_DEBUG, "lo_setxattr(ino=%" PRIu64
2361 ", name=%s value=%s size=%zd)\n", ino, name, value, size);
7c6b6602 2362
7387863d
DDAG
2363 if (inode->is_symlink) {
2364 /* Sorry, no race free way to setxattr on symlink. */
2365 saverr = EPERM;
2366 goto out;
2367 }
7c6b6602 2368
9f59d175
SH
2369 sprintf(procname, "%i", inode->fd);
2370 fd = openat(lo->proc_self_fd, procname, O_RDWR);
2371 if (fd < 0) {
2372 saverr = errno;
2373 goto out;
2374 }
7c6b6602 2375
9f59d175 2376 ret = fsetxattr(fd, name, value, size, flags);
7387863d 2377 saverr = ret == -1 ? errno : 0;
7c6b6602
DDAG
2378
2379out:
9f59d175
SH
2380 if (fd >= 0) {
2381 close(fd);
2382 }
c241aa94
SH
2383
2384 lo_inode_put(lo, &inode);
7387863d 2385 fuse_reply_err(req, saverr);
7c6b6602
DDAG
2386}
2387
2388static void lo_removexattr(fuse_req_t req, fuse_ino_t ino, const char *name)
2389{
7387863d 2390 char procname[64];
9f59d175 2391 struct lo_data *lo = lo_data(req);
92fb57b8 2392 struct lo_inode *inode;
7387863d
DDAG
2393 ssize_t ret;
2394 int saverr;
9f59d175 2395 int fd = -1;
7c6b6602 2396
92fb57b8
SH
2397 inode = lo_inode(req, ino);
2398 if (!inode) {
2399 fuse_reply_err(req, EBADF);
2400 return;
2401 }
2402
7387863d
DDAG
2403 saverr = ENOSYS;
2404 if (!lo_data(req)->xattr) {
2405 goto out;
2406 }
7c6b6602 2407
d240314a
EG
2408 fuse_log(FUSE_LOG_DEBUG, "lo_removexattr(ino=%" PRIu64 ", name=%s)\n", ino,
2409 name);
7c6b6602 2410
7387863d
DDAG
2411 if (inode->is_symlink) {
2412 /* Sorry, no race free way to setxattr on symlink. */
2413 saverr = EPERM;
2414 goto out;
2415 }
7c6b6602 2416
9f59d175
SH
2417 sprintf(procname, "%i", inode->fd);
2418 fd = openat(lo->proc_self_fd, procname, O_RDWR);
2419 if (fd < 0) {
2420 saverr = errno;
2421 goto out;
2422 }
7c6b6602 2423
9f59d175 2424 ret = fremovexattr(fd, name);
7387863d 2425 saverr = ret == -1 ? errno : 0;
7c6b6602
DDAG
2426
2427out:
9f59d175
SH
2428 if (fd >= 0) {
2429 close(fd);
2430 }
c241aa94
SH
2431
2432 lo_inode_put(lo, &inode);
7387863d 2433 fuse_reply_err(req, saverr);
7c6b6602
DDAG
2434}
2435
2436#ifdef HAVE_COPY_FILE_RANGE
2437static void lo_copy_file_range(fuse_req_t req, fuse_ino_t ino_in, off_t off_in,
7387863d
DDAG
2438 struct fuse_file_info *fi_in, fuse_ino_t ino_out,
2439 off_t off_out, struct fuse_file_info *fi_out,
2440 size_t len, int flags)
7c6b6602 2441{
73b4d19d 2442 int in_fd, out_fd;
7387863d
DDAG
2443 ssize_t res;
2444
73b4d19d
SH
2445 in_fd = lo_fi_fd(req, fi_in);
2446 out_fd = lo_fi_fd(req, fi_out);
2447
2448 fuse_log(FUSE_LOG_DEBUG,
2449 "lo_copy_file_range(ino=%" PRIu64 "/fd=%d, "
2450 "off=%lu, ino=%" PRIu64 "/fd=%d, "
2451 "off=%lu, size=%zd, flags=0x%x)\n",
2452 ino_in, in_fd, off_in, ino_out, out_fd, off_out, len, flags);
7387863d 2453
73b4d19d 2454 res = copy_file_range(in_fd, &off_in, out_fd, &off_out, len, flags);
7387863d
DDAG
2455 if (res < 0) {
2456 fuse_reply_err(req, -errno);
2457 } else {
2458 fuse_reply_write(req, res);
2459 }
7c6b6602
DDAG
2460}
2461#endif
2462
2463static void lo_lseek(fuse_req_t req, fuse_ino_t ino, off_t off, int whence,
7387863d 2464 struct fuse_file_info *fi)
7c6b6602 2465{
7387863d
DDAG
2466 off_t res;
2467
2468 (void)ino;
73b4d19d 2469 res = lseek(lo_fi_fd(req, fi), off, whence);
7387863d
DDAG
2470 if (res != -1) {
2471 fuse_reply_lseek(req, res);
2472 } else {
2473 fuse_reply_err(req, errno);
2474 }
7c6b6602
DDAG
2475}
2476
771b01eb
DDAG
/*
 * FUSE destroy handler: @userdata is the struct lo_data of this session; hand
 * it to unref_all_inodes().
 */
static void lo_destroy(void *userdata)
{
    struct lo_data *session_data = userdata;

    unref_all_inodes(session_data);
}
2482
7c6b6602 2483static struct fuse_lowlevel_ops lo_oper = {
7387863d
DDAG
2484 .init = lo_init,
2485 .lookup = lo_lookup,
2486 .mkdir = lo_mkdir,
2487 .mknod = lo_mknod,
2488 .symlink = lo_symlink,
2489 .link = lo_link,
2490 .unlink = lo_unlink,
2491 .rmdir = lo_rmdir,
2492 .rename = lo_rename,
2493 .forget = lo_forget,
2494 .forget_multi = lo_forget_multi,
2495 .getattr = lo_getattr,
2496 .setattr = lo_setattr,
2497 .readlink = lo_readlink,
2498 .opendir = lo_opendir,
2499 .readdir = lo_readdir,
2500 .readdirplus = lo_readdirplus,
2501 .releasedir = lo_releasedir,
2502 .fsyncdir = lo_fsyncdir,
2503 .create = lo_create,
0e81414c
VG
2504 .getlk = lo_getlk,
2505 .setlk = lo_setlk,
7387863d
DDAG
2506 .open = lo_open,
2507 .release = lo_release,
2508 .flush = lo_flush,
2509 .fsync = lo_fsync,
2510 .read = lo_read,
2511 .write_buf = lo_write_buf,
2512 .statfs = lo_statfs,
2513 .fallocate = lo_fallocate,
2514 .flock = lo_flock,
2515 .getxattr = lo_getxattr,
2516 .listxattr = lo_listxattr,
2517 .setxattr = lo_setxattr,
2518 .removexattr = lo_removexattr,
7c6b6602 2519#ifdef HAVE_COPY_FILE_RANGE
7387863d 2520 .copy_file_range = lo_copy_file_range,
7c6b6602 2521#endif
7387863d 2522 .lseek = lo_lseek,
771b01eb 2523 .destroy = lo_destroy,
7c6b6602
DDAG
2524};
2525
45018fbb
SH
/*
 * Print the vhost-user.json backend program capabilities to stdout: a JSON
 * object whose only member is "type": "fs".
 */
static void print_capabilities(void)
{
    static const char capabilities[] =
        "{\n"
        " \"type\": \"fs\"\n"
        "}\n";

    fputs(capabilities, stdout);
}
2533
d74830d1 2534/*
8e1d4ef2 2535 * Move to a new mount, net, and pid namespaces to isolate this process.
d74830d1 2536 */
8e1d4ef2 2537static void setup_namespaces(struct lo_data *lo, struct fuse_session *se)
d74830d1 2538{
8e1d4ef2
SH
2539 pid_t child;
2540
2541 /*
2542 * Create a new pid namespace for *child* processes. We'll have to
2543 * fork in order to enter the new pid namespace. A new mount namespace
2544 * is also needed so that we can remount /proc for the new pid
2545 * namespace.
2546 *
2547 * Our UNIX domain sockets have been created. Now we can move to
2548 * an empty network namespace to prevent TCP/IP and other network
2549 * activity in case this process is compromised.
2550 */
2551 if (unshare(CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWNET) != 0) {
2552 fuse_log(FUSE_LOG_ERR, "unshare(CLONE_NEWPID | CLONE_NEWNS): %m\n");
2553 exit(1);
2554 }
2555
2556 child = fork();
2557 if (child < 0) {
2558 fuse_log(FUSE_LOG_ERR, "fork() failed: %m\n");
2559 exit(1);
2560 }
2561 if (child > 0) {
2562 pid_t waited;
2563 int wstatus;
2564
2565 /* The parent waits for the child */
2566 do {
2567 waited = waitpid(child, &wstatus, 0);
2568 } while (waited < 0 && errno == EINTR && !se->exited);
2569
2570 /* We were terminated by a signal, see fuse_signals.c */
2571 if (se->exited) {
2572 exit(0);
2573 }
2574
2575 if (WIFEXITED(wstatus)) {
2576 exit(WEXITSTATUS(wstatus));
2577 }
2578
2579 exit(1);
2580 }
2581
2582 /* Send us SIGTERM when the parent thread terminates, see prctl(2) */
2583 prctl(PR_SET_PDEATHSIG, SIGTERM);
2584
2585 /*
2586 * If the mounts have shared propagation then we want to opt out so our
2587 * mount changes don't affect the parent mount namespace.
2588 */
2589 if (mount(NULL, "/", NULL, MS_REC | MS_SLAVE, NULL) < 0) {
2590 fuse_log(FUSE_LOG_ERR, "mount(/, MS_REC|MS_SLAVE): %m\n");
2591 exit(1);
2592 }
2593
2594 /* The child must remount /proc to use the new pid namespace */
2595 if (mount("proc", "/proc", "proc",
2596 MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RELATIME, NULL) < 0) {
2597 fuse_log(FUSE_LOG_ERR, "mount(/proc): %m\n");
2598 exit(1);
2599 }
2600
2601 /* Now we can get our /proc/self/fd directory file descriptor */
2602 lo->proc_self_fd = open("/proc/self/fd", O_PATH);
2603 if (lo->proc_self_fd == -1) {
2604 fuse_log(FUSE_LOG_ERR, "open(/proc/self/fd, O_PATH): %m\n");
d74830d1
SH
2605 exit(1);
2606 }
2607}
2608
2405f3c0
DDAG
2609/*
2610 * Capture the capability state, we'll need to restore this for individual
2611 * threads later; see load_capng.
2612 */
2613static void setup_capng(void)
2614{
2615 /* Note this accesses /proc so has to happen before the sandbox */
2616 if (capng_get_caps_process()) {
2617 fuse_log(FUSE_LOG_ERR, "capng_get_caps_process\n");
2618 exit(1);
2619 }
2620 pthread_mutex_init(&cap.mutex, NULL);
2621 pthread_mutex_lock(&cap.mutex);
2622 cap.saved = capng_save_state();
2623 if (!cap.saved) {
2624 fuse_log(FUSE_LOG_ERR, "capng_save_state\n");
2625 exit(1);
2626 }
2627 pthread_mutex_unlock(&cap.mutex);
2628}
2629
2630static void cleanup_capng(void)
2631{
2632 free(cap.saved);
2633 cap.saved = NULL;
2634 pthread_mutex_destroy(&cap.mutex);
2635}
2636
2637
8e1d4ef2
SH
2638/*
2639 * Make the source directory our root so symlinks cannot escape and no other
2640 * files are accessible. Assumes unshare(CLONE_NEWNS) was already called.
2641 */
2642static void setup_mounts(const char *source)
5baa3b8e
SH
2643{
2644 int oldroot;
2645 int newroot;
2646
8e1d4ef2
SH
2647 if (mount(source, source, NULL, MS_BIND, NULL) < 0) {
2648 fuse_log(FUSE_LOG_ERR, "mount(%s, %s, MS_BIND): %m\n", source, source);
2649 exit(1);
2650 }
2651
2652 /* This magic is based on lxc's lxc_pivot_root() */
5baa3b8e
SH
2653 oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
2654 if (oldroot < 0) {
2655 fuse_log(FUSE_LOG_ERR, "open(/): %m\n");
2656 exit(1);
2657 }
2658
2659 newroot = open(source, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
2660 if (newroot < 0) {
2661 fuse_log(FUSE_LOG_ERR, "open(%s): %m\n", source);
2662 exit(1);
2663 }
2664
2665 if (fchdir(newroot) < 0) {
2666 fuse_log(FUSE_LOG_ERR, "fchdir(newroot): %m\n");
2667 exit(1);
2668 }
2669
2670 if (syscall(__NR_pivot_root, ".", ".") < 0) {
2671 fuse_log(FUSE_LOG_ERR, "pivot_root(., .): %m\n");
2672 exit(1);
2673 }
2674
2675 if (fchdir(oldroot) < 0) {
2676 fuse_log(FUSE_LOG_ERR, "fchdir(oldroot): %m\n");
2677 exit(1);
2678 }
2679
2680 if (mount("", ".", "", MS_SLAVE | MS_REC, NULL) < 0) {
2681 fuse_log(FUSE_LOG_ERR, "mount(., MS_SLAVE | MS_REC): %m\n");
2682 exit(1);
2683 }
2684
2685 if (umount2(".", MNT_DETACH) < 0) {
2686 fuse_log(FUSE_LOG_ERR, "umount2(., MNT_DETACH): %m\n");
2687 exit(1);
2688 }
2689
2690 if (fchdir(newroot) < 0) {
2691 fuse_log(FUSE_LOG_ERR, "fchdir(newroot): %m\n");
2692 exit(1);
2693 }
2694
2695 close(newroot);
2696 close(oldroot);
2697}
2698
5baa3b8e
SH
2699/*
2700 * Lock down this process to prevent access to other processes or files outside
2701 * source directory. This reduces the impact of arbitrary code execution bugs.
2702 */
f185621d
SH
2703static void setup_sandbox(struct lo_data *lo, struct fuse_session *se,
2704 bool enable_syslog)
5baa3b8e 2705{
8e1d4ef2
SH
2706 setup_namespaces(lo, se);
2707 setup_mounts(lo->source);
f185621d 2708 setup_seccomp(enable_syslog);
5baa3b8e
SH
2709}
2710
01a6dc95
SH
2711/* Raise the maximum number of open file descriptors */
2712static void setup_nofile_rlimit(void)
2713{
2714 const rlim_t max_fds = 1000000;
2715 struct rlimit rlim;
2716
2717 if (getrlimit(RLIMIT_NOFILE, &rlim) < 0) {
2718 fuse_log(FUSE_LOG_ERR, "getrlimit(RLIMIT_NOFILE): %m\n");
2719 exit(1);
2720 }
2721
2722 if (rlim.rlim_cur >= max_fds) {
2723 return; /* nothing to do */
2724 }
2725
2726 rlim.rlim_cur = max_fds;
2727 rlim.rlim_max = max_fds;
2728
2729 if (setrlimit(RLIMIT_NOFILE, &rlim) < 0) {
2730 /* Ignore SELinux denials */
2731 if (errno == EPERM) {
2732 return;
2733 }
2734
2735 fuse_log(FUSE_LOG_ERR, "setrlimit(RLIMIT_NOFILE): %m\n");
2736 exit(1);
2737 }
2738}
2739
f185621d
SH
2740static void log_func(enum fuse_log_level level, const char *fmt, va_list ap)
2741{
36f38469
MM
2742 g_autofree char *localfmt = NULL;
2743
d240314a
EG
2744 if (current_log_level < level) {
2745 return;
2746 }
2747
36f38469 2748 if (current_log_level == FUSE_LOG_DEBUG) {
50fb955a
MM
2749 if (!use_syslog) {
2750 localfmt = g_strdup_printf("[%" PRId64 "] [ID: %08ld] %s",
2751 get_clock(), syscall(__NR_gettid), fmt);
2752 } else {
2753 localfmt = g_strdup_printf("[ID: %08ld] %s", syscall(__NR_gettid),
2754 fmt);
2755 }
36f38469
MM
2756 fmt = localfmt;
2757 }
2758
f185621d
SH
2759 if (use_syslog) {
2760 int priority = LOG_ERR;
2761 switch (level) {
2762 case FUSE_LOG_EMERG:
2763 priority = LOG_EMERG;
2764 break;
2765 case FUSE_LOG_ALERT:
2766 priority = LOG_ALERT;
2767 break;
2768 case FUSE_LOG_CRIT:
2769 priority = LOG_CRIT;
2770 break;
2771 case FUSE_LOG_ERR:
2772 priority = LOG_ERR;
2773 break;
2774 case FUSE_LOG_WARNING:
2775 priority = LOG_WARNING;
2776 break;
2777 case FUSE_LOG_NOTICE:
2778 priority = LOG_NOTICE;
2779 break;
2780 case FUSE_LOG_INFO:
2781 priority = LOG_INFO;
2782 break;
2783 case FUSE_LOG_DEBUG:
2784 priority = LOG_DEBUG;
2785 break;
2786 }
2787 vsyslog(priority, fmt, ap);
2788 } else {
2789 vfprintf(stderr, fmt, ap);
2790 }
2791}
2792
3ca8a2b1
MS
2793static void setup_root(struct lo_data *lo, struct lo_inode *root)
2794{
2795 int fd, res;
2796 struct stat stat;
2797
2798 fd = open("/", O_PATH);
2799 if (fd == -1) {
2800 fuse_log(FUSE_LOG_ERR, "open(%s, O_PATH): %m\n", lo->source);
2801 exit(1);
2802 }
2803
2804 res = fstatat(fd, "", &stat, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
2805 if (res == -1) {
2806 fuse_log(FUSE_LOG_ERR, "fstatat(%s): %m\n", lo->source);
2807 exit(1);
2808 }
2809
2810 root->is_symlink = false;
2811 root->fd = fd;
bfc50a6e
MS
2812 root->key.ino = stat.st_ino;
2813 root->key.dev = stat.st_dev;
1222f015 2814 root->nlookup = 2;
c241aa94 2815 g_atomic_int_set(&root->refcount, 2);
3ca8a2b1
MS
2816}
2817
bfc50a6e
MS
2818static guint lo_key_hash(gconstpointer key)
2819{
2820 const struct lo_key *lkey = key;
2821
2822 return (guint)lkey->ino + (guint)lkey->dev;
2823}
2824
2825static gboolean lo_key_equal(gconstpointer a, gconstpointer b)
2826{
2827 const struct lo_key *la = a;
2828 const struct lo_key *lb = b;
2829
2830 return la->ino == lb->ino && la->dev == lb->dev;
2831}
2832
18a69cbb
LB
2833static void fuse_lo_data_cleanup(struct lo_data *lo)
2834{
2835 if (lo->inodes) {
2836 g_hash_table_destroy(lo->inodes);
2837 }
2838 lo_map_destroy(&lo->fd_map);
2839 lo_map_destroy(&lo->dirp_map);
2840 lo_map_destroy(&lo->ino_map);
2841
2842 if (lo->proc_self_fd >= 0) {
2843 close(lo->proc_self_fd);
2844 }
2845
2846 if (lo->root.fd >= 0) {
2847 close(lo->root.fd);
2848 }
2849
2850 free(lo->source);
2851}
2852
7c6b6602
DDAG
2853int main(int argc, char *argv[])
2854{
7387863d
DDAG
2855 struct fuse_args args = FUSE_ARGS_INIT(argc, argv);
2856 struct fuse_session *se;
2857 struct fuse_cmdline_opts opts;
9f59d175
SH
2858 struct lo_data lo = {
2859 .debug = 0,
2860 .writeback = 0,
0e81414c 2861 .posix_lock = 1,
9f59d175
SH
2862 .proc_self_fd = -1,
2863 };
92fb57b8 2864 struct lo_map_elem *root_elem;
7387863d
DDAG
2865 int ret = -1;
2866
2867 /* Don't mask creation mode, kernel already did that */
2868 umask(0);
2869
2870 pthread_mutex_init(&lo.mutex, NULL);
bfc50a6e 2871 lo.inodes = g_hash_table_new(lo_key_hash, lo_key_equal);
7387863d 2872 lo.root.fd = -1;
92fb57b8 2873 lo.root.fuse_ino = FUSE_ROOT_ID;
230e777b 2874 lo.cache = CACHE_AUTO;
7387863d 2875
92fb57b8
SH
2876 /*
2877 * Set up the ino map like this:
2878 * [0] Reserved (will not be used)
2879 * [1] Root inode
2880 */
2881 lo_map_init(&lo.ino_map);
2882 lo_map_reserve(&lo.ino_map, 0)->in_use = false;
2883 root_elem = lo_map_reserve(&lo.ino_map, lo.root.fuse_ino);
2884 root_elem->inode = &lo.root;
2885
b39bce12 2886 lo_map_init(&lo.dirp_map);
73b4d19d 2887 lo_map_init(&lo.fd_map);
b39bce12 2888
7387863d 2889 if (fuse_parse_cmdline(&args, &opts) != 0) {
c6de8046 2890 goto err_out1;
7387863d 2891 }
f185621d
SH
2892 fuse_set_log_func(log_func);
2893 use_syslog = opts.syslog;
2894 if (use_syslog) {
2895 openlog("virtiofsd", LOG_PID, LOG_DAEMON);
2896 }
c6de8046 2897
7387863d 2898 if (opts.show_help) {
67aab022 2899 printf("usage: %s [options]\n\n", argv[0]);
7387863d 2900 fuse_cmdline_help();
4ff075f7 2901 printf(" -o source=PATH shared directory tree\n");
7387863d
DDAG
2902 fuse_lowlevel_help();
2903 ret = 0;
2904 goto err_out1;
2905 } else if (opts.show_version) {
2906 fuse_lowlevel_version();
2907 ret = 0;
2908 goto err_out1;
45018fbb
SH
2909 } else if (opts.print_capabilities) {
2910 print_capabilities();
2911 ret = 0;
2912 goto err_out1;
7387863d
DDAG
2913 }
2914
7387863d 2915 if (fuse_opt_parse(&args, &lo, lo_opts, NULL) == -1) {
c6de8046 2916 goto err_out1;
7387863d
DDAG
2917 }
2918
d240314a
EG
2919 /*
2920 * log_level is 0 if not configured via cmd options (0 is LOG_EMERG,
2921 * and we don't use this log level).
2922 */
2923 if (opts.log_level != 0) {
2924 current_log_level = opts.log_level;
2925 }
7387863d 2926 lo.debug = opts.debug;
d240314a
EG
2927 if (lo.debug) {
2928 current_log_level = FUSE_LOG_DEBUG;
2929 }
7387863d
DDAG
2930 if (lo.source) {
2931 struct stat stat;
2932 int res;
2933
2934 res = lstat(lo.source, &stat);
2935 if (res == -1) {
2936 fuse_log(FUSE_LOG_ERR, "failed to stat source (\"%s\"): %m\n",
2937 lo.source);
2938 exit(1);
2939 }
2940 if (!S_ISDIR(stat.st_mode)) {
2941 fuse_log(FUSE_LOG_ERR, "source is not a directory\n");
2942 exit(1);
2943 }
7387863d 2944 } else {
eb68a33b 2945 lo.source = strdup("/");
7387863d 2946 }
7387863d
DDAG
2947 if (!lo.timeout_set) {
2948 switch (lo.cache) {
230e777b 2949 case CACHE_NONE:
7387863d
DDAG
2950 lo.timeout = 0.0;
2951 break;
2952
230e777b 2953 case CACHE_AUTO:
7387863d
DDAG
2954 lo.timeout = 1.0;
2955 break;
2956
2957 case CACHE_ALWAYS:
2958 lo.timeout = 86400.0;
2959 break;
2960 }
2961 } else if (lo.timeout < 0) {
2962 fuse_log(FUSE_LOG_ERR, "timeout is negative (%lf)\n", lo.timeout);
2963 exit(1);
2964 }
2965
7387863d
DDAG
2966 se = fuse_session_new(&args, &lo_oper, sizeof(lo_oper), &lo);
2967 if (se == NULL) {
2968 goto err_out1;
2969 }
2970
2971 if (fuse_set_signal_handlers(se) != 0) {
2972 goto err_out2;
2973 }
2974
67aab022 2975 if (fuse_session_mount(se) != 0) {
7387863d
DDAG
2976 goto err_out3;
2977 }
2978
2979 fuse_daemonize(opts.foreground);
2980
01a6dc95
SH
2981 setup_nofile_rlimit();
2982
2405f3c0
DDAG
2983 /* Must be before sandbox since it wants /proc */
2984 setup_capng();
2985
f185621d 2986 setup_sandbox(&lo, se, opts.syslog);
5baa3b8e 2987
3ca8a2b1 2988 setup_root(&lo, &lo.root);
7387863d 2989 /* Block until ctrl+c or fusermount -u */
f6f3573c 2990 ret = virtio_loop(se);
7387863d
DDAG
2991
2992 fuse_session_unmount(se);
2405f3c0 2993 cleanup_capng();
7c6b6602 2994err_out3:
7387863d 2995 fuse_remove_signal_handlers(se);
7c6b6602 2996err_out2:
7387863d 2997 fuse_session_destroy(se);
7c6b6602 2998err_out1:
7387863d 2999 fuse_opt_free_args(&args);
7c6b6602 3000
18a69cbb 3001 fuse_lo_data_cleanup(&lo);
eb68a33b 3002
7387863d 3003 return ret ? 1 : 0;
7c6b6602 3004}