[mirror_ubuntu-jammy-kernel.git] / kernel / bpf / inode.c

// SPDX-License-Identifier: GPL-2.0-only
/*
 * Minimal file system backend for holding eBPF maps and programs,
 * used by bpf(2) object pinning.
 *
 * Authors:
 *
 *	Daniel Borkmann <daniel@iogearbox.net>
 */

#include <linux/init.h>
#include <linux/magic.h>
#include <linux/major.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/fs.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/kdev_t.h>
#include <linux/filter.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>

/* Kind of BPF object an inode in this filesystem (or a user fd) refers to. */
enum bpf_type {
	BPF_TYPE_UNSPEC	= 0,	/* neither a program nor a map */
	BPF_TYPE_PROG	= 1,
	BPF_TYPE_MAP	= 2,
};

/*
 * Dispatch on @type and call the matching "inc" helper on @raw:
 * bpf_prog_inc() for programs, bpf_map_inc_with_uref() for maps.
 * Any other @type only triggers a one-time warning.
 *
 * Returns @raw unchanged in every case.
 */
static void *bpf_any_get(void *raw, enum bpf_type type)
{
	if (type == BPF_TYPE_PROG)
		bpf_prog_inc(raw);
	else if (type == BPF_TYPE_MAP)
		bpf_map_inc_with_uref(raw);
	else
		WARN_ON_ONCE(1);

	return raw;
}

/*
 * Counterpart of bpf_any_get(): dispatch on @type and call bpf_prog_put()
 * for programs or bpf_map_put_with_uref() for maps on @raw.  Any other
 * @type only triggers a one-time warning.
 */
static void bpf_any_put(void *raw, enum bpf_type type)
{
	if (type == BPF_TYPE_PROG)
		bpf_prog_put(raw);
	else if (type == BPF_TYPE_MAP)
		bpf_map_put_with_uref(raw);
	else
		WARN_ON_ONCE(1);
}

/*
 * Resolve user fd @ufd to a BPF object, trying a map first and falling
 * back to a program when the map lookup returns an error pointer.
 *
 * @type is set to BPF_TYPE_MAP when the map lookup succeeded and to
 * BPF_TYPE_PROG otherwise (even if the program lookup fails as well).
 *
 * Returns the object pointer, or whatever error pointer bpf_prog_get()
 * produced when neither lookup worked.
 */
static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type)
{
	void *obj = bpf_map_get_with_uref(ufd);

	if (!IS_ERR(obj)) {
		*type = BPF_TYPE_MAP;
		return obj;
	}

	*type = BPF_TYPE_PROG;
	return bpf_prog_get(ufd);
}

/* Tentative definition; the initialised table appears further down. */
static const struct inode_operations bpf_dir_iops;

/*
 * Deliberately empty operation tables.  Within this file they are only
 * compared by address against inode->i_op (see bpf_inode_type()) to tell
 * whether an inode holds a program or a map.
 */
static const struct inode_operations bpf_prog_iops = { };
static const struct inode_operations bpf_map_iops  = { };

/*
 * Allocate and initialise a new inode on @sb.
 *
 * @sb:   super block the inode is created on
 * @dir:  parent directory, passed on to inode_init_owner()
 * @mode: file type and permission bits; only directories, regular files
 *        and symlinks are accepted
 *
 * Returns the new inode, ERR_PTR(-EINVAL) for any other file type, or
 * ERR_PTR(-ENOSPC) when new_inode() fails.
 */
static struct inode *bpf_get_inode(struct super_block *sb,
				   const struct inode *dir,
				   umode_t mode)
{
	const umode_t fmt = mode & S_IFMT;
	struct inode *inode;

	if (fmt != S_IFDIR && fmt != S_IFREG && fmt != S_IFLNK)
		return ERR_PTR(-EINVAL);

	inode = new_inode(sb);
	if (!inode)
		return ERR_PTR(-ENOSPC);

	inode->i_ino = get_next_ino();

	/* All three timestamps start out as the same "now". */
	inode->i_ctime = inode->i_mtime = inode->i_atime = current_time(inode);

	inode_init_owner(inode, dir, mode);

	return inode;
}

/*
 * Classify @inode by comparing its i_op pointer with the two tag tables.
 *
 * On success *@type is BPF_TYPE_PROG or BPF_TYPE_MAP and 0 is returned.
 * For any other inode *@type is BPF_TYPE_UNSPEC and -EACCES is returned.
 */
static int bpf_inode_type(const struct inode *inode, enum bpf_type *type)
{
	const struct inode_operations *ops = inode->i_op;

	if (ops == &bpf_prog_iops) {
		*type = BPF_TYPE_PROG;
		return 0;
	}

	if (ops == &bpf_map_iops) {
		*type = BPF_TYPE_MAP;
		return 0;
	}

	*type = BPF_TYPE_UNSPEC;
	return -EACCES;
}

/*
 * Common tail of every creation path in this file: bind @inode to
 * @dentry, take an additional dentry reference with dget(), and stamp
 * the parent directory @dir with the current time as both mtime and
 * ctime.
 */
static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode,
				struct inode *dir)
{
	d_instantiate(dentry, inode);
	dget(dentry);

	dir->i_ctime = dir->i_mtime = current_time(dir);
}

/*
 * ->mkdir() for bpffs directories: create a directory inode under @dir
 * that uses bpf_dir_iops and simple_dir_operations, then attach it to
 * @dentry.
 *
 * Returns 0 on success or the error from bpf_get_inode().
 */
static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	struct inode *child = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR);

	if (IS_ERR(child))
		return PTR_ERR(child);

	child->i_op = &bpf_dir_iops;
	child->i_fop = &simple_dir_operations;

	/* One extra link on the new directory and one on its parent. */
	inc_nlink(child);
	inc_nlink(dir);

	bpf_dentry_finalize(dentry, child, dir);
	return 0;
}

/*
 * Per-open iteration state for reading a pinned map through seq_file;
 * stored in seq_file->private by bpffs_map_open().
 */
struct map_iter {
	void *key;	/* map->key_size byte buffer holding the current key */
	bool done;	/* set once map_get_next_key() has reported failure */
};

/* Fetch the map_iter stored in the private pointer of seq_file @m. */
static struct map_iter *map_iter(struct seq_file *m)
{
	struct map_iter *iter = m->private;

	return iter;
}

static struct bpf_map *seq_file_to_map(struct seq_file *m)
{
	return file_inode(m->file)->i_private;
}

/* Release @iter and its key buffer; a NULL @iter is accepted and ignored. */
static void map_iter_free(struct map_iter *iter)
{
	if (!iter)
		return;

	kfree(iter->key);
	kfree(iter);
}

/*
 * Allocate a zeroed map_iter together with a zeroed key buffer of
 * @map->key_size bytes.
 *
 * Returns the iterator, or NULL if either allocation fails (nothing is
 * leaked in that case).
 */
static struct map_iter *map_iter_alloc(struct bpf_map *map)
{
	struct map_iter *iter;

	iter = kzalloc(sizeof(*iter), GFP_KERNEL | __GFP_NOWARN);
	if (!iter)
		return NULL;

	iter->key = kzalloc(map->key_size, GFP_KERNEL | __GFP_NOWARN);
	if (!iter->key) {
		map_iter_free(iter);
		return NULL;
	}

	return iter;
}

/*
 * seq_file ->next() callback: advance to the key following the one held
 * in the per-open map_iter buffer.
 *
 * @m:   seq_file whose private pointer is the map_iter for this open file
 * @v:   value returned by the previous ->start()/->next() call;
 *       SEQ_START_TOKEN means no key has been fetched yet
 * @pos: sequence position; advanced by one on every call
 *
 * Returns the iterator's key buffer, now holding the next key, or NULL
 * once the iteration is over (map_get_next_key() failed now or on an
 * earlier call).
 */
static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct bpf_map *map = seq_file_to_map(m);
	struct map_iter *iter = map_iter(m);
	void *key = iter->key;
	void *prev_key;

	/*
	 * ->next() is expected to move the position index forward on every
	 * call, including the ones that end the sequence, so bump it before
	 * any of the early exits below.
	 */
	(*pos)++;

	if (iter->done)
		return NULL;

	/* A NULL previous key asks the map for its first key. */
	if (unlikely(v == SEQ_START_TOKEN))
		prev_key = NULL;
	else
		prev_key = key;

	/*
	 * NOTE(review): map_get_next_key() is called without taking any lock
	 * here - confirm whether map implementations expect the caller to
	 * hold rcu_read_lock().
	 */
	if (map->ops->map_get_next_key(map, prev_key, key)) {
		iter->done = true;
		return NULL;
	}

	return key;
}

/*
 * seq_file ->start() callback.
 *
 * Returns NULL when the iteration already finished, SEQ_START_TOKEN at
 * position 0, and the iterator's key buffer at any later position.
 */
static void *map_seq_start(struct seq_file *m, loff_t *pos)
{
	struct map_iter *iter = map_iter(m);

	if (iter->done)
		return NULL;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	return iter->key;
}

/* seq_file ->stop() callback; intentionally does nothing. */
static void map_seq_stop(struct seq_file *m, void *v)
{
}

/*
 * seq_file ->show() callback.
 *
 * For SEQ_START_TOKEN a two-line warning header is emitted; for every
 * other @v the map's map_seq_show_elem() is asked to print the element
 * for the key currently held in the iterator.  Always returns 0.
 */
static int map_seq_show(struct seq_file *m, void *v)
{
	struct bpf_map *map;

	if (unlikely(v == SEQ_START_TOKEN)) {
		seq_puts(m, "# WARNING!! The output is for debug purpose only\n");
		seq_puts(m, "# WARNING!! The output format will change\n");
		return 0;
	}

	map = seq_file_to_map(m);
	map->ops->map_seq_show_elem(map, map_iter(m)->key, m);

	return 0;
}

/* seq_file iteration callbacks handed to seq_open() by bpffs_map_open(). */
static const struct seq_operations bpffs_map_seq_ops = {
	.start	= map_seq_start,
	.next	= map_seq_next,
	.show	= map_seq_show,
	.stop	= map_seq_stop,
};

/*
 * ->open() for pinned maps that can be dumped: allocate the per-open
 * iterator, open the seq_file and park the iterator in its private
 * pointer.
 *
 * Returns 0 on success, -ENOMEM if the iterator cannot be allocated, or
 * the error from seq_open() (the iterator is freed again in that case).
 */
static int bpffs_map_open(struct inode *inode, struct file *file)
{
	struct map_iter *iter = map_iter_alloc(inode->i_private);
	struct seq_file *seq;
	int err;

	if (!iter)
		return -ENOMEM;

	err = seq_open(file, &bpffs_map_seq_ops);
	if (err)
		goto free_iter;

	seq = file->private_data;
	seq->private = iter;

	return 0;

free_iter:
	map_iter_free(iter);
	return err;
}

static int bpffs_map_release(struct inode *inode, struct file *file)
{
	struct seq_file *m = file->private_data;

	map_iter_free(map_iter(m));

	return seq_release(inode, file);
}

/* bpffs_map_fops should only implement the basic
 * read operation for a BPF map.  The purpose is to
 * provide a simple, intuitive way for users to do
 * "cat bpffs/pathto/a-pinned-map".
 *
 * Other operations (e.g. write, lookup...) should be implemented by
 * userspace tools (e.g. bpftool) through
 * BPF_OBJ_GET_INFO_BY_FD and the map's lookup/update
 * interface.
 */
static const struct file_operations bpffs_map_fops = {
	.open		= bpffs_map_open,
	.read		= seq_read,
	.release	= bpffs_map_release,
};

/*
 * ->open() used through bpffs_obj_fops: every attempt to open such a file
 * fails with -EIO.
 */
static int bpffs_obj_open(struct inode *inode, struct file *file)
{
	return -EIO;
}

/*
 * File operations for program inodes and for map inodes whose map does
 * not support seq_show (see bpf_mkprog() and bpf_mkmap()).
 */
static const struct file_operations bpffs_obj_fops = {
	.open		= bpffs_obj_open,
};

/*
 * Create the inode for a pinned object below the parent of @dentry.
 *
 * @dentry: dentry the new inode is attached to
 * @mode:   file type and permission bits for the inode
 * @raw:    object pointer stored in the inode's i_private
 * @iops:   inode operations; doubles as the type tag checked by
 *          bpf_inode_type()
 * @fops:   file operations for the inode
 *
 * Returns 0 on success or the error from bpf_get_inode().
 */
static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw,
			 const struct inode_operations *iops,
			 const struct file_operations *fops)
{
	struct inode *parent = dentry->d_parent->d_inode;
	struct inode *obj;

	obj = bpf_get_inode(parent->i_sb, parent, mode);
	if (IS_ERR(obj))
		return PTR_ERR(obj);

	obj->i_private = raw;
	obj->i_fop = fops;
	obj->i_op = iops;

	bpf_dentry_finalize(dentry, obj, parent);
	return 0;
}

/*
 * vfs_mkobj() callback used when pinning a program: the inode is tagged
 * with bpf_prog_iops and gets bpffs_obj_fops as file operations.
 */
static int bpf_mkprog(struct dentry *dentry, umode_t mode, void *arg)
{
	const struct inode_operations *iops = &bpf_prog_iops;
	const struct file_operations *fops = &bpffs_obj_fops;

	return bpf_mkobj_ops(dentry, mode, arg, iops, fops);
}

/*
 * vfs_mkobj() callback used when pinning a map: the inode is tagged with
 * bpf_map_iops.  Maps for which bpf_map_support_seq_show() is true get
 * bpffs_map_fops; all others get bpffs_obj_fops.
 */
static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg)
{
	const struct file_operations *fops = &bpffs_obj_fops;
	struct bpf_map *map = arg;

	if (bpf_map_support_seq_show(map))
		fops = &bpffs_map_fops;

	return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops, fops);
}

/*
 * ->lookup() for bpffs directories.  Names containing a dot are refused
 * with -EPERM; everything else is delegated to simple_lookup().
 */
static struct dentry *
bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags)
{
	/* Dots in names (e.g. "/sys/fs/bpf/foo.bar") are reserved for future
	 * extensions.
	 */
	if (!strchr(dentry->d_name.name, '.'))
		return simple_lookup(dir, dentry, flags);

	return ERR_PTR(-EPERM);
}

/*
 * ->symlink() for bpffs directories: duplicate @target, create a symlink
 * inode under @dir that points at the copy, and attach it to @dentry.
 *
 * Returns 0 on success, -ENOMEM if @target cannot be duplicated, or the
 * error from bpf_get_inode() (the copy is freed again in that case).
 */
static int bpf_symlink(struct inode *dir, struct dentry *dentry,
		       const char *target)
{
	struct inode *inode;
	char *link;
	int err;

	link = kstrdup(target, GFP_USER | __GFP_NOWARN);
	if (!link)
		return -ENOMEM;

	inode = bpf_get_inode(dir->i_sb, dir, S_IRWXUGO | S_IFLNK);
	if (IS_ERR(inode)) {
		err = PTR_ERR(inode);
		goto free_link;
	}

	inode->i_op = &simple_symlink_inode_operations;
	/* The copy now belongs to the inode; bpf_free_inode() kfree()s it. */
	inode->i_link = link;

	bpf_dentry_finalize(dentry, inode, dir);
	return 0;

free_link:
	kfree(link);
	return err;
}

/*
 * Inode operations for bpffs directories.  lookup, mkdir and symlink are
 * implemented in this file; the remaining callbacks are simple_*()
 * helpers defined elsewhere.  bpf_obj_do_pin() also compares a
 * directory's i_op against this table to verify that the pin target
 * lives in a bpffs directory.
 */
static const struct inode_operations bpf_dir_iops = {
	.lookup		= bpf_lookup,
	.mkdir		= bpf_mkdir,
	.symlink	= bpf_symlink,
	.rmdir		= simple_rmdir,
	.rename		= simple_rename,
	.link		= simple_link,
	.unlink		= simple_unlink,
};

/*
 * Create the bpffs file that pins object @raw of kind @type at @pathname.
 *
 * The file is a regular file with owner read/write permission filtered
 * through the current umask.  The parent directory must use bpf_dir_iops,
 * otherwise -EPERM is returned; the same error is used for an unknown
 * @type.  Errors from kern_path_create(), security_path_mknod() and
 * vfs_mkobj() are passed through.
 *
 * Returns 0 on success or a negative errno.
 */
static int bpf_obj_do_pin(const struct filename *pathname, void *raw,
			  enum bpf_type type)
{
	struct dentry *dentry;
	struct path path;
	umode_t mode;
	int ret;

	dentry = kern_path_create(AT_FDCWD, pathname->name, &path, 0);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);

	mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());

	ret = security_path_mknod(&path, dentry, mode, 0);
	if (ret)
		goto out;

	/* From here on -EPERM is the answer unless a branch below succeeds. */
	ret = -EPERM;
	if (d_inode(path.dentry)->i_op != &bpf_dir_iops)
		goto out;

	if (type == BPF_TYPE_PROG)
		ret = vfs_mkobj(dentry, mode, bpf_mkprog, raw);
	else if (type == BPF_TYPE_MAP)
		ret = vfs_mkobj(dentry, mode, bpf_mkmap, raw);
out:
	done_path_create(&path, dentry);
	return ret;
}

/*
 * Pin the BPF object behind user fd @ufd at the user-supplied @pathname.
 *
 * The object obtained from bpf_fd_probe_obj() is handed to
 * bpf_obj_do_pin(); if pinning fails it is released again with
 * bpf_any_put().
 *
 * Returns 0 on success or a negative errno from getname(), the fd lookup
 * or the pin itself.
 */
int bpf_obj_pin_user(u32 ufd, const char __user *pathname)
{
	struct filename *pname = getname(pathname);
	enum bpf_type type;
	void *raw;
	int ret;

	if (IS_ERR(pname))
		return PTR_ERR(pname);

	raw = bpf_fd_probe_obj(ufd, &type);
	if (IS_ERR(raw)) {
		ret = PTR_ERR(raw);
	} else {
		ret = bpf_obj_do_pin(pname, raw, type);
		if (ret)
			bpf_any_put(raw, type);
	}

	putname(pname);
	return ret;
}

/*
 * Look up @pathname (following symlinks) and return the BPF object pinned
 * there after passing it through bpf_any_get().
 *
 * @pathname: kernel copy of the path to resolve
 * @type:     set by bpf_inode_type() to the kind of object found
 * @flags:    open-style flags, converted with ACC_MODE() for the
 *            permission check on the inode
 *
 * Returns the object pointer, or an ERR_PTR() carrying the error from the
 * path lookup, the permission check or bpf_inode_type().
 */
static void *bpf_obj_do_get(const struct filename *pathname,
			    enum bpf_type *type, int flags)
{
	struct inode *inode;
	struct path path;
	void *raw;
	int ret;

	ret = kern_path(pathname->name, LOOKUP_FOLLOW, &path);
	if (ret)
		return ERR_PTR(ret);

	inode = d_backing_inode(path.dentry);

	ret = inode_permission(inode, ACC_MODE(flags));
	if (!ret)
		ret = bpf_inode_type(inode, type);
	if (ret) {
		raw = ERR_PTR(ret);
		goto out;
	}

	raw = bpf_any_get(inode->i_private, *type);
	if (!IS_ERR(raw))
		touch_atime(&path);
out:
	path_put(&path);
	return raw;
}

/*
 * Return a new fd for the BPF object pinned at the user-supplied
 * @pathname.
 *
 * @flags is converted with bpf_get_file_flag(); the result is used for
 * the permission check in bpf_obj_do_get() and, for maps, passed on to
 * bpf_map_new_fd().  If no fd can be created, the object obtained from
 * bpf_obj_do_get() is released again with bpf_any_put().
 *
 * Returns the new fd (>= 0) or a negative errno; -ENOENT is used when the
 * object kind is neither program nor map.
 */
int bpf_obj_get_user(const char __user *pathname, int flags)
{
	enum bpf_type type = BPF_TYPE_UNSPEC;
	struct filename *pname;
	int f_flags;
	void *raw;
	int ret;

	f_flags = bpf_get_file_flag(flags);
	if (f_flags < 0)
		return f_flags;

	pname = getname(pathname);
	if (IS_ERR(pname))
		return PTR_ERR(pname);

	raw = bpf_obj_do_get(pname, &type, f_flags);
	if (IS_ERR(raw)) {
		ret = PTR_ERR(raw);
		goto out;
	}

	switch (type) {
	case BPF_TYPE_PROG:
		ret = bpf_prog_new_fd(raw);
		break;
	case BPF_TYPE_MAP:
		ret = bpf_map_new_fd(raw, f_flags);
		break;
	default:
		ret = -ENOENT;
		goto out;
	}

	if (ret < 0)
		bpf_any_put(raw, type);
out:
	putname(pname);
	return ret;
}

/*
 * Return the program pinned at @inode after calling bpf_prog_inc() on it.
 *
 * Checks, in order: read permission on the inode; that the inode is a
 * program inode (-EINVAL if it is a map inode, -EACCES for anything
 * else); security_bpf_prog(); and bpf_prog_get_ok() against @type
 * (-EINVAL on mismatch).
 *
 * Returns the program or an ERR_PTR() for the first failing check.
 */
static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type)
{
	struct bpf_prog *prog;
	int ret;

	ret = inode_permission(inode, MAY_READ);
	if (ret)
		return ERR_PTR(ret);

	if (inode->i_op != &bpf_prog_iops)
		return ERR_PTR(inode->i_op == &bpf_map_iops ? -EINVAL : -EACCES);

	prog = inode->i_private;

	ret = security_bpf_prog(prog);
	if (ret < 0)
		return ERR_PTR(ret);

	if (!bpf_prog_get_ok(prog, &type, false))
		return ERR_PTR(-EINVAL);

	bpf_prog_inc(prog);
	return prog;
}

/*
 * Resolve kernel path @name (following symlinks) and return the program
 * of type @type pinned there, as validated by __get_prog_inode().  The
 * path's atime is touched on success.
 *
 * Returns the program or an ERR_PTR() from the path lookup or from
 * __get_prog_inode().
 */
struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type)
{
	struct bpf_prog *prog;
	struct path path;
	int err;

	err = kern_path(name, LOOKUP_FOLLOW, &path);
	if (err)
		return ERR_PTR(err);

	prog = __get_prog_inode(d_backing_inode(path.dentry), type);
	if (!IS_ERR(prog))
		touch_atime(&path);

	path_put(&path);
	return prog;
}
EXPORT_SYMBOL(bpf_prog_get_type_path);

/*
 * Display the mount options in /proc/mounts.
 */
static int bpf_show_options(struct seq_file *m, struct dentry *root)
{
	umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX;

	if (mode != S_IRWXUGO)
		seq_printf(m, ",mode=%o", mode);
	return 0;
}

/*
 * ->free_inode() for bpffs: free the symlink target of symlink inodes,
 * call bpf_any_put() on the object held by program/map inodes, then
 * release the inode itself with free_inode_nonrcu().
 */
static void bpf_free_inode(struct inode *inode)
{
	enum bpf_type type;
	int err;

	if (S_ISLNK(inode->i_mode))
		kfree(inode->i_link);

	/* Directories and symlinks carry no BPF object; skip the put. */
	err = bpf_inode_type(inode, &type);
	if (err == 0)
		bpf_any_put(inode->i_private, type);

	free_inode_nonrcu(inode);
}

/*
 * Super block operations, installed by bpf_fill_super().  show_options
 * and free_inode are implemented in this file; the other two are generic
 * helpers defined elsewhere.
 */
static const struct super_operations bpf_super_ops = {
	.statfs		= simple_statfs,
	.drop_inode	= generic_delete_inode,
	.show_options	= bpf_show_options,
	.free_inode	= bpf_free_inode,
};

/* Identifiers of the mount parameters listed in bpf_param_specs[]. */
enum {
	OPT_MODE = 0,	/* "mode=" */
};

/*
 * Mount parameters recognised by bpffs; the empty entry terminates the
 * list.  "mode" is declared with fsparam_u32oct (presumably an octal u32,
 * going by the macro name) and maps to OPT_MODE.
 */
static const struct fs_parameter_spec bpf_param_specs[] = {
	fsparam_u32oct	("mode",			OPT_MODE),
	{}
};

/*
 * Parameter description passed to fs_parse() in bpf_parse_param() and
 * referenced from bpf_fs_type.
 */
static const struct fs_parameter_description bpf_fs_parameters = {
	.name		= "bpf",
	.specs		= bpf_param_specs,
};

/*
 * Per-mount options, allocated in bpf_init_fs_context() and kept in
 * fs_context->fs_private.
 */
struct bpf_mount_opts {
	/*
	 * Permission bits for the root directory: defaults to S_IRWXUGO, may
	 * be overridden by the "mode" parameter, applied in bpf_fill_super().
	 */
	umode_t mode;
};

/*
 * ->parse_param() for bpffs: parse one mount parameter into the
 * bpf_mount_opts kept in @fc->fs_private.
 *
 * Returns 0 on success and for unknown parameters, or the negative error
 * from fs_parse() for anything else.
 */
static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
	struct bpf_mount_opts *opts = fc->fs_private;
	struct fs_parse_result result;
	int opt;

	opt = fs_parse(fc, &bpf_fs_parameters, param, &result);

	/* We might like to report bad mount options here, but
	 * traditionally we've ignored all mount options, so we'd
	 * better continue to ignore non-existing options for bpf.
	 */
	if (opt == -ENOPARAM)
		return 0;
	if (opt < 0)
		return opt;

	if (opt == OPT_MODE)
		opts->mode = result.uint_32 & S_IALLUGO;

	return 0;
}

/*
 * Populate super block @sb for a new bpffs mount.
 *
 * simple_fill_super() builds the (otherwise empty) tree; afterwards the
 * super block gets bpf_super_ops and the root inode gets bpf_dir_iops.
 * The root's permission bits are replaced by the sticky bit plus the
 * mode taken from the mount options in @fc->fs_private.
 *
 * Returns 0 on success or the error from simple_fill_super().
 */
static int bpf_fill_super(struct super_block *sb, struct fs_context *fc)
{
	static const struct tree_descr bpf_rfiles[] = { { "" } };
	struct bpf_mount_opts *opts = fc->fs_private;
	struct inode *root;
	int err;

	err = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles);
	if (err)
		return err;

	sb->s_op = &bpf_super_ops;

	root = sb->s_root->d_inode;
	root->i_op = &bpf_dir_iops;
	root->i_mode = (root->i_mode & ~S_IALLUGO) | S_ISVTX | opts->mode;

	return 0;
}

/*
 * ->get_tree() for bpffs: delegate to get_tree_nodev() with
 * bpf_fill_super() as the super block fill callback.
 */
static int bpf_get_tree(struct fs_context *fc)
{
	return get_tree_nodev(fc, bpf_fill_super);
}

/*
 * ->free() for the mount context: release the bpf_mount_opts that
 * bpf_init_fs_context() stored in fs_private.
 */
static void bpf_free_fc(struct fs_context *fc)
{
	kfree(fc->fs_private);
}

/* Mount context callbacks, installed by bpf_init_fs_context(). */
static const struct fs_context_operations bpf_context_ops = {
	.free		= bpf_free_fc,
	.parse_param	= bpf_parse_param,
	.get_tree	= bpf_get_tree,
};

/*
 * Set up the filesystem mount context: allocate the mount options with
 * their default mode (S_IRWXUGO) and hook up bpf_context_ops.
 *
 * Returns 0 on success or -ENOMEM if the options cannot be allocated.
 */
static int bpf_init_fs_context(struct fs_context *fc)
{
	struct bpf_mount_opts *opts = kzalloc(sizeof(*opts), GFP_KERNEL);

	if (!opts)
		return -ENOMEM;

	opts->mode = S_IRWXUGO;

	fc->ops = &bpf_context_ops;
	fc->fs_private = opts;

	return 0;
}

/* Filesystem type "bpf", registered by bpf_init(). */
static struct file_system_type bpf_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "bpf",
	.init_fs_context = bpf_init_fs_context,
	.parameters	= &bpf_fs_parameters,
	.kill_sb	= kill_litter_super,
};

/*
 * Boot-time setup: create the "bpf" mount point under fs_kobj and
 * register the filesystem type.  The mount point is removed again when
 * registration fails.
 *
 * Returns 0 on success or the error from whichever step failed.
 */
static int __init bpf_init(void)
{
	int err = sysfs_create_mount_point(fs_kobj, "bpf");

	if (err)
		return err;

	err = register_filesystem(&bpf_fs_type);
	if (!err)
		return 0;

	sysfs_remove_mount_point(fs_kobj, "bpf");
	return err;
}
fs_initcall(bpf_init);
Commit	Line	Data
d2912cb1	1	// SPDX-License-Identifier: GPL-2.0-only
b2197755 DB	2	/*
	3	* Minimal file system backend for holding eBPF maps and programs,
	4	* used by bpf(2) object pinning.
	5	*
	6	* Authors:
	7	*
	8	* Daniel Borkmann <daniel@iogearbox.net>
b2197755 DB	9	*/
b2197755 DB	10
a536a6e1	11	#include <linux/init.h>
b2197755 DB	12	#include <linux/magic.h>
	13	#include <linux/major.h>
	14	#include <linux/mount.h>
	15	#include <linux/namei.h>
	16	#include <linux/fs.h>
d2935de7 DH	17	#include <linux/fs_context.h>
d2935de7 DH	18	#include <linux/fs_parser.h>
b2197755 DB	19	#include <linux/kdev_t.h>
	20	#include <linux/filter.h>
	21	#include <linux/bpf.h>
a67edbf4	22	#include <linux/bpf_trace.h>
b2197755 DB	23
	24	enum bpf_type {
	25	BPF_TYPE_UNSPEC = 0,
	26	BPF_TYPE_PROG,
	27	BPF_TYPE_MAP,
	28	};
	29
	30	static void bpf_any_get(void raw, enum bpf_type type)
	31	{
	32	switch (type) {
	33	case BPF_TYPE_PROG:
85192dbf	34	bpf_prog_inc(raw);
b2197755 DB	35	break;
b2197755 DB	36	case BPF_TYPE_MAP:
1e0bd5a0	37	bpf_map_inc_with_uref(raw);
b2197755 DB	38	break;
	39	default:
	40	WARN_ON_ONCE(1);
	41	break;
	42	}
	43
	44	return raw;
	45	}
	46
	47	static void bpf_any_put(void *raw, enum bpf_type type)
	48	{
	49	switch (type) {
	50	case BPF_TYPE_PROG:
	51	bpf_prog_put(raw);
	52	break;
	53	case BPF_TYPE_MAP:
c9da161c	54	bpf_map_put_with_uref(raw);
b2197755 DB	55	break;
	56	default:
	57	WARN_ON_ONCE(1);
	58	break;
	59	}
	60	}
	61
	62	static void bpf_fd_probe_obj(u32 ufd, enum bpf_type type)
	63	{
	64	void *raw;
	65
	66	*type = BPF_TYPE_MAP;
c9da161c	67	raw = bpf_map_get_with_uref(ufd);
b2197755 DB	68	if (IS_ERR(raw)) {
	69	*type = BPF_TYPE_PROG;
	70	raw = bpf_prog_get(ufd);
	71	}
	72
	73	return raw;
	74	}
	75
	76	static const struct inode_operations bpf_dir_iops;
	77
	78	static const struct inode_operations bpf_prog_iops = { };
	79	static const struct inode_operations bpf_map_iops = { };
	80
	81	static struct inode bpf_get_inode(struct super_block sb,
	82	const struct inode *dir,
	83	umode_t mode)
	84	{
	85	struct inode *inode;
	86
	87	switch (mode & S_IFMT) {
	88	case S_IFDIR:
	89	case S_IFREG:
0f98621b	90	case S_IFLNK:
b2197755 DB	91	break;
	92	default:
	93	return ERR_PTR(-EINVAL);
	94	}
	95
	96	inode = new_inode(sb);
	97	if (!inode)
	98	return ERR_PTR(-ENOSPC);
	99
	100	inode->i_ino = get_next_ino();
078cd827	101	inode->i_atime = current_time(inode);
b2197755 DB	102	inode->i_mtime = inode->i_atime;
	103	inode->i_ctime = inode->i_atime;
	104
	105	inode_init_owner(inode, dir, mode);
	106
	107	return inode;
	108	}
	109
	110	static int bpf_inode_type(const struct inode inode, enum bpf_type type)
	111	{
	112	*type = BPF_TYPE_UNSPEC;
	113	if (inode->i_op == &bpf_prog_iops)
	114	*type = BPF_TYPE_PROG;
	115	else if (inode->i_op == &bpf_map_iops)
	116	*type = BPF_TYPE_MAP;
	117	else
	118	return -EACCES;
	119
	120	return 0;
	121	}
	122
0f98621b DB	123	static void bpf_dentry_finalize(struct dentry dentry, struct inode inode,
	124	struct inode *dir)
	125	{
	126	d_instantiate(dentry, inode);
	127	dget(dentry);
	128
	129	dir->i_mtime = current_time(dir);
	130	dir->i_ctime = dir->i_mtime;
	131	}
	132
b2197755 DB	133	static int bpf_mkdir(struct inode dir, struct dentry dentry, umode_t mode)
	134	{
	135	struct inode *inode;
	136
b2197755 DB	137	inode = bpf_get_inode(dir->i_sb, dir, mode \| S_IFDIR);
	138	if (IS_ERR(inode))
	139	return PTR_ERR(inode);
	140
	141	inode->i_op = &bpf_dir_iops;
	142	inode->i_fop = &simple_dir_operations;
	143
	144	inc_nlink(inode);
	145	inc_nlink(dir);
	146
0f98621b	147	bpf_dentry_finalize(dentry, inode, dir);
b2197755 DB	148	return 0;
	149	}
	150
a26ca7c9 MKL	151	struct map_iter {
	152	void *key;
	153	bool done;
	154	};
	155
	156	static struct map_iter map_iter(struct seq_file m)
	157	{
	158	return m->private;
	159	}
	160
	161	static struct bpf_map seq_file_to_map(struct seq_file m)
	162	{
	163	return file_inode(m->file)->i_private;
	164	}
	165
	166	static void map_iter_free(struct map_iter *iter)
	167	{
	168	if (iter) {
	169	kfree(iter->key);
	170	kfree(iter);
	171	}
	172	}
	173
	174	static struct map_iter map_iter_alloc(struct bpf_map map)
	175	{
	176	struct map_iter *iter;
	177
	178	iter = kzalloc(sizeof(*iter), GFP_KERNEL \| __GFP_NOWARN);
	179	if (!iter)
	180	goto error;
	181
	182	iter->key = kzalloc(map->key_size, GFP_KERNEL \| __GFP_NOWARN);
	183	if (!iter->key)
	184	goto error;
	185
	186	return iter;
	187
	188	error:
	189	map_iter_free(iter);
	190	return NULL;
	191	}
	192
	193	static void map_seq_next(struct seq_file m, void v, loff_t pos)
	194	{
	195	struct bpf_map *map = seq_file_to_map(m);
	196	void *key = map_iter(m)->key;
dc1508a5	197	void *prev_key;
a26ca7c9 MKL	198
	199	if (map_iter(m)->done)
	200	return NULL;
	201
	202	if (unlikely(v == SEQ_START_TOKEN))
dc1508a5 YS	203	prev_key = NULL;
	204	else
	205	prev_key = key;
a26ca7c9	206
dc1508a5	207	if (map->ops->map_get_next_key(map, prev_key, key)) {
a26ca7c9 MKL	208	map_iter(m)->done = true;
	209	return NULL;
	210	}
	211
a26ca7c9 MKL	212	++(*pos);
	213	return key;
	214	}
	215
	216	static void map_seq_start(struct seq_file m, loff_t *pos)
	217	{
	218	if (map_iter(m)->done)
	219	return NULL;
	220
	221	return *pos ? map_iter(m)->key : SEQ_START_TOKEN;
	222	}
	223
	224	static void map_seq_stop(struct seq_file m, void v)
	225	{
	226	}
	227
	228	static int map_seq_show(struct seq_file m, void v)
	229	{
	230	struct bpf_map *map = seq_file_to_map(m);
	231	void *key = map_iter(m)->key;
	232
	233	if (unlikely(v == SEQ_START_TOKEN)) {
	234	seq_puts(m, "# WARNING!! The output is for debug purpose only\n");
	235	seq_puts(m, "# WARNING!! The output format will change\n");
	236	} else {
	237	map->ops->map_seq_show_elem(map, key, m);
	238	}
	239
	240	return 0;
	241	}
	242
	243	static const struct seq_operations bpffs_map_seq_ops = {
	244	.start = map_seq_start,
	245	.next = map_seq_next,
	246	.show = map_seq_show,
	247	.stop = map_seq_stop,
	248	};
	249
	250	static int bpffs_map_open(struct inode inode, struct file file)
	251	{
	252	struct bpf_map *map = inode->i_private;
	253	struct map_iter *iter;
	254	struct seq_file *m;
	255	int err;
	256
	257	iter = map_iter_alloc(map);
	258	if (!iter)
	259	return -ENOMEM;
	260
	261	err = seq_open(file, &bpffs_map_seq_ops);
	262	if (err) {
	263	map_iter_free(iter);
	264	return err;
	265	}
	266
	267	m = file->private_data;
	268	m->private = iter;
	269
	270	return 0;
	271	}
	272
	273	static int bpffs_map_release(struct inode inode, struct file file)
	274	{
	275	struct seq_file *m = file->private_data;
276
277	map_iter_free(map_iter(m));
278
279	return seq_release(inode, file);
280	}
281
282	/* bpffs_map_fops should only implement the basic
283	* read operation for a BPF map. The purpose is to
284	* provide a simple user intuitive way to do
285	* "cat bpffs/pathto/a-pinned-map".
286	*
287	* Other operations (e.g. write, lookup...) should be realized by
288	* the userspace tools (e.g. bpftool) through the
289	* BPF_OBJ_GET_INFO_BY_FD and the map's lookup/update
290	* interface.
291	*/
292	static const struct file_operations bpffs_map_fops = {
293	.open = bpffs_map_open,
294	.read = seq_read,
295	.release = bpffs_map_release,
296	};
297
b1655857 DB	298	static int bpffs_obj_open(struct inode inode, struct file file)
	299	{
	300	return -EIO;
	301	}
	302
	303	static const struct file_operations bpffs_obj_fops = {
	304	.open = bpffs_obj_open,
	305	};
	306
a4a0683f	307	static int bpf_mkobj_ops(struct dentry dentry, umode_t mode, void raw,
a26ca7c9 MKL	308	const struct inode_operations *iops,
a26ca7c9 MKL	309	const struct file_operations *fops)
b2197755	310	{
a4a0683f AV	311	struct inode *dir = dentry->d_parent->d_inode;
a4a0683f AV	312	struct inode *inode = bpf_get_inode(dir->i_sb, dir, mode);
b2197755 DB	313	if (IS_ERR(inode))
	314	return PTR_ERR(inode);
	315
	316	inode->i_op = iops;
a26ca7c9	317	inode->i_fop = fops;
a4a0683f	318	inode->i_private = raw;
b2197755	319
0f98621b	320	bpf_dentry_finalize(dentry, inode, dir);
b2197755 DB	321	return 0;
	322	}
	323
a4a0683f	324	static int bpf_mkprog(struct dentry dentry, umode_t mode, void arg)
b2197755	325	{
b1655857 DB	326	return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops,
b1655857 DB	327	&bpffs_obj_fops);
a4a0683f	328	}
b2197755	329
a4a0683f AV	330	static int bpf_mkmap(struct dentry dentry, umode_t mode, void arg)
a4a0683f AV	331	{
a26ca7c9 MKL	332	struct bpf_map *map = arg;
	333
	334	return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops,
e8d2bec0 DB	335	bpf_map_support_seq_show(map) ?
e8d2bec0 DB	336	&bpffs_map_fops : &bpffs_obj_fops);
b2197755 DB	337	}
b2197755 DB	338
0c93b7d8 AV	339	static struct dentry *
0c93b7d8 AV	340	bpf_lookup(struct inode dir, struct dentry dentry, unsigned flags)
bb35a6ef	341	{
6d8cb045 QM	342	/* Dots in names (e.g. "/sys/fs/bpf/foo.bar") are reserved for future
	343	* extensions.
	344	*/
0c93b7d8 AV	345	if (strchr(dentry->d_name.name, '.'))
0c93b7d8 AV	346	return ERR_PTR(-EPERM);
0f98621b	347
0c93b7d8	348	return simple_lookup(dir, dentry, flags);
bb35a6ef DB	349	}
bb35a6ef DB	350
0f98621b DB	351	static int bpf_symlink(struct inode dir, struct dentry dentry,
	352	const char *target)
	353	{
	354	char *link = kstrdup(target, GFP_USER \| __GFP_NOWARN);
	355	struct inode *inode;
	356
	357	if (!link)
	358	return -ENOMEM;
	359
	360	inode = bpf_get_inode(dir->i_sb, dir, S_IRWXUGO \| S_IFLNK);
	361	if (IS_ERR(inode)) {
	362	kfree(link);
	363	return PTR_ERR(inode);
	364	}
	365
	366	inode->i_op = &simple_symlink_inode_operations;
	367	inode->i_link = link;
	368
	369	bpf_dentry_finalize(dentry, inode, dir);
	370	return 0;
	371	}
	372
b2197755	373	static const struct inode_operations bpf_dir_iops = {
0c93b7d8	374	.lookup = bpf_lookup,
b2197755	375	.mkdir = bpf_mkdir,
0f98621b	376	.symlink = bpf_symlink,
b2197755	377	.rmdir = simple_rmdir,
0c93b7d8 AV	378	.rename = simple_rename,
0c93b7d8 AV	379	.link = simple_link,
b2197755 DB	380	.unlink = simple_unlink,
	381	};
	382
	383	static int bpf_obj_do_pin(const struct filename pathname, void raw,
	384	enum bpf_type type)
	385	{
	386	struct dentry *dentry;
	387	struct inode *dir;
	388	struct path path;
	389	umode_t mode;
b2197755 DB	390	int ret;
	391
	392	dentry = kern_path_create(AT_FDCWD, pathname->name, &path, 0);
	393	if (IS_ERR(dentry))
	394	return PTR_ERR(dentry);
	395
	396	mode = S_IFREG \| ((S_IRUSR \| S_IWUSR) & ~current_umask());
b2197755	397
a4a0683f	398	ret = security_path_mknod(&path, dentry, mode, 0);
b2197755 DB	399	if (ret)
	400	goto out;
	401
	402	dir = d_inode(path.dentry);
	403	if (dir->i_op != &bpf_dir_iops) {
	404	ret = -EPERM;
	405	goto out;
	406	}
	407
a4a0683f AV	408	switch (type) {
	409	case BPF_TYPE_PROG:
	410	ret = vfs_mkobj(dentry, mode, bpf_mkprog, raw);
	411	break;
	412	case BPF_TYPE_MAP:
	413	ret = vfs_mkobj(dentry, mode, bpf_mkmap, raw);
	414	break;
	415	default:
	416	ret = -EPERM;
	417	}
b2197755 DB	418	out:
	419	done_path_create(&path, dentry);
	420	return ret;
	421	}
	422
	423	int bpf_obj_pin_user(u32 ufd, const char __user *pathname)
	424	{
	425	struct filename *pname;
	426	enum bpf_type type;
	427	void *raw;
	428	int ret;
	429
	430	pname = getname(pathname);
	431	if (IS_ERR(pname))
	432	return PTR_ERR(pname);
	433
	434	raw = bpf_fd_probe_obj(ufd, &type);
	435	if (IS_ERR(raw)) {
	436	ret = PTR_ERR(raw);
	437	goto out;
	438	}
	439
	440	ret = bpf_obj_do_pin(pname, raw, type);
	441	if (ret != 0)
	442	bpf_any_put(raw, type);
	443	out:
	444	putname(pname);
	445	return ret;
	446	}
	447
	448	static void bpf_obj_do_get(const struct filename pathname,
6e71b04a	449	enum bpf_type *type, int flags)
b2197755 DB	450	{
	451	struct inode *inode;
	452	struct path path;
	453	void *raw;
	454	int ret;
	455
	456	ret = kern_path(pathname->name, LOOKUP_FOLLOW, &path);
	457	if (ret)
	458	return ERR_PTR(ret);
	459
	460	inode = d_backing_inode(path.dentry);
6e71b04a	461	ret = inode_permission(inode, ACC_MODE(flags));
b2197755 DB	462	if (ret)
	463	goto out;
	464
	465	ret = bpf_inode_type(inode, type);
	466	if (ret)
	467	goto out;
	468
	469	raw = bpf_any_get(inode->i_private, *type);
92117d84 AS	470	if (!IS_ERR(raw))
92117d84 AS	471	touch_atime(&path);
b2197755 DB	472
	473	path_put(&path);
	474	return raw;
	475	out:
	476	path_put(&path);
	477	return ERR_PTR(ret);
	478	}
	479
6e71b04a	480	int bpf_obj_get_user(const char __user *pathname, int flags)
b2197755 DB	481	{
	482	enum bpf_type type = BPF_TYPE_UNSPEC;
	483	struct filename *pname;
	484	int ret = -ENOENT;
6e71b04a	485	int f_flags;
b2197755 DB	486	void *raw;
b2197755 DB	487
6e71b04a CF	488	f_flags = bpf_get_file_flag(flags);
	489	if (f_flags < 0)
	490	return f_flags;
	491
b2197755 DB	492	pname = getname(pathname);
	493	if (IS_ERR(pname))
	494	return PTR_ERR(pname);
	495
6e71b04a	496	raw = bpf_obj_do_get(pname, &type, f_flags);
b2197755 DB	497	if (IS_ERR(raw)) {
	498	ret = PTR_ERR(raw);
	499	goto out;
	500	}
	501
	502	if (type == BPF_TYPE_PROG)
	503	ret = bpf_prog_new_fd(raw);
	504	else if (type == BPF_TYPE_MAP)
6e71b04a	505	ret = bpf_map_new_fd(raw, f_flags);
b2197755 DB	506	else
	507	goto out;
	508
4d220ed0	509	if (ret < 0)
b2197755 DB	510	bpf_any_put(raw, type);
	511	out:
	512	putname(pname);
	513	return ret;
	514	}
040ee692 AV	515
	516	static struct bpf_prog __get_prog_inode(struct inode inode, enum bpf_prog_type type)
	517	{
	518	struct bpf_prog *prog;
e547ff3f	519	int ret = inode_permission(inode, MAY_READ);
040ee692 AV	520	if (ret)
	521	return ERR_PTR(ret);
	522
	523	if (inode->i_op == &bpf_map_iops)
	524	return ERR_PTR(-EINVAL);
	525	if (inode->i_op != &bpf_prog_iops)
	526	return ERR_PTR(-EACCES);
	527
	528	prog = inode->i_private;
	529
	530	ret = security_bpf_prog(prog);
	531	if (ret < 0)
	532	return ERR_PTR(ret);
	533
	534	if (!bpf_prog_get_ok(prog, &type, false))
	535	return ERR_PTR(-EINVAL);
	536
85192dbf AN	537	bpf_prog_inc(prog);
85192dbf AN	538	return prog;
040ee692 AV	539	}
	540
	541	struct bpf_prog bpf_prog_get_type_path(const char name, enum bpf_prog_type type)
	542	{
	543	struct bpf_prog *prog;
	544	struct path path;
	545	int ret = kern_path(name, LOOKUP_FOLLOW, &path);
	546	if (ret)
	547	return ERR_PTR(ret);
	548	prog = __get_prog_inode(d_backing_inode(path.dentry), type);
	549	if (!IS_ERR(prog))
	550	touch_atime(&path);
	551	path_put(&path);
	552	return prog;
	553	}
	554	EXPORT_SYMBOL(bpf_prog_get_type_path);
b2197755	555
4cc7c186 DH	556	/*
	557	* Display the mount options in /proc/mounts.
	558	*/
	559	static int bpf_show_options(struct seq_file m, struct dentry root)
	560	{
	561	umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX;
	562
	563	if (mode != S_IRWXUGO)
	564	seq_printf(m, ",mode=%o", mode);
	565	return 0;
	566	}
	567
524845ff	568	static void bpf_free_inode(struct inode *inode)
1da6c4d9	569	{
1da6c4d9 DB	570	enum bpf_type type;
	571
	572	if (S_ISLNK(inode->i_mode))
	573	kfree(inode->i_link);
	574	if (!bpf_inode_type(inode, &type))
	575	bpf_any_put(inode->i_private, type);
	576	free_inode_nonrcu(inode);
	577	}
	578
b2197755 DB	579	static const struct super_operations bpf_super_ops = {
	580	.statfs = simple_statfs,
	581	.drop_inode = generic_delete_inode,
4cc7c186	582	.show_options = bpf_show_options,
524845ff	583	.free_inode = bpf_free_inode,
b2197755 DB	584	};
b2197755 DB	585
a3af5f80 DB	586	enum {
a3af5f80 DB	587	OPT_MODE,
a3af5f80 DB	588	};
a3af5f80 DB	589
d2935de7 DH	590	static const struct fs_parameter_spec bpf_param_specs[] = {
	591	fsparam_u32oct ("mode", OPT_MODE),
	592	{}
	593	};
	594
	595	static const struct fs_parameter_description bpf_fs_parameters = {
	596	.name = "bpf",
	597	.specs = bpf_param_specs,
a3af5f80 DB	598	};
	599
	600	struct bpf_mount_opts {
	601	umode_t mode;
	602	};
	603
d2935de7	604	static int bpf_parse_param(struct fs_context fc, struct fs_parameter param)
a3af5f80	605	{
d2935de7 DH	606	struct bpf_mount_opts *opts = fc->fs_private;
	607	struct fs_parse_result result;
	608	int opt;
a3af5f80	609
d2935de7 DH	610	opt = fs_parse(fc, &bpf_fs_parameters, param, &result);
d2935de7 DH	611	if (opt < 0)
a3af5f80 DB	612	/* We might like to report bad mount options here, but
	613	* traditionally we've ignored all mount options, so we'd
	614	* better continue to ignore non-existing options for bpf.
	615	*/
d2935de7 DH	616	return opt == -ENOPARAM ? 0 : opt;
	617
	618	switch (opt) {
	619	case OPT_MODE:
	620	opts->mode = result.uint_32 & S_IALLUGO;
	621	break;
a3af5f80 DB	622	}
	623
	624	return 0;
	625	}
	626
d2935de7	627	static int bpf_fill_super(struct super_block sb, struct fs_context fc)
b2197755	628	{
cda37124	629	static const struct tree_descr bpf_rfiles[] = { { "" } };
d2935de7	630	struct bpf_mount_opts *opts = fc->fs_private;
b2197755 DB	631	struct inode *inode;
	632	int ret;
	633
	634	ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles);
	635	if (ret)
	636	return ret;
	637
	638	sb->s_op = &bpf_super_ops;
	639
	640	inode = sb->s_root->d_inode;
	641	inode->i_op = &bpf_dir_iops;
	642	inode->i_mode &= ~S_IALLUGO;
d2935de7	643	inode->i_mode \|= S_ISVTX \| opts->mode;
b2197755 DB	644
	645	return 0;
	646	}
	647
d2935de7 DH	648	static int bpf_get_tree(struct fs_context *fc)
	649	{
	650	return get_tree_nodev(fc, bpf_fill_super);
	651	}
	652
	653	static void bpf_free_fc(struct fs_context *fc)
b2197755	654	{
d2935de7 DH	655	kfree(fc->fs_private);
	656	}
	657
	658	static const struct fs_context_operations bpf_context_ops = {
	659	.free = bpf_free_fc,
	660	.parse_param = bpf_parse_param,
	661	.get_tree = bpf_get_tree,
	662	};
	663
	664	/*
	665	* Set up the filesystem mount context.
	666	*/
	667	static int bpf_init_fs_context(struct fs_context *fc)
	668	{
	669	struct bpf_mount_opts *opts;
	670
	671	opts = kzalloc(sizeof(struct bpf_mount_opts), GFP_KERNEL);
	672	if (!opts)
	673	return -ENOMEM;
	674
	675	opts->mode = S_IRWXUGO;
	676
	677	fc->fs_private = opts;
	678	fc->ops = &bpf_context_ops;
	679	return 0;
b2197755 DB	680	}
	681
	682	static struct file_system_type bpf_fs_type = {
	683	.owner = THIS_MODULE,
	684	.name = "bpf",
d2935de7 DH	685	.init_fs_context = bpf_init_fs_context,
d2935de7 DH	686	.parameters = &bpf_fs_parameters,
b2197755	687	.kill_sb = kill_litter_super,
b2197755 DB	688	};
b2197755 DB	689
b2197755 DB	690	static int __init bpf_init(void)
	691	{
	692	int ret;
	693
	694	ret = sysfs_create_mount_point(fs_kobj, "bpf");
	695	if (ret)
	696	return ret;
	697
	698	ret = register_filesystem(&bpf_fs_type);
	699	if (ret)
	700	sysfs_remove_mount_point(fs_kobj, "bpf");
	701
	702	return ret;
	703	}
	704	fs_initcall(bpf_init);