[mirror_ubuntu-bionic-kernel.git] / fs / proc / generic.c

/*
 * fs/proc/generic.c --- generic routines for the proc-fs
 *
 * This file contains generic proc-fs routines for handling
 * directories and files.
 * 
 * Copyright (C) 1991, 1992 Linus Torvalds.
 * Copyright (C) 1997 Theodore Ts'o
 */

#include <linux/errno.h>
#include <linux/time.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/smp_lock.h>
#include <linux/init.h>
#include <linux/idr.h>
#include <linux/namei.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/completion.h>
#include <asm/uaccess.h>

#include "internal.h"

/*
 * Forward declarations of the legacy file handlers; they are referenced by
 * the proc_file_operations table before their definitions appear.
 */
static ssize_t proc_file_read(struct file *file, char __user *buf,
			      size_t nbytes, loff_t *ppos);
static ssize_t proc_file_write(struct file *file, const char __user *buffer,
			       size_t count, loff_t *ppos);
static loff_t proc_file_lseek(struct file *, loff_t, int);

/*
 * Held in this file while walking or modifying the ->subdir / ->next
 * lists that link proc_dir_entry objects together.
 */
DEFINE_SPINLOCK(proc_subdir_lock);

/*
 * Compare a (not necessarily NUL-terminated) name of length @len against
 * the name stored in @de.  Returns 1 when both the length and the bytes
 * match, 0 otherwise.
 */
static int proc_match(int len, const char *name, struct proc_dir_entry *de)
{
	return de->namelen == len && memcmp(name, de->name, len) == 0;
}

/*
 * File operations for legacy regular /proc files; proc_register() installs
 * this table when an entry does not supply its own ->proc_fops.
 */
static const struct file_operations proc_file_operations = {
	.llseek		= proc_file_lseek,
	.read		= proc_file_read,
	.write		= proc_file_write,
};

/*
 * Largest chunk proc_file_read() asks a handler for in one iteration.
 * The buffer is one page, but our output routines use some slack for
 * overruns, hence the 1024 bytes held back.
 */
#define PROC_BLOCK_SIZE	(PAGE_SIZE - 1024)

/*
 * Read handler for legacy /proc files.
 *
 * Repeatedly calls the entry's ->get_info or ->read_proc handler with a
 * one-page kernel buffer, interprets the result according to one of the
 * three "*start" conventions documented below, and copies the produced
 * bytes to @buf, advancing *@ppos as it goes.
 *
 * Returns the number of bytes copied to user space.  Returns 0 when *@ppos
 * is beyond MAX_NON_LFS, -ENOMEM when the buffer page cannot be allocated,
 * and -- only if nothing has been copied yet -- the handler's negative
 * return value or -EFAULT when copy_to_user() copies nothing.
 */
static ssize_t
proc_file_read(struct file *file, char __user *buf, size_t nbytes,
	       loff_t *ppos)
{
	struct inode * inode = file->f_path.dentry->d_inode;
	char 	*page;
	ssize_t	retval=0;
	int	eof=0;
	ssize_t	n, count;
	char	*start;
	struct proc_dir_entry * dp;
	unsigned long long pos;

	/*
	 * Gaah, please just use "seq_file" instead. The legacy /proc
	 * interfaces cut loff_t down to off_t for reads, and ignore
	 * the offset entirely for writes..
	 */
	pos = *ppos;
	if (pos > MAX_NON_LFS)
		return 0;
	/* Clamp the request so that pos + nbytes never exceeds MAX_NON_LFS. */
	if (nbytes > MAX_NON_LFS - pos)
		nbytes = MAX_NON_LFS - pos;

	dp = PDE(inode);
	/* Scratch page handed to the handler; released before returning. */
	if (!(page = (char*) __get_free_page(GFP_TEMPORARY)))
		return -ENOMEM;

	while ((nbytes > 0) && !eof) {
		count = min_t(size_t, PROC_BLOCK_SIZE, nbytes);

		start = NULL;
		if (dp->get_info) {
			/* Handle old net routines */
			n = dp->get_info(page, &start, *ppos, count);
			/* A short result is taken to mean end of file. */
			if (n < count)
				eof = 1;
		} else if (dp->read_proc) {
			/*
			 * How to be a proc read function
			 * ------------------------------
			 * Prototype:
			 *    int f(char *buffer, char **start, off_t offset,
			 *          int count, int *peof, void *dat)
			 *
			 * Assume that the buffer is "count" bytes in size.
			 *
			 * If you know you have supplied all the data you
			 * have, set *peof.
			 *
			 * You have three ways to return data:
			 * 0) Leave *start = NULL.  (This is the default.)
			 *    Put the data of the requested offset at that
			 *    offset within the buffer.  Return the number (n)
			 *    of bytes there are from the beginning of the
			 *    buffer up to the last byte of data.  If the
			 *    number of supplied bytes (= n - offset) is
			 *    greater than zero and you didn't signal eof
			 *    and the reader is prepared to take more data
			 *    you will be called again with the requested
			 *    offset advanced by the number of bytes
			 *    absorbed.  This interface is useful for files
			 *    no larger than the buffer.
			 * 1) Set *start = an unsigned long value less than
			 *    the buffer address but greater than zero.
			 *    Put the data of the requested offset at the
			 *    beginning of the buffer.  Return the number of
			 *    bytes of data placed there.  If this number is
			 *    greater than zero and you didn't signal eof
			 *    and the reader is prepared to take more data
			 *    you will be called again with the requested
			 *    offset advanced by *start.  This interface is
			 *    useful when you have a large file consisting
			 *    of a series of blocks which you want to count
			 *    and return as wholes.
			 *    (Hack by Paul.Russell@rustcorp.com.au)
			 * 2) Set *start = an address within the buffer.
			 *    Put the data of the requested offset at *start.
			 *    Return the number of bytes of data placed there.
			 *    If this number is greater than zero and you
			 *    didn't signal eof and the reader is prepared to
			 *    take more data you will be called again with the
			 *    requested offset advanced by the number of bytes
			 *    absorbed.
			 */
			n = dp->read_proc(page, &start, *ppos,
					  count, &eof, dp->data);
		} else
			break;	/* entry has no read handler at all */

		if (n == 0)   /* end of file */
			break;
		if (n < 0) {  /* error */
			/* Report the error only if no data was returned yet. */
			if (retval == 0)
				retval = n;
			break;
		}

		if (start == NULL) {
			/* Convention 0: n counts from the start of the page. */
			if (n > PAGE_SIZE) {
				printk(KERN_ERR
				       "proc_file_read: Apparent buffer overflow!\n");
				n = PAGE_SIZE;
			}
			/* Keep only the bytes at or beyond the current offset. */
			n -= *ppos;
			if (n <= 0)
				break;
			if (n > count)
				n = count;
			start = page + *ppos;
		} else if (start < page) {
			/* Convention 1: data sits at the start of the page. */
			if (n > PAGE_SIZE) {
				printk(KERN_ERR
				       "proc_file_read: Apparent buffer overflow!\n");
				n = PAGE_SIZE;
			}
			if (n > count) {
				/*
				 * Don't reduce n because doing so might
				 * cut off part of a data block.
				 *
				 * NOTE(review): n is left larger than count
				 * here, so the copy_to_user() and the
				 * "nbytes -= n" below operate with more bytes
				 * than were requested for this iteration.
				 */
				printk(KERN_WARNING
				       "proc_file_read: Read count exceeded\n");
			}
		} else /* start >= page */ {
			/* Convention 2: data sits at *start inside the page. */
			unsigned long startoff = (unsigned long)(start - page);
			if (n > (PAGE_SIZE - startoff)) {
				printk(KERN_ERR
				       "proc_file_read: Apparent buffer overflow!\n");
				n = PAGE_SIZE - startoff;
			}
			if (n > count)
				n = count;
		}
		
		/* copy_to_user() returns the bytes NOT copied; n becomes bytes copied. */
 		n -= copy_to_user(buf, start < page ? page : start, n);
		if (n == 0) {
			if (retval == 0)
				retval = -EFAULT;
			break;
		}

		/*
		 * Under convention 1 the handler's *start value is the amount
		 * to advance the offset by; otherwise advance by bytes copied.
		 */
		*ppos += start < page ? (unsigned long)start : n;
		nbytes -= n;
		buf += n;
		retval += n;
	}
	free_page((unsigned long) page);
	return retval;
}

/*
 * Write handler for legacy /proc files: forwards the user buffer to the
 * entry's ->write_proc callback together with the entry's private data.
 * Returns the callback's result, or -EIO when the entry has no
 * ->write_proc.  @ppos is not passed on to the callback.
 */
static ssize_t
proc_file_write(struct file *file, const char __user *buffer,
		size_t count, loff_t *ppos)
{
	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);

	/* FIXME: does this routine need ppos?  probably... */
	if (pde->write_proc)
		return pde->write_proc(file, buffer, count, pde->data);

	return -EIO;
}


/*
 * llseek for legacy /proc files.
 *
 * Only origin 0 (absolute) and origin 1 (relative to the current file
 * position) are handled.  Any other origin, or a resulting offset outside
 * [0, MAX_NON_LFS], returns -EINVAL and leaves f_pos unchanged; otherwise
 * f_pos is updated and the new offset is returned.
 */
static loff_t
proc_file_lseek(struct file *file, loff_t offset, int orig)
{
	loff_t target = offset;

	if (orig != 0 && orig != 1)
		return -EINVAL;

	if (orig == 1)
		target += file->f_pos;

	if (target < 0 || target > MAX_NON_LFS)
		return -EINVAL;

	file->f_pos = target;
	return target;
}

/*
 * setattr hook: validate and apply the attribute change to the inode and,
 * if both steps succeed, mirror the inode's uid, gid and mode into the
 * backing proc_dir_entry.  Returns 0 or the first error encountered.
 */
static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
{
	struct inode *inode = dentry->d_inode;
	struct proc_dir_entry *pde = PDE(inode);
	int err;

	err = inode_change_ok(inode, iattr);
	if (!err)
		err = inode_setattr(inode, iattr);

	if (!err) {
		pde->uid = inode->i_uid;
		pde->gid = inode->i_gid;
		pde->mode = inode->i_mode;
	}
	return err;
}

static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry,
			struct kstat *stat)
{
	struct inode *inode = dentry->d_inode;
	struct proc_dir_entry *de = PROC_I(inode)->pde;
	if (de && de->nlink)
		inode->i_nlink = de->nlink;

	generic_fillattr(inode, stat);
	return 0;
}

/*
 * Inode operations for regular /proc files; proc_register() installs this
 * table when an entry does not supply its own ->proc_iops.
 */
static const struct inode_operations proc_file_inode_operations = {
	.setattr	= proc_notify_change,
};

/*
 * Resolve the directory part of a '/'-separated proc path.
 *
 * For a name such as "tty/driver/serial" the walk starts at proc_root and
 * descends one component at a time; on success *ret is the entry for
 * "/proc/tty/driver" and *residual points at the last component
 * ("serial") inside @name.  A name containing no '/' resolves to
 * proc_root itself.
 *
 * Returns 0 on success.  Returns -ENOENT, leaving *ret and *residual
 * untouched, when an intermediate component is not found.  The whole walk
 * runs under proc_subdir_lock.
 */
static int xlate_proc_name(const char *name,
			   struct proc_dir_entry **ret, const char **residual)
{
	struct proc_dir_entry *dir = &proc_root;
	const char *component = name;
	const char *slash;
	int status = 0;

	spin_lock(&proc_subdir_lock);
	while ((slash = strchr(component, '/')) != NULL) {
		int complen = slash - component;

		/* Scan the children of the current directory for this component. */
		dir = dir->subdir;
		while (dir && !proc_match(complen, component, dir))
			dir = dir->next;

		if (!dir) {
			status = -ENOENT;
			break;
		}
		component = slash + 1;
	}
	if (status == 0) {
		*residual = component;
		*ret = dir;
	}
	spin_unlock(&proc_subdir_lock);
	return status;
}

/* Allocator state for dynamically assigned /proc inode numbers. */
static DEFINE_IDR(proc_inum_idr);
static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */

/* Dynamic inode numbers are this base plus the id handed out by the idr. */
#define PROC_DYNAMIC_FIRST 0xF0000000UL

/*
 * Allocate a dynamic /proc inode number.
 *
 * An id is taken from proc_inum_idr (under proc_inum_lock) and offset by
 * PROC_DYNAMIC_FIRST.  Returns an inode number between PROC_DYNAMIC_FIRST
 * and 0xffffffff, or zero on failure.
 *
 * Failure covers: idr_pre_get() being unable to preallocate, idr_get_new()
 * returning an error other than -EAGAIN (which is retried), and an id so
 * large that adding PROC_DYNAMIC_FIRST would not fit in 32 bits.  In the
 * last case the id is handed back to the idr before returning, so nothing
 * is leaked and no wrapped-around inode number is ever produced.
 */
static unsigned int get_inode_number(void)
{
	unsigned int id;
	int i;
	int error;

retry:
	if (idr_pre_get(&proc_inum_idr, GFP_KERNEL) == 0)
		return 0;

	spin_lock(&proc_inum_lock);
	error = idr_get_new(&proc_inum_idr, NULL, &i);
	spin_unlock(&proc_inum_lock);
	if (error == -EAGAIN)
		goto retry;
	else if (error)
		return 0;

	id = i & MAX_ID_MASK;

	/* Refuse ids that would overflow the 32-bit inode number space. */
	if (id > 0xffffffffU - PROC_DYNAMIC_FIRST) {
		spin_lock(&proc_inum_lock);
		idr_remove(&proc_inum_idr, i);
		spin_unlock(&proc_inum_lock);
		return 0;
	}

	return PROC_DYNAMIC_FIRST + id;
}

/*
 * Give a dynamic inode number back to the allocator: subtract
 * PROC_DYNAMIC_FIRST to recover the idr id (with the bits outside
 * MAX_ID_MASK set) and remove it from proc_inum_idr under proc_inum_lock.
 */
static void release_inode_number(unsigned int inum)
{
	int idr_id;

	idr_id = (inum - PROC_DYNAMIC_FIRST) | ~MAX_ID_MASK;

	spin_lock(&proc_inum_lock);
	idr_remove(&proc_inum_idr, idr_id);
	spin_unlock(&proc_inum_lock);
}

static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd)
{
	nd_set_link(nd, PDE(dentry->d_inode)->data);
	return NULL;
}

/*
 * Inode operations for /proc symlinks; proc_register() installs this table
 * when a symlink entry does not supply its own ->proc_iops.
 */
static const struct inode_operations proc_link_inode_operations = {
	.readlink	= generic_readlink,
	.follow_link	= proc_follow_link,
};

/*
 * d_delete hook for proc dentries; unconditionally returns 1.
 *
 * As some entries in /proc are volatile, we want to
 * get rid of unused dentries.  This could be made
 * smarter: we could keep a "volatile" flag in the
 * inode to indicate which ones to keep.
 */
static int proc_delete_dentry(struct dentry * dentry)
{
	return 1;
}

/*
 * d_revalidate hook for proc dentries: always drops the dentry with
 * d_drop() and returns 0.  @nd is unused.
 */
static int proc_revalidate_dentry(struct dentry *dentry, struct nameidata *nd)
{
	d_drop(dentry);
	return 0;
}

/* Dentry operations that proc_lookup() attaches to every dentry it instantiates. */
static struct dentry_operations proc_dentry_operations =
{
	.d_delete	= proc_delete_dentry,
	.d_revalidate	= proc_revalidate_dentry,
};

/*
 * Look up dentry->d_name among the children of the proc entry behind @dir
 * and, on a match, instantiate @dentry with an inode for that child.
 *
 * Don't create negative dentries here, return -ENOENT by hand
 * instead.
 *
 * Returns NULL after d_add() on success, ERR_PTR(-ENOENT) when no child
 * matches (or @dir has no proc entry), and ERR_PTR(-EINVAL) when a child
 * matched but proc_get_inode() returned NULL.
 */
struct dentry *proc_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
{
	struct inode *inode = NULL;
	struct proc_dir_entry * de;
	int error = -ENOENT;

	lock_kernel();
	spin_lock(&proc_subdir_lock);
	de = PDE(dir);
	if (de) {
		for (de = de->subdir; de ; de = de->next) {
			if (de->namelen != dentry->d_name.len)
				continue;
			if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {
				unsigned int ino;

				/* A shadow hook may substitute a different entry for this task. */
				if (de->shadow_proc)
					de = de->shadow_proc(current, de);
				ino = de->low_ino;
				/*
				 * de_get() is called before the list lock is
				 * dropped for proc_get_inode() -- presumably it
				 * pins the entry; its definition is not in this
				 * file.
				 */
				de_get(de);
				spin_unlock(&proc_subdir_lock);
				error = -EINVAL;
				inode = proc_get_inode(dir->i_sb, ino, de);
				/* Retake the lock so the common unlock below balances. */
				spin_lock(&proc_subdir_lock);
				break;
			}
		}
	}
	spin_unlock(&proc_subdir_lock);
	unlock_kernel();

	if (inode) {
		dentry->d_op = &proc_dentry_operations;
		d_add(dentry, inode);
		return NULL;
	}
	/*
	 * Failure path: undo the de_get() above.
	 * NOTE(review): de is NULL here when nothing matched, so de_put() is
	 * presumably NULL-tolerant -- confirm against its definition.
	 */
	de_put(de);
	return ERR_PTR(error);
}

/*
 * Emit the entries of a generic /proc directory through @filldir, resuming
 * at filp->f_pos: position 0 is ".", position 1 is "..", and positions
 * from 2 onwards index the directory entry's ->subdir list.
 *
 * This returns non-zero if at EOF, so that the /proc
 * root directory can use this and check if it should
 * continue with the <pid> entries..
 *
 * Note that the VFS-layer doesn't care about the return
 * value of the readdir() call, as long as it's non-negative
 * for success..
 *
 * Returns 1 once the end of the ->subdir list has been reached, 0 when
 * @filldir stopped the walk early, and -EINVAL when the inode has no proc
 * entry.
 */
int proc_readdir(struct file * filp,
	void * dirent, filldir_t filldir)
{
	struct proc_dir_entry * de;
	unsigned int ino;
	int i;
	struct inode *inode = filp->f_path.dentry->d_inode;
	int ret = 0;

	lock_kernel();

	ino = inode->i_ino;
	de = PDE(inode);
	if (!de) {
		ret = -EINVAL;
		goto out;
	}
	i = filp->f_pos;
	switch (i) {
		case 0:
			if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
				goto out;
			i++;
			filp->f_pos++;
			/* fall through */
		case 1:
			if (filldir(dirent, "..", 2, i,
				    parent_ino(filp->f_path.dentry),
				    DT_DIR) < 0)
				goto out;
			i++;
			filp->f_pos++;
			/* fall through */
		default:
			spin_lock(&proc_subdir_lock);
			de = de->subdir;
			/* Convert the file position into an index into ->subdir. */
			i -= 2;
			/* Skip the entries that were already returned. */
			for (;;) {
				if (!de) {
					ret = 1;
					spin_unlock(&proc_subdir_lock);
					goto out;
				}
				if (!i)
					break;
				de = de->next;
				i--;
			}

			do {
				struct proc_dir_entry *next;

				/* filldir passes info to user space */
				/*
				 * The list lock is dropped around filldir();
				 * de_get()/de_put() bracket that window
				 * (presumably pinning the entry -- they are
				 * defined elsewhere).
				 */
				de_get(de);
				spin_unlock(&proc_subdir_lock);
				if (filldir(dirent, de->name, de->namelen, filp->f_pos,
					    de->low_ino, de->mode >> 12) < 0) {
					de_put(de);
					goto out;
				}
				spin_lock(&proc_subdir_lock);
				filp->f_pos++;
				/* Read ->next under the lock before dropping our reference. */
				next = de->next;
				de_put(de);
				de = next;
			} while (de);
			spin_unlock(&proc_subdir_lock);
	}
	ret = 1;
out:	unlock_kernel();
	return ret;	
}

/*
 * These are the generic /proc directory operations. They
 * use the in-memory "struct proc_dir_entry" tree to parse
 * the /proc directory.  proc_register() installs them for
 * directory entries that have no ->proc_iops of their own.
 */
static const struct file_operations proc_dir_operations = {
	.read			= generic_read_dir,
	.readdir		= proc_readdir,
};

/*
 * proc directories can do almost nothing..
 * Only name lookup and attribute get/set are provided.
 */
static const struct inode_operations proc_dir_inode_operations = {
	.lookup		= proc_lookup,
	.getattr	= proc_getattr,
	.setattr	= proc_notify_change,
};

static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp)
{
	unsigned int i;
	
	i = get_inode_number();
	if (i == 0)
		return -EAGAIN;
	dp->low_ino = i;

	if (S_ISDIR(dp->mode)) {
		if (dp->proc_iops == NULL) {
			dp->proc_fops = &proc_dir_operations;
			dp->proc_iops = &proc_dir_inode_operations;
		}
		dir->nlink++;
	} else if (S_ISLNK(dp->mode)) {
		if (dp->proc_iops == NULL)
			dp->proc_iops = &proc_link_inode_operations;
	} else if (S_ISREG(dp->mode)) {
		if (dp->proc_fops == NULL)
			dp->proc_fops = &proc_file_operations;
		if (dp->proc_iops == NULL)
			dp->proc_iops = &proc_file_inode_operations;
	}

	spin_lock(&proc_subdir_lock);
	dp->next = dir->subdir;
	dp->parent = dir;
	dir->subdir = dp;
	spin_unlock(&proc_subdir_lock);

	return 0;
}

/*
 * Allocate and initialise (but do not register) a proc_dir_entry.
 *
 * When *@parent is NULL, @name is treated as a path from the proc root:
 * its directory part is resolved with xlate_proc_name(), which stores the
 * containing directory in *@parent.  The entry's name is copied into the
 * same allocation, directly behind the structure.
 *
 * Returns the new entry with a reference count of 1, or NULL when @name is
 * NULL or empty, the path cannot be resolved, the final component still
 * contains a '/', or the allocation fails.
 */
static struct proc_dir_entry *proc_create(struct proc_dir_entry **parent,
					  const char *name,
					  mode_t mode,
					  nlink_t nlink)
{
	struct proc_dir_entry *pde;
	const char *leaf = name;
	char *namebuf;
	int leaf_len;

	/* Reject a missing or empty name. */
	if (!name || name[0] == '\0')
		return NULL;

	if (*parent == NULL && xlate_proc_name(name, parent, &leaf) != 0)
		return NULL;

	/* The final component itself must not contain any '/'. */
	if (strchr(leaf, '/') != NULL)
		return NULL;

	leaf_len = strlen(leaf);

	pde = kmalloc(sizeof(struct proc_dir_entry) + leaf_len + 1, GFP_KERNEL);
	if (pde == NULL)
		return NULL;

	memset(pde, 0, sizeof(struct proc_dir_entry));

	/* The name (including its terminating NUL) lives right after the struct. */
	namebuf = ((char *) pde) + sizeof(struct proc_dir_entry);
	memcpy(namebuf, leaf, leaf_len + 1);

	pde->name = namebuf;
	pde->namelen = leaf_len;
	pde->mode = mode;
	pde->nlink = nlink;
	atomic_set(&pde->count, 1);
	pde->pde_users = 0;
	spin_lock_init(&pde->pde_unload_lock);
	pde->pde_unload_completion = NULL;

	return pde;
}

/*
 * Create a symlink called @name under @parent that points at @dest.
 *
 * The target string is duplicated into ->data and its length recorded in
 * ->size.  Returns the registered entry, or NULL if the entry cannot be
 * created, the target copy cannot be allocated, or registration fails; on
 * failure everything allocated here is freed again.
 */
struct proc_dir_entry *proc_symlink(const char *name,
		struct proc_dir_entry *parent, const char *dest)
{
	struct proc_dir_entry *link;

	link = proc_create(&parent, name,
			   (S_IFLNK | S_IRUGO | S_IWUGO | S_IXUGO), 1);
	if (!link)
		return NULL;

	link->size = strlen(dest);
	link->data = kmalloc(link->size + 1, GFP_KERNEL);
	if (!link->data)
		goto free_entry;

	strcpy((char *)link->data, dest);
	if (proc_register(parent, link) < 0)
		goto free_target;

	return link;

free_target:
	kfree(link->data);
free_entry:
	kfree(link);
	return NULL;
}

/*
 * Create and register a directory called @name under @parent with the
 * permission bits in @mode (S_IFDIR is added here) and a link count of 2.
 * Returns the new entry, or NULL if creation or registration fails.
 */
struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode,
		struct proc_dir_entry *parent)
{
	struct proc_dir_entry *dir_ent;

	dir_ent = proc_create(&parent, name, S_IFDIR | mode, 2);
	if (!dir_ent)
		return NULL;

	if (proc_register(parent, dir_ent) < 0) {
		kfree(dir_ent);
		return NULL;
	}

	return dir_ent;
}

/*
 * Convenience wrapper around proc_mkdir_mode() that creates the directory
 * with the default permissions S_IRUGO | S_IXUGO.
 */
struct proc_dir_entry *proc_mkdir(const char *name,
		struct proc_dir_entry *parent)
{
	const mode_t default_mode = S_IRUGO | S_IXUGO;

	return proc_mkdir_mode(name, default_mode, parent);
}

/*
 * Create and register a /proc entry called @name under @parent.
 *
 * Missing pieces of @mode are defaulted: a mode with no file-type bits
 * becomes a regular file, and a mode with no permission bits gets S_IRUGO
 * (plus S_IXUGO for directories).  Directories start with a link count of
 * 2, everything else with 1.
 *
 * Returns the new entry, or NULL if creation or registration fails.
 */
struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
					 struct proc_dir_entry *parent)
{
	struct proc_dir_entry *pde;
	int is_dir = S_ISDIR(mode);
	nlink_t nlink = is_dir ? 2 : 1;

	if (!is_dir && (mode & S_IFMT) == 0)
		mode |= S_IFREG;

	if ((mode & S_IALLUGO) == 0)
		mode |= is_dir ? (S_IRUGO | S_IXUGO) : S_IRUGO;

	pde = proc_create(&parent, name, mode, nlink);
	if (!pde)
		return NULL;

	if (proc_register(parent, pde) < 0) {
		kfree(pde);
		return NULL;
	}

	return pde;
}

/*
 * Release a proc entry that carries a dynamically allocated inode number:
 * return the number to the allocator, free a symlink's target string, and
 * free the entry itself.  Entries whose inode number is below
 * PROC_DYNAMIC_FIRST are left completely untouched.
 */
void free_proc_entry(struct proc_dir_entry *de)
{
	unsigned int inum = de->low_ino;

	if (inum >= PROC_DYNAMIC_FIRST) {
		release_inode_number(inum);

		if (de->data && S_ISLNK(de->mode))
			kfree(de->data);
		kfree(de);
	}
}

/*
 * Remove a /proc entry and free it if it's not currently in use.
 *
 * @name is looked up among the children of @parent; when @parent is NULL
 * the directory part of @name is first resolved from the proc root with
 * xlate_proc_name().  The first matching child is unlinked from the list,
 * its ->proc_fops is cleared, and the caller sleeps until ->pde_users has
 * drained.  The entry is freed once its reference count drops to zero.
 * Nothing happens if the path cannot be resolved or no child matches.
 */
void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
{
	struct proc_dir_entry **p;
	struct proc_dir_entry *de;
	const char *fn = name;
	int len;

	if (!parent && xlate_proc_name(name, &parent, &fn) != 0)
		goto out;
	len = strlen(fn);

	spin_lock(&proc_subdir_lock);
	for (p = &parent->subdir; *p; p=&(*p)->next ) {
		if (!proc_match(len, fn, *p))
			continue;
		/* Unlink the entry from its parent's child list. */
		de = *p;
		*p = de->next;
		de->next = NULL;

		spin_lock(&de->pde_unload_lock);
		/*
		 * Stop accepting new callers into module. If you're
		 * dynamically allocating ->proc_fops, save a pointer somewhere.
		 */
		de->proc_fops = NULL;
		/* Wait until all existing callers into module are done. */
		if (de->pde_users > 0) {
			DECLARE_COMPLETION_ONSTACK(c);

			/* Publish the on-stack completion unless one is already set. */
			if (!de->pde_unload_completion)
				de->pde_unload_completion = &c;

			/* Both spinlocks are dropped before sleeping. */
			spin_unlock(&de->pde_unload_lock);
			spin_unlock(&proc_subdir_lock);

			wait_for_completion(de->pde_unload_completion);

			spin_lock(&proc_subdir_lock);
			/*
			 * pde_unload_lock was already released above, so jump
			 * past the unlock on the no-waiters path.
			 */
			goto continue_removing;
		}
		spin_unlock(&de->pde_unload_lock);

continue_removing:
		/* Mirror of the nlink++ done in proc_register() for directories. */
		if (S_ISDIR(de->mode))
			parent->nlink--;
		de->nlink = 0;
		/* Warn if the removed entry still has children linked under it. */
		WARN_ON(de->subdir);
		if (atomic_dec_and_test(&de->count))
			free_proc_entry(de);
		/* Only the first matching entry is removed. */
		break;
	}
	spin_unlock(&proc_subdir_lock);
out:
	return;
}
Commit	Line	Data
1da177e4 LT	1	/*
	2	* proc/fs/generic.c --- generic routines for the proc-fs
	3	*
	4	* This file contains generic proc-fs routines for handling
	5	* directories and files.
	6	*
	7	* Copyright (C) 1991, 1992 Linus Torvalds.
	8	* Copyright (C) 1997 Theodore Ts'o
	9	*/
	10
	11	#include <linux/errno.h>
	12	#include <linux/time.h>
	13	#include <linux/proc_fs.h>
	14	#include <linux/stat.h>
	15	#include <linux/module.h>
	16	#include <linux/mount.h>
	17	#include <linux/smp_lock.h>
	18	#include <linux/init.h>
	19	#include <linux/idr.h>
	20	#include <linux/namei.h>
	21	#include <linux/bitops.h>
64a07bd8	22	#include <linux/spinlock.h>
786d7e16	23	#include <linux/completion.h>
1da177e4 LT	24	#include <asm/uaccess.h>
1da177e4 LT	25
fee781e6 AB	26	#include "internal.h"
fee781e6 AB	27
1da177e4 LT	28	static ssize_t proc_file_read(struct file file, char __user buf,
	29	size_t nbytes, loff_t *ppos);
	30	static ssize_t proc_file_write(struct file file, const char __user buffer,
	31	size_t count, loff_t *ppos);
	32	static loff_t proc_file_lseek(struct file *, loff_t, int);
	33
64a07bd8 SR	34	DEFINE_SPINLOCK(proc_subdir_lock);
64a07bd8 SR	35
77b14db5	36	static int proc_match(int len, const char name, struct proc_dir_entry de)
1da177e4 LT	37	{
	38	if (de->namelen != len)
	39	return 0;
	40	return !memcmp(name, de->name, len);
	41	}
	42
00977a59	43	static const struct file_operations proc_file_operations = {
1da177e4 LT	44	.llseek = proc_file_lseek,
	45	.read = proc_file_read,
	46	.write = proc_file_write,
	47	};
	48
	49	/* buffer size is one page but our output routines use some slack for overruns */
	50	#define PROC_BLOCK_SIZE (PAGE_SIZE - 1024)
	51
	52	static ssize_t
	53	proc_file_read(struct file file, char __user buf, size_t nbytes,
	54	loff_t *ppos)
	55	{
2fddfeef	56	struct inode * inode = file->f_path.dentry->d_inode;
1da177e4 LT	57	char *page;
	58	ssize_t retval=0;
	59	int eof=0;
	60	ssize_t n, count;
	61	char *start;
	62	struct proc_dir_entry * dp;
8b90db0d LT	63	unsigned long long pos;
	64
	65	/*
	66	* Gaah, please just use "seq_file" instead. The legacy /proc
	67	* interfaces cut loff_t down to off_t for reads, and ignore
	68	* the offset entirely for writes..
	69	*/
	70	pos = *ppos;
	71	if (pos > MAX_NON_LFS)
	72	return 0;
	73	if (nbytes > MAX_NON_LFS - pos)
	74	nbytes = MAX_NON_LFS - pos;
1da177e4 LT	75
1da177e4 LT	76	dp = PDE(inode);
e12ba74d	77	if (!(page = (char*) __get_free_page(GFP_TEMPORARY)))
1da177e4 LT	78	return -ENOMEM;
	79
	80	while ((nbytes > 0) && !eof) {
	81	count = min_t(size_t, PROC_BLOCK_SIZE, nbytes);
	82
	83	start = NULL;
	84	if (dp->get_info) {
	85	/* Handle old net routines */
	86	n = dp->get_info(page, &start, *ppos, count);
	87	if (n < count)
	88	eof = 1;
	89	} else if (dp->read_proc) {
	90	/*
	91	* How to be a proc read function
	92	* ------------------------------
	93	* Prototype:
	94	* int f(char buffer, char *start, off_t offset,
	95	* int count, int peof, void dat)
	96	*
	97	* Assume that the buffer is "count" bytes in size.
	98	*
	99	* If you know you have supplied all the data you
	100	* have, set *peof.
	101	*
	102	* You have three ways to return data:
	103	* 0) Leave *start = NULL. (This is the default.)
	104	* Put the data of the requested offset at that
	105	* offset within the buffer. Return the number (n)
	106	* of bytes there are from the beginning of the
	107	* buffer up to the last byte of data. If the
	108	* number of supplied bytes (= n - offset) is
	109	* greater than zero and you didn't signal eof
	110	* and the reader is prepared to take more data
	111	* you will be called again with the requested
	112	* offset advanced by the number of bytes
	113	* absorbed. This interface is useful for files
	114	* no larger than the buffer.
	115	* 1) Set *start = an unsigned long value less than
	116	* the buffer address but greater than zero.
	117	* Put the data of the requested offset at the
	118	* beginning of the buffer. Return the number of
	119	* bytes of data placed there. If this number is
	120	* greater than zero and you didn't signal eof
	121	* and the reader is prepared to take more data
	122	* you will be called again with the requested
	123	* offset advanced by *start. This interface is
	124	* useful when you have a large file consisting
	125	* of a series of blocks which you want to count
	126	* and return as wholes.
	127	* (Hack by Paul.Russell@rustcorp.com.au)
	128	* 2) Set *start = an address within the buffer.
	129	* Put the data of the requested offset at *start.
	130	* Return the number of bytes of data placed there.
	131	* If this number is greater than zero and you
	132	* didn't signal eof and the reader is prepared to
	133	* take more data you will be called again with the
	134	* requested offset advanced by the number of bytes
	135	* absorbed.
	136	*/
	137	n = dp->read_proc(page, &start, *ppos,
	138	count, &eof, dp->data);
	139	} else
	140	break;
	141
142	if (n == 0) /* end of file */
143	break;
144	if (n < 0) { /* error */
145	if (retval == 0)
146	retval = n;
147	break;
148	}
149
150	if (start == NULL) {
151	if (n > PAGE_SIZE) {
152	printk(KERN_ERR
153	"proc_file_read: Apparent buffer overflow!\n");
154	n = PAGE_SIZE;
155	}
156	n -= *ppos;
157	if (n <= 0)
158	break;
159	if (n > count)
160	n = count;
161	start = page + *ppos;
162	} else if (start < page) {
163	if (n > PAGE_SIZE) {
164	printk(KERN_ERR
165	"proc_file_read: Apparent buffer overflow!\n");
166	n = PAGE_SIZE;
167	}
168	if (n > count) {
169	/*
170	* Don't reduce n because doing so might
171	* cut off part of a data block.
172	*/
173	printk(KERN_WARNING
174	"proc_file_read: Read count exceeded\n");
175	}
176	} else /* start >= page */ {
177	unsigned long startoff = (unsigned long)(start - page);
178	if (n > (PAGE_SIZE - startoff)) {
179	printk(KERN_ERR
180	"proc_file_read: Apparent buffer overflow!\n");
181	n = PAGE_SIZE - startoff;
182	}
183	if (n > count)
184	n = count;
185	}
186
187	n -= copy_to_user(buf, start < page ? page : start, n);
188	if (n == 0) {
189	if (retval == 0)
190	retval = -EFAULT;
191	break;
192	}
193
194	*ppos += start < page ? (unsigned long)start : n;
195	nbytes -= n;
196	buf += n;
197	retval += n;
198	}
199	free_page((unsigned long) page);
200	return retval;
201	}
202
203	static ssize_t
204	proc_file_write(struct file file, const char __user buffer,
205	size_t count, loff_t *ppos)
206	{
2fddfeef	207	struct inode *inode = file->f_path.dentry->d_inode;
1da177e4 LT	208	struct proc_dir_entry * dp;
	209
	210	dp = PDE(inode);
	211
	212	if (!dp->write_proc)
	213	return -EIO;
	214
	215	/* FIXME: does this routine need ppos? probably... */
	216	return dp->write_proc(file, buffer, count, dp->data);
	217	}
	218
	219
	220	static loff_t
	221	proc_file_lseek(struct file *file, loff_t offset, int orig)
	222	{
8b90db0d LT	223	loff_t retval = -EINVAL;
	224	switch (orig) {
	225	case 1:
	226	offset += file->f_pos;
	227	/* fallthrough */
	228	case 0:
	229	if (offset < 0 \|\| offset > MAX_NON_LFS)
	230	break;
	231	file->f_pos = retval = offset;
	232	}
	233	return retval;
1da177e4 LT	234	}
	235
	236	static int proc_notify_change(struct dentry dentry, struct iattr iattr)
	237	{
	238	struct inode *inode = dentry->d_inode;
	239	struct proc_dir_entry *de = PDE(inode);
	240	int error;
	241
	242	error = inode_change_ok(inode, iattr);
	243	if (error)
	244	goto out;
	245
	246	error = inode_setattr(inode, iattr);
	247	if (error)
	248	goto out;
	249
	250	de->uid = inode->i_uid;
	251	de->gid = inode->i_gid;
	252	de->mode = inode->i_mode;
	253	out:
	254	return error;
	255	}
	256
2b579bee MS	257	static int proc_getattr(struct vfsmount mnt, struct dentry dentry,
	258	struct kstat *stat)
	259	{
	260	struct inode *inode = dentry->d_inode;
	261	struct proc_dir_entry *de = PROC_I(inode)->pde;
	262	if (de && de->nlink)
	263	inode->i_nlink = de->nlink;
	264
	265	generic_fillattr(inode, stat);
	266	return 0;
	267	}
	268
c5ef1c42	269	static const struct inode_operations proc_file_inode_operations = {
1da177e4 LT	270	.setattr = proc_notify_change,
	271	};
	272
	273	/*
	274	* This function parses a name such as "tty/driver/serial", and
	275	* returns the struct proc_dir_entry for "/proc/tty/driver", and
	276	* returns "serial" in residual.
	277	*/
	278	static int xlate_proc_name(const char *name,
	279	struct proc_dir_entry ret, const char residual)
	280	{
	281	const char cp = name, next;
	282	struct proc_dir_entry *de;
	283	int len;
64a07bd8	284	int rtn = 0;
1da177e4	285
64a07bd8	286	spin_lock(&proc_subdir_lock);
1da177e4 LT	287	de = &proc_root;
	288	while (1) {
	289	next = strchr(cp, '/');
	290	if (!next)
	291	break;
	292
	293	len = next - cp;
	294	for (de = de->subdir; de ; de = de->next) {
	295	if (proc_match(len, cp, de))
	296	break;
	297	}
64a07bd8 SR	298	if (!de) {
	299	rtn = -ENOENT;
	300	goto out;
	301	}
1da177e4 LT	302	cp += len + 1;
	303	}
	304	*residual = cp;
	305	*ret = de;
64a07bd8 SR	306	out:
	307	spin_unlock(&proc_subdir_lock);
	308	return rtn;
1da177e4 LT	309	}
	310
	311	static DEFINE_IDR(proc_inum_idr);
	312	static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */
	313
	314	#define PROC_DYNAMIC_FIRST 0xF0000000UL
	315
	316	/*
	317	* Return an inode number between PROC_DYNAMIC_FIRST and
	318	* 0xffffffff, or zero on failure.
	319	*/
	320	static unsigned int get_inode_number(void)
	321	{
	322	int i, inum = 0;
	323	int error;
	324
	325	retry:
	326	if (idr_pre_get(&proc_inum_idr, GFP_KERNEL) == 0)
	327	return 0;
	328
	329	spin_lock(&proc_inum_lock);
	330	error = idr_get_new(&proc_inum_idr, NULL, &i);
	331	spin_unlock(&proc_inum_lock);
	332	if (error == -EAGAIN)
	333	goto retry;
	334	else if (error)
	335	return 0;
	336
	337	inum = (i & MAX_ID_MASK) + PROC_DYNAMIC_FIRST;
	338
	339	/* inum will never be more than 0xf0ffffff, so no check
	340	* for overflow.
	341	*/
	342
	343	return inum;
	344	}
	345
	346	static void release_inode_number(unsigned int inum)
	347	{
	348	int id = (inum - PROC_DYNAMIC_FIRST) \| ~MAX_ID_MASK;
	349
	350	spin_lock(&proc_inum_lock);
	351	idr_remove(&proc_inum_idr, id);
	352	spin_unlock(&proc_inum_lock);
	353	}
	354
008b150a	355	static void proc_follow_link(struct dentry dentry, struct nameidata *nd)
1da177e4 LT	356	{
1da177e4 LT	357	nd_set_link(nd, PDE(dentry->d_inode)->data);
008b150a	358	return NULL;
1da177e4 LT	359	}
1da177e4 LT	360
c5ef1c42	361	static const struct inode_operations proc_link_inode_operations = {
1da177e4 LT	362	.readlink = generic_readlink,
	363	.follow_link = proc_follow_link,
	364	};
	365
	366	/*
	367	* As some entries in /proc are volatile, we want to
	368	* get rid of unused dentries. This could be made
	369	* smarter: we could keep a "volatile" flag in the
	370	* inode to indicate which ones to keep.
	371	*/
	372	static int proc_delete_dentry(struct dentry * dentry)
	373	{
	374	return 1;
	375	}
	376
2b1e300a EB	377	static int proc_revalidate_dentry(struct dentry dentry, struct nameidata nd)
	378	{
	379	d_drop(dentry);
	380	return 0;
	381	}
	382
1da177e4 LT	383	static struct dentry_operations proc_dentry_operations =
	384	{
	385	.d_delete = proc_delete_dentry,
2b1e300a	386	.d_revalidate = proc_revalidate_dentry,
1da177e4 LT	387	};
	388
	389	/*
	390	* Don't create negative dentries here, return -ENOENT by hand
	391	* instead.
	392	*/
	393	struct dentry proc_lookup(struct inode dir, struct dentry dentry, struct nameidata nd)
	394	{
	395	struct inode *inode = NULL;
	396	struct proc_dir_entry * de;
	397	int error = -ENOENT;
	398
	399	lock_kernel();
64a07bd8	400	spin_lock(&proc_subdir_lock);
1da177e4 LT	401	de = PDE(dir);
	402	if (de) {
	403	for (de = de->subdir; de ; de = de->next) {
	404	if (de->namelen != dentry->d_name.len)
	405	continue;
	406	if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {
2b1e300a	407	unsigned int ino;
1da177e4	408
2b1e300a EB	409	if (de->shadow_proc)
	410	de = de->shadow_proc(current, de);
	411	ino = de->low_ino;
7695650a	412	de_get(de);
64a07bd8	413	spin_unlock(&proc_subdir_lock);
1da177e4 LT	414	error = -EINVAL;
1da177e4 LT	415	inode = proc_get_inode(dir->i_sb, ino, de);
64a07bd8	416	spin_lock(&proc_subdir_lock);
1da177e4 LT	417	break;
	418	}
	419	}
	420	}
64a07bd8	421	spin_unlock(&proc_subdir_lock);
1da177e4 LT	422	unlock_kernel();
	423
	424	if (inode) {
	425	dentry->d_op = &proc_dentry_operations;
	426	d_add(dentry, inode);
	427	return NULL;
	428	}
7695650a	429	de_put(de);
1da177e4 LT	430	return ERR_PTR(error);
	431	}
	432
	433	/*
	434	* This returns non-zero if at EOF, so that the /proc
	435	* root directory can use this and check if it should
	436	* continue with the <pid> entries..
	437	*
	438	* Note that the VFS-layer doesn't care about the return
	439	* value of the readdir() call, as long as it's non-negative
	440	* for success..
	441	*/
	442	int proc_readdir(struct file * filp,
	443	void * dirent, filldir_t filldir)
	444	{
	445	struct proc_dir_entry * de;
	446	unsigned int ino;
	447	int i;
2fddfeef	448	struct inode *inode = filp->f_path.dentry->d_inode;
1da177e4 LT	449	int ret = 0;
	450
	451	lock_kernel();
	452
	453	ino = inode->i_ino;
	454	de = PDE(inode);
	455	if (!de) {
	456	ret = -EINVAL;
	457	goto out;
	458	}
	459	i = filp->f_pos;
	460	switch (i) {
	461	case 0:
	462	if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
	463	goto out;
	464	i++;
	465	filp->f_pos++;
	466	/* fall through */
	467	case 1:
	468	if (filldir(dirent, "..", 2, i,
2fddfeef	469	parent_ino(filp->f_path.dentry),
1da177e4 LT	470	DT_DIR) < 0)
	471	goto out;
	472	i++;
	473	filp->f_pos++;
	474	/* fall through */
	475	default:
64a07bd8	476	spin_lock(&proc_subdir_lock);
1da177e4 LT	477	de = de->subdir;
	478	i -= 2;
	479	for (;;) {
	480	if (!de) {
	481	ret = 1;
64a07bd8	482	spin_unlock(&proc_subdir_lock);
1da177e4 LT	483	goto out;
	484	}
	485	if (!i)
	486	break;
	487	de = de->next;
	488	i--;
	489	}
	490
	491	do {
59cd0cbc DW	492	struct proc_dir_entry *next;
59cd0cbc DW	493
64a07bd8	494	/* filldir passes info to user space */
59cd0cbc	495	de_get(de);
64a07bd8	496	spin_unlock(&proc_subdir_lock);
1da177e4	497	if (filldir(dirent, de->name, de->namelen, filp->f_pos,
59cd0cbc DW	498	de->low_ino, de->mode >> 12) < 0) {
59cd0cbc DW	499	de_put(de);
1da177e4	500	goto out;
59cd0cbc	501	}
64a07bd8	502	spin_lock(&proc_subdir_lock);
1da177e4	503	filp->f_pos++;
59cd0cbc DW	504	next = de->next;
	505	de_put(de);
	506	de = next;
1da177e4	507	} while (de);
64a07bd8	508	spin_unlock(&proc_subdir_lock);
1da177e4 LT	509	}
	510	ret = 1;
	511	out: unlock_kernel();
	512	return ret;
	513	}
	514
	515	/*
	516	* These are the generic /proc directory operations. They
	517	* use the in-memory "struct proc_dir_entry" tree to parse
	518	* the /proc directory.
	519	*/
00977a59	520	static const struct file_operations proc_dir_operations = {
1da177e4 LT	521	.read = generic_read_dir,
	522	.readdir = proc_readdir,
	523	};
	524
	525	/*
	526	* proc directories can do almost nothing..
	527	*/
c5ef1c42	528	static const struct inode_operations proc_dir_inode_operations = {
1da177e4	529	.lookup = proc_lookup,
2b579bee	530	.getattr = proc_getattr,
1da177e4 LT	531	.setattr = proc_notify_change,
	532	};
	533
	534	static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp)
	535	{
	536	unsigned int i;
	537
	538	i = get_inode_number();
	539	if (i == 0)
	540	return -EAGAIN;
	541	dp->low_ino = i;
64a07bd8	542
1da177e4 LT	543	if (S_ISDIR(dp->mode)) {
	544	if (dp->proc_iops == NULL) {
	545	dp->proc_fops = &proc_dir_operations;
	546	dp->proc_iops = &proc_dir_inode_operations;
	547	}
	548	dir->nlink++;
	549	} else if (S_ISLNK(dp->mode)) {
	550	if (dp->proc_iops == NULL)
	551	dp->proc_iops = &proc_link_inode_operations;
	552	} else if (S_ISREG(dp->mode)) {
	553	if (dp->proc_fops == NULL)
	554	dp->proc_fops = &proc_file_operations;
	555	if (dp->proc_iops == NULL)
	556	dp->proc_iops = &proc_file_inode_operations;
	557	}
99fc06df CG	558
	559	spin_lock(&proc_subdir_lock);
	560	dp->next = dir->subdir;
	561	dp->parent = dir;
	562	dir->subdir = dp;
	563	spin_unlock(&proc_subdir_lock);
	564
1da177e4 LT	565	return 0;
	566	}
	567
1da177e4 LT	568	static struct proc_dir_entry proc_create(struct proc_dir_entry *parent,
	569	const char *name,
	570	mode_t mode,
	571	nlink_t nlink)
	572	{
	573	struct proc_dir_entry *ent = NULL;
	574	const char *fn = name;
	575	int len;
	576
	577	/* make sure name is valid */
	578	if (!name \|\| !strlen(name)) goto out;
	579
	580	if (!(*parent) && xlate_proc_name(name, parent, &fn) != 0)
	581	goto out;
	582
	583	/* At this point there must not be any '/' characters beyond fn /
	584	if (strchr(fn, '/'))
	585	goto out;
	586
	587	len = strlen(fn);
	588
	589	ent = kmalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL);
	590	if (!ent) goto out;
	591
	592	memset(ent, 0, sizeof(struct proc_dir_entry));
	593	memcpy(((char *) ent) + sizeof(struct proc_dir_entry), fn, len + 1);
	594	ent->name = ((char ) ent) + sizeof(ent);
	595	ent->namelen = len;
	596	ent->mode = mode;
	597	ent->nlink = nlink;
5a622f2d	598	atomic_set(&ent->count, 1);
786d7e16 AD	599	ent->pde_users = 0;
	600	spin_lock_init(&ent->pde_unload_lock);
	601	ent->pde_unload_completion = NULL;
1da177e4 LT	602	out:
	603	return ent;
	604	}
	605
	606	struct proc_dir_entry proc_symlink(const char name,
	607	struct proc_dir_entry parent, const char dest)
	608	{
	609	struct proc_dir_entry *ent;
	610
	611	ent = proc_create(&parent,name,
	612	(S_IFLNK \| S_IRUGO \| S_IWUGO \| S_IXUGO),1);
	613
	614	if (ent) {
	615	ent->data = kmalloc((ent->size=strlen(dest))+1, GFP_KERNEL);
	616	if (ent->data) {
	617	strcpy((char*)ent->data,dest);
	618	if (proc_register(parent, ent) < 0) {
	619	kfree(ent->data);
	620	kfree(ent);
	621	ent = NULL;
	622	}
	623	} else {
	624	kfree(ent);
	625	ent = NULL;
	626	}
	627	}
	628	return ent;
	629	}
	630
	631	struct proc_dir_entry proc_mkdir_mode(const char name, mode_t mode,
	632	struct proc_dir_entry *parent)
	633	{
	634	struct proc_dir_entry *ent;
	635
	636	ent = proc_create(&parent, name, S_IFDIR \| mode, 2);
	637	if (ent) {
1da177e4 LT	638	if (proc_register(parent, ent) < 0) {
	639	kfree(ent);
	640	ent = NULL;
	641	}
	642	}
	643	return ent;
	644	}
	645
	646	struct proc_dir_entry proc_mkdir(const char name,
	647	struct proc_dir_entry *parent)
	648	{
	649	return proc_mkdir_mode(name, S_IRUGO \| S_IXUGO, parent);
	650	}
	651
	652	struct proc_dir_entry create_proc_entry(const char name, mode_t mode,
	653	struct proc_dir_entry *parent)
	654	{
	655	struct proc_dir_entry *ent;
	656	nlink_t nlink;
	657
	658	if (S_ISDIR(mode)) {
	659	if ((mode & S_IALLUGO) == 0)
	660	mode \|= S_IRUGO \| S_IXUGO;
	661	nlink = 2;
	662	} else {
	663	if ((mode & S_IFMT) == 0)
	664	mode \|= S_IFREG;
	665	if ((mode & S_IALLUGO) == 0)
	666	mode \|= S_IRUGO;
	667	nlink = 1;
	668	}
	669
	670	ent = proc_create(&parent,name,mode,nlink);
	671	if (ent) {
1da177e4 LT	672	if (proc_register(parent, ent) < 0) {
	673	kfree(ent);
	674	ent = NULL;
	675	}
	676	}
	677	return ent;
	678	}
	679
	680	void free_proc_entry(struct proc_dir_entry *de)
	681	{
	682	unsigned int ino = de->low_ino;
	683
	684	if (ino < PROC_DYNAMIC_FIRST)
	685	return;
	686
	687	release_inode_number(ino);
	688
	689	if (S_ISLNK(de->mode) && de->data)
	690	kfree(de->data);
	691	kfree(de);
	692	}
	693
	694	/*
	695	* Remove a /proc entry and free it if it's not currently in use.
1da177e4 LT	696	*/
	697	void remove_proc_entry(const char name, struct proc_dir_entry parent)
	698	{
	699	struct proc_dir_entry **p;
	700	struct proc_dir_entry *de;
	701	const char *fn = name;
	702	int len;
	703
	704	if (!parent && xlate_proc_name(name, &parent, &fn) != 0)
	705	goto out;
	706	len = strlen(fn);
64a07bd8 SR	707
64a07bd8 SR	708	spin_lock(&proc_subdir_lock);
1da177e4 LT	709	for (p = &parent->subdir; p; p=&(p)->next ) {
	710	if (!proc_match(len, fn, *p))
	711	continue;
	712	de = *p;
	713	*p = de->next;
	714	de->next = NULL;
786d7e16 AD	715
	716	spin_lock(&de->pde_unload_lock);
	717	/*
	718	* Stop accepting new callers into module. If you're
	719	* dynamically allocating ->proc_fops, save a pointer somewhere.
	720	*/
	721	de->proc_fops = NULL;
	722	/* Wait until all existing callers into module are done. */
	723	if (de->pde_users > 0) {
	724	DECLARE_COMPLETION_ONSTACK(c);
	725
	726	if (!de->pde_unload_completion)
	727	de->pde_unload_completion = &c;
	728
	729	spin_unlock(&de->pde_unload_lock);
	730	spin_unlock(&proc_subdir_lock);
	731
	732	wait_for_completion(de->pde_unload_completion);
	733
	734	spin_lock(&proc_subdir_lock);
	735	goto continue_removing;
	736	}
	737	spin_unlock(&de->pde_unload_lock);
	738
	739	continue_removing:
1da177e4 LT	740	if (S_ISDIR(de->mode))
1da177e4 LT	741	parent->nlink--;
1da177e4 LT	742	de->nlink = 0;
1da177e4 LT	743	WARN_ON(de->subdir);
5a622f2d	744	if (atomic_dec_and_test(&de->count))
1da177e4	745	free_proc_entry(de);
1da177e4 LT	746	break;
1da177e4 LT	747	}
64a07bd8	748	spin_unlock(&proc_subdir_lock);
1da177e4 LT	749	out:
	750	return;
	751	}