Commit e9be9d5e authored by Miklos Szeredi's avatar Miklos Szeredi

overlay filesystem

Overlayfs allows one, usually read-write, directory tree to be
overlaid onto another, read-only directory tree.  All modifications
go to the upper, writable layer.

This type of mechanism is most often used for live CDs but there's a
wide variety of other uses.

The implementation differs from other "union filesystem"
implementations in that after a file is opened all operations go
directly to the underlying, lower or upper, filesystems.  This
simplifies the implementation and allows native performance in these
cases.

The dentry tree is duplicated from the underlying filesystems, this
enables fast cached lookups without adding special support into the
VFS.  This uses slightly more memory than union mounts, but dentries
are relatively small.

Currently inodes are duplicated as well, but it is a possible
optimization to share inodes for non-directories.

Opening non directories results in the open forwarded to the
underlying filesystem.  This makes the behavior very similar to union
mounts (with the same limitations vs. fchmod/fchown on O_RDONLY file
descriptors).

Usage:

  mount -t overlayfs overlayfs -olowerdir=/lower,upperdir=/upper/upper,workdir=/upper/work /overlay

The following cotributions have been folded into this patch:

Neil Brown <neilb@suse.de>:
 - minimal remount support
 - use correct seek function for directories
 - initialise is_real before use
 - rename ovl_fill_cache to ovl_dir_read

Felix Fietkau <nbd@openwrt.org>:
 - fix a deadlock in ovl_dir_read_merged
 - fix a deadlock in ovl_remove_whiteouts

Erez Zadok <ezk@fsl.cs.sunysb.edu>
 - fix cleanup after WARN_ON

Sedat Dilek <sedat.dilek@googlemail.com>
 - fix up permission to confirm to new API

Robin Dong <hao.bigrat@gmail.com>
 - fix possible leak in ovl_new_inode
 - create new inode in ovl_link

Andy Whitcroft <apw@canonical.com>
 - switch to __inode_permission()
 - copy up i_uid/i_gid from the underlying inode

AV:
 - ovl_copy_up_locked() - dput(ERR_PTR(...)) on two failure exits
 - ovl_clear_empty() - one failure exit forgetting to do unlock_rename(),
   lack of check for udir being the parent of upper, dropping and regaining
   the lock on udir (which would require _another_ check for parent being
   right).
 - bogus d_drop() in copyup and rename [fix from your mail]
 - copyup/remove and copyup/rename races [fix from your mail]
 - ovl_dir_fsync() leaving ERR_PTR() in ->realfile
 - ovl_entry_free() is pointless - it's just a kfree_rcu()
 - fold ovl_do_lookup() into ovl_lookup()
 - manually assigning ->d_op is wrong.  Just use ->s_d_op.
 [patches picked from Miklos]:
 * copyup/remove and copyup/rename races
 * bogus d_drop() in copyup and rename

Also thanks to the following people for testing and reporting bugs:

  Jordi Pujol <jordipujolp@gmail.com>
  Andy Whitcroft <apw@canonical.com>
  Michal Suchanek <hramrach@centrum.cz>
  Felix Fietkau <nbd@openwrt.org>
  Erez Zadok <ezk@fsl.cs.sunysb.edu>
  Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: default avatarMiklos Szeredi <mszeredi@suse.cz>
parent 46fdb794
......@@ -67,6 +67,7 @@ source "fs/quota/Kconfig"
source "fs/autofs4/Kconfig"
source "fs/fuse/Kconfig"
source "fs/overlayfs/Kconfig"
menu "Caches"
......
......@@ -104,6 +104,7 @@ obj-$(CONFIG_QNX6FS_FS) += qnx6/
obj-$(CONFIG_AUTOFS4_FS) += autofs4/
obj-$(CONFIG_ADFS_FS) += adfs/
obj-$(CONFIG_FUSE_FS) += fuse/
obj-$(CONFIG_OVERLAYFS_FS) += overlayfs/
obj-$(CONFIG_UDF_FS) += udf/
obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/
obj-$(CONFIG_OMFS_FS) += omfs/
......
config OVERLAYFS_FS
tristate "Overlay filesystem support"
help
An overlay filesystem combines two filesystems - an 'upper' filesystem
and a 'lower' filesystem. When a name exists in both filesystems, the
object in the 'upper' filesystem is visible while the object in the
'lower' filesystem is either hidden or, in the case of directories,
merged with the 'upper' object.
For more information see Documentation/filesystems/overlayfs.txt
#
# Makefile for the overlay filesystem.
#
obj-$(CONFIG_OVERLAYFS_FS) += overlayfs.o
overlayfs-objs := super.o inode.o dir.o readdir.o copy_up.o
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/splice.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/uaccess.h>
#include <linux/sched.h>
#include <linux/namei.h>
#include "overlayfs.h"
#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
int ovl_copy_xattr(struct dentry *old, struct dentry *new)
{
ssize_t list_size, size;
char *buf, *name, *value;
int error;
if (!old->d_inode->i_op->getxattr ||
!new->d_inode->i_op->getxattr)
return 0;
list_size = vfs_listxattr(old, NULL, 0);
if (list_size <= 0) {
if (list_size == -EOPNOTSUPP)
return 0;
return list_size;
}
buf = kzalloc(list_size, GFP_KERNEL);
if (!buf)
return -ENOMEM;
error = -ENOMEM;
value = kmalloc(XATTR_SIZE_MAX, GFP_KERNEL);
if (!value)
goto out;
list_size = vfs_listxattr(old, buf, list_size);
if (list_size <= 0) {
error = list_size;
goto out_free_value;
}
for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
size = vfs_getxattr(old, name, value, XATTR_SIZE_MAX);
if (size <= 0) {
error = size;
goto out_free_value;
}
error = vfs_setxattr(new, name, value, size, 0);
if (error)
goto out_free_value;
}
out_free_value:
kfree(value);
out:
kfree(buf);
return error;
}
static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
{
struct file *old_file;
struct file *new_file;
loff_t old_pos = 0;
loff_t new_pos = 0;
int error = 0;
if (len == 0)
return 0;
old_file = ovl_path_open(old, O_RDONLY);
if (IS_ERR(old_file))
return PTR_ERR(old_file);
new_file = ovl_path_open(new, O_WRONLY);
if (IS_ERR(new_file)) {
error = PTR_ERR(new_file);
goto out_fput;
}
/* FIXME: copy up sparse files efficiently */
while (len) {
size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
long bytes;
if (len < this_len)
this_len = len;
if (signal_pending_state(TASK_KILLABLE, current)) {
error = -EINTR;
break;
}
bytes = do_splice_direct(old_file, &old_pos,
new_file, &new_pos,
this_len, SPLICE_F_MOVE);
if (bytes <= 0) {
error = bytes;
break;
}
WARN_ON(old_pos != new_pos);
len -= bytes;
}
fput(new_file);
out_fput:
fput(old_file);
return error;
}
static char *ovl_read_symlink(struct dentry *realdentry)
{
int res;
char *buf;
struct inode *inode = realdentry->d_inode;
mm_segment_t old_fs;
res = -EINVAL;
if (!inode->i_op->readlink)
goto err;
res = -ENOMEM;
buf = (char *) __get_free_page(GFP_KERNEL);
if (!buf)
goto err;
old_fs = get_fs();
set_fs(get_ds());
/* The cast to a user pointer is valid due to the set_fs() */
res = inode->i_op->readlink(realdentry,
(char __user *)buf, PAGE_SIZE - 1);
set_fs(old_fs);
if (res < 0) {
free_page((unsigned long) buf);
goto err;
}
buf[res] = '\0';
return buf;
err:
return ERR_PTR(res);
}
static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
{
struct iattr attr = {
.ia_valid =
ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
.ia_atime = stat->atime,
.ia_mtime = stat->mtime,
};
return notify_change(upperdentry, &attr, NULL);
}
int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
{
int err = 0;
if (!S_ISLNK(stat->mode)) {
struct iattr attr = {
.ia_valid = ATTR_MODE,
.ia_mode = stat->mode,
};
err = notify_change(upperdentry, &attr, NULL);
}
if (!err) {
struct iattr attr = {
.ia_valid = ATTR_UID | ATTR_GID,
.ia_uid = stat->uid,
.ia_gid = stat->gid,
};
err = notify_change(upperdentry, &attr, NULL);
}
if (!err)
ovl_set_timestamps(upperdentry, stat);
return err;
}
static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
struct dentry *dentry, struct path *lowerpath,
struct kstat *stat, struct iattr *attr,
const char *link)
{
struct inode *wdir = workdir->d_inode;
struct inode *udir = upperdir->d_inode;
struct dentry *newdentry = NULL;
struct dentry *upper = NULL;
umode_t mode = stat->mode;
int err;
newdentry = ovl_lookup_temp(workdir, dentry);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out;
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out1;
/* Can't properly set mode on creation because of the umask */
stat->mode &= S_IFMT;
err = ovl_create_real(wdir, newdentry, stat, link, NULL, true);
stat->mode = mode;
if (err)
goto out2;
if (S_ISREG(stat->mode)) {
struct path upperpath;
ovl_path_upper(dentry, &upperpath);
BUG_ON(upperpath.dentry != NULL);
upperpath.dentry = newdentry;
err = ovl_copy_up_data(lowerpath, &upperpath, stat->size);
if (err)
goto out_cleanup;
}
err = ovl_copy_xattr(lowerpath->dentry, newdentry);
if (err)
goto out_cleanup;
mutex_lock(&newdentry->d_inode->i_mutex);
err = ovl_set_attr(newdentry, stat);
if (!err && attr)
err = notify_change(newdentry, attr, NULL);
mutex_unlock(&newdentry->d_inode->i_mutex);
if (err)
goto out_cleanup;
err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
if (err)
goto out_cleanup;
ovl_dentry_update(dentry, newdentry);
newdentry = NULL;
/*
* Non-directores become opaque when copied up.
*/
if (!S_ISDIR(stat->mode))
ovl_dentry_set_opaque(dentry, true);
out2:
dput(upper);
out1:
dput(newdentry);
out:
return err;
out_cleanup:
ovl_cleanup(wdir, newdentry);
goto out;
}
/*
* Copy up a single dentry
*
* Directory renames only allowed on "pure upper" (already created on
* upper filesystem, never copied up). Directories which are on lower or
* are merged may not be renamed. For these -EXDEV is returned and
* userspace has to deal with it. This means, when copying up a
* directory we can rely on it and ancestors being stable.
*
* Non-directory renames start with copy up of source if necessary. The
* actual rename will only proceed once the copy up was successful. Copy
* up uses upper parent i_mutex for exclusion. Since rename can change
* d_parent it is possible that the copy up will lock the old parent. At
* that point the file will have already been copied up anyway.
*/
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
struct path *lowerpath, struct kstat *stat,
struct iattr *attr)
{
struct dentry *workdir = ovl_workdir(dentry);
int err;
struct kstat pstat;
struct path parentpath;
struct dentry *upperdir;
struct dentry *upperdentry;
const struct cred *old_cred;
struct cred *override_cred;
char *link = NULL;
ovl_path_upper(parent, &parentpath);
upperdir = parentpath.dentry;
err = vfs_getattr(&parentpath, &pstat);
if (err)
return err;
if (S_ISLNK(stat->mode)) {
link = ovl_read_symlink(lowerpath->dentry);
if (IS_ERR(link))
return PTR_ERR(link);
}
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_free_link;
override_cred->fsuid = stat->uid;
override_cred->fsgid = stat->gid;
/*
* CAP_SYS_ADMIN for copying up extended attributes
* CAP_DAC_OVERRIDE for create
* CAP_FOWNER for chmod, timestamp update
* CAP_FSETID for chmod
* CAP_CHOWN for chown
* CAP_MKNOD for mknod
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
cap_raise(override_cred->cap_effective, CAP_MKNOD);
old_cred = override_creds(override_cred);
err = -EIO;
if (lock_rename(workdir, upperdir) != NULL) {
pr_err("overlayfs: failed to lock workdir+upperdir\n");
goto out_unlock;
}
upperdentry = ovl_dentry_upper(dentry);
if (upperdentry) {
unlock_rename(workdir, upperdir);
err = 0;
/* Raced with another copy-up? Do the setattr here */
if (attr) {
mutex_lock(&upperdentry->d_inode->i_mutex);
err = notify_change(upperdentry, attr, NULL);
mutex_unlock(&upperdentry->d_inode->i_mutex);
}
goto out_put_cred;
}
err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath,
stat, attr, link);
if (!err) {
/* Restore timestamps on parent (best effort) */
ovl_set_timestamps(upperdir, &pstat);
}
out_unlock:
unlock_rename(workdir, upperdir);
out_put_cred:
revert_creds(old_cred);
put_cred(override_cred);
out_free_link:
if (link)
free_page((unsigned long) link);
return err;
}
int ovl_copy_up(struct dentry *dentry)
{
int err;
err = 0;
while (!err) {
struct dentry *next;
struct dentry *parent;
struct path lowerpath;
struct kstat stat;
enum ovl_path_type type = ovl_path_type(dentry);
if (type != OVL_PATH_LOWER)
break;
next = dget(dentry);
/* find the topmost dentry not yet copied up */
for (;;) {
parent = dget_parent(next);
type = ovl_path_type(parent);
if (type != OVL_PATH_LOWER)
break;
dput(next);
next = parent;
}
ovl_path_lower(next, &lowerpath);
err = vfs_getattr(&lowerpath, &stat);
if (!err)
err = ovl_copy_up_one(parent, next, &lowerpath, &stat, NULL);
dput(parent);
dput(next);
}
return err;
}
This diff is collapsed.
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/xattr.h>
#include "overlayfs.h"
static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr,
bool no_data)
{
int err;
struct dentry *parent;
struct kstat stat;
struct path lowerpath;
parent = dget_parent(dentry);
err = ovl_copy_up(parent);
if (err)
goto out_dput_parent;
ovl_path_lower(dentry, &lowerpath);
err = vfs_getattr(&lowerpath, &stat);
if (err)
goto out_dput_parent;
if (no_data)
stat.size = 0;
err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat, attr);
out_dput_parent:
dput(parent);
return err;
}
int ovl_setattr(struct dentry *dentry, struct iattr *attr)
{
int err;
struct dentry *upperdentry;
err = ovl_want_write(dentry);
if (err)
goto out;
upperdentry = ovl_dentry_upper(dentry);
if (upperdentry) {
mutex_lock(&upperdentry->d_inode->i_mutex);
err = notify_change(upperdentry, attr, NULL);
mutex_unlock(&upperdentry->d_inode->i_mutex);
} else {
err = ovl_copy_up_last(dentry, attr, false);
}
ovl_drop_write(dentry);
out:
return err;
}
static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
struct path realpath;
ovl_path_real(dentry, &realpath);
return vfs_getattr(&realpath, stat);
}
int ovl_permission(struct inode *inode, int mask)
{
struct ovl_entry *oe;
struct dentry *alias = NULL;
struct inode *realinode;
struct dentry *realdentry;
bool is_upper;
int err;
if (S_ISDIR(inode->i_mode)) {
oe = inode->i_private;
} else if (mask & MAY_NOT_BLOCK) {
return -ECHILD;
} else {
/*
* For non-directories find an alias and get the info
* from there.
*/
alias = d_find_any_alias(inode);
if (WARN_ON(!alias))
return -ENOENT;
oe = alias->d_fsdata;
}
realdentry = ovl_entry_real(oe, &is_upper);
/* Careful in RCU walk mode */
realinode = ACCESS_ONCE(realdentry->d_inode);
if (!realinode) {
WARN_ON(!(mask & MAY_NOT_BLOCK));
err = -ENOENT;
goto out_dput;
}
if (mask & MAY_WRITE) {
umode_t mode = realinode->i_mode;
/*
* Writes will always be redirected to upper layer, so
* ignore lower layer being read-only.
*
* If the overlay itself is read-only then proceed
* with the permission check, don't return EROFS.
* This will only happen if this is the lower layer of
* another overlayfs.
*
* If upper fs becomes read-only after the overlay was
* constructed return EROFS to prevent modification of
* upper layer.
*/
err = -EROFS;
if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) &&
(S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
goto out_dput;
}
err = __inode_permission(realinode, mask);
out_dput:
dput(alias);
return err;
}
struct ovl_link_data {
struct dentry *realdentry;
void *cookie;
};
static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
{
void *ret;
struct dentry *realdentry;
struct inode *realinode;
realdentry = ovl_dentry_real(dentry);
realinode = realdentry->d_inode;
if (WARN_ON(!realinode->i_op->follow_link))
return ERR_PTR(-EPERM);
ret = realinode->i_op->follow_link(realdentry, nd);
if (IS_ERR(ret))
return ret;
if (realinode->i_op->put_link) {
struct ovl_link_data *data;
data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
if (!data) {
realinode->i_op->put_link(realdentry, nd, ret);
return ERR_PTR(-ENOMEM);
}
data->realdentry = realdentry;
data->cookie = ret;
return data;
} else {
return NULL;
}
}
static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
{
struct inode *realinode;
struct ovl_link_data *data = c;
if (!data)
return;
realinode = data->realdentry->d_inode;
realinode->i_op->put_link(data->realdentry, nd, data->cookie);
kfree(data);
}
static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
{
struct path realpath;
struct inode *realinode;
ovl_path_real(dentry, &realpath);
realinode = realpath.dentry->d_inode;
if (!realinode->i_op->readlink)
return -EINVAL;
touch_atime(&realpath);