This commit was manufactured by cvs2svn to create branch 'vserver'.
authorPlanet-Lab Support <support@planet-lab.org>
Tue, 13 Jul 2004 17:57:18 +0000 (17:57 +0000)
committerPlanet-Lab Support <support@planet-lab.org>
Tue, 13 Jul 2004 17:57:18 +0000 (17:57 +0000)
33 files changed:
arch/um/drivers/cow.h [new file with mode: 0644]
arch/um/drivers/cow_user.c [new file with mode: 0644]
arch/um/include/irq_kern.h [new file with mode: 0644]
arch/um/include/mem_kern.h [new file with mode: 0644]
arch/um/kernel/physmem.c [new file with mode: 0644]
arch/um/kernel/skas/uaccess.c [new file with mode: 0644]
arch/um/kernel/tt/uaccess.c [new file with mode: 0644]
arch/um/os-Linux/user_syms.c [new file with mode: 0644]
fs/hostfs/Makefile [new file with mode: 0644]
fs/hostfs/hostfs.h [new file with mode: 0644]
fs/hostfs/hostfs_kern.c [new file with mode: 0644]
fs/hostfs/hostfs_user.c [new file with mode: 0644]
fs/hppfs/Makefile [new file with mode: 0644]
fs/hppfs/hppfs_kern.c [new file with mode: 0644]
fs/relayfs/Makefile [new file with mode: 0644]
fs/relayfs/inode.c [new file with mode: 0644]
fs/relayfs/relay.c [new file with mode: 0644]
include/asm-um/cpufeature.h [new file with mode: 0644]
include/asm-um/local.h [new file with mode: 0644]
include/asm-um/module-generic.h [new file with mode: 0644]
include/asm-um/sections.h [new file with mode: 0644]
include/linux/relayfs_fs.h [new file with mode: 0644]
include/linux/vs_base.h [new file with mode: 0644]
include/linux/vs_context.h [new file with mode: 0644]
include/linux/vs_cvirt.h [new file with mode: 0644]
include/linux/vs_dlimit.h [new file with mode: 0644]
include/linux/vs_limit.h [new file with mode: 0644]
include/linux/vs_memory.h [new file with mode: 0644]
include/linux/vs_network.h [new file with mode: 0644]
include/linux/vs_socket.h [new file with mode: 0644]
include/linux/vserver/dlimit.h [new file with mode: 0644]
kernel/vserver/dlimit.c [new file with mode: 0644]
kernel/vserver/helper.c [new file with mode: 0644]

diff --git a/arch/um/drivers/cow.h b/arch/um/drivers/cow.h
new file mode 100644 (file)
index 0000000..d875d04
--- /dev/null
@@ -0,0 +1,41 @@
+#ifndef __COW_H__
+#define __COW_H__
+
+#include <asm/types.h>
+
+/* 64-bit network <-> host byte order conversions, selected on __BYTE_ORDER:
+ * identity on big-endian, bswap_64() on little-endian.
+ * NOTE(review): this header includes only <asm/types.h>, so it presumably
+ * relies on the including file to provide __BYTE_ORDER, __BIG_ENDIAN and
+ * bswap_64() - confirm.  If __BYTE_ORDER and __BIG_ENDIAN are both
+ * undefined, the first #if compares 0 == 0 and the identity definitions
+ * are chosen without any diagnostic.
+ */
+#if __BYTE_ORDER == __BIG_ENDIAN
+# define ntohll(x) (x)
+# define htonll(x) (x)
+#elif __BYTE_ORDER == __LITTLE_ENDIAN
+# define ntohll(x)  bswap_64(x)
+# define htonll(x)  bswap_64(x)
+#else
+#error "__BYTE_ORDER not defined"
+#endif
+
+/* Writes a new COW header to fd and extends the file; reports the bitmap
+ * offset, bitmap length and data offset through the *_out pointers.
+ * Returns 0 on success, non-zero on failure.
+ */
+extern int init_cow_file(int fd, char *cow_file, char *backing_file, 
+                        int sectorsize, int alignment, int *bitmap_offset_out, 
+                        unsigned long *bitmap_len_out, int *data_offset_out);
+
+/* Reader callback usable with read_cow_header(); arg points to an int fd. */
+extern int file_reader(__u64 offset, char *buf, int len, void *arg);
+/* Reads a COW header through 'reader' and decodes it into the *_out
+ * pointers.  *backing_file_out receives a newly allocated string.
+ * Returns 0 on success or a negative errno value.
+ */
+extern int read_cow_header(int (*reader)(__u64, char *, int, void *), 
+                          void *arg, __u32 *version_out, 
+                          char **backing_file_out, time_t *mtime_out, 
+                          __u64 *size_out, int *sectorsize_out, 
+                          __u32 *align_out, int *bitmap_offset_out);
+
+/* Writes a version 3 header at the start of fd; *size receives the size of
+ * the backing file.
+ */
+extern int write_cow_header(char *cow_file, int fd, char *backing_file, 
+                           int sectorsize, int alignment, long long *size);
+
+/* Computes the bitmap length and data offset for the given header version. */
+extern void cow_sizes(int version, __u64 size, int sectorsize, int align,
+                     int bitmap_offset, unsigned long *bitmap_len_out, 
+                     int *data_offset_out);
+
+#endif
+
+/*
+ * ---------------------------------------------------------------------------
+ * Local variables:
+ * c-file-style: "linux"
+ * End:
+ */
diff --git a/arch/um/drivers/cow_user.c b/arch/um/drivers/cow_user.c
new file mode 100644 (file)
index 0000000..014c2c8
--- /dev/null
@@ -0,0 +1,375 @@
+#include <stddef.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <byteswap.h>
+#include <sys/time.h>
+#include <sys/param.h>
+#include <sys/user.h>
+#include <netinet/in.h>
+
+#include "os.h"
+
+#include "cow.h"
+#include "cow_sys.h"
+
+/* Size of the backing file name buffer in a version 1 header */
+#define PATH_LEN_V1 256
+
+/* Version 1 header layout.  read_cow_header() copies mtime, size and
+ * sectorsize out of it without any byte-order conversion.
+ */
+struct cow_header_v1 {
+       int magic;
+       int version;
+       char backing_file[PATH_LEN_V1];
+       time_t mtime;
+       __u64 size;
+       int sectorsize;
+};
+
+/* Size of the backing file name buffer in a version 2 header; follows the
+ * build system's MAXPATHLEN.
+ */
+#define PATH_LEN_V2 MAXPATHLEN
+
+/* Version 2 header layout.  read_cow_header() converts mtime, size and
+ * sectorsize with ntohl()/ntohll() when reading it.
+ */
+struct cow_header_v2 {
+       unsigned long magic;
+       unsigned long version;
+       char backing_file[PATH_LEN_V2];
+       time_t mtime;
+       __u64 size;
+       int sectorsize;
+};
+
+/* Define PATH_LEN_V3 as the usual value of MAXPATHLEN, just hard-code it in 
+ * case other systems have different values for MAXPATHLEN
+ */
+#define PATH_LEN_V3 4096
+
+/* Changes from V2 - 
+ *     PATH_LEN_V3 as described above
+ *     Explicitly specify field bit lengths for systems with different
+ *             lengths for the usual C types.  Not sure whether char or
+ *             time_t should be changed, this can be changed later without
+ *             breaking compatibility
+ *     Add alignment field so that different alignments can be used for the
+ *             bitmap and data
+ *     Add cow_format field to allow for the possibility of different ways
+ *             of specifying the COW blocks.  For now, the only value is 0,
+ *             for the traditional COW bitmap.
+ *     Move the backing_file field to the end of the header.  This allows
+ *             for the possibility of expanding it into the padding required
+ *             by the bitmap alignment.
+ *     The bitmap and data portions of the file will be aligned as specified
+ *             by the alignment field.  This is to allow COW files to be
+ *             put on devices with restrictions on access alignments, such as
+ *             /dev/raw, with a 512 byte alignment restriction.  This also
+ *             allows the data to be aligned more strictly than on
+ *             sector boundaries.  This is needed for ubd-mmap, which needs
+ *             the data to be page aligned.
+ *     Fixed (finally!) the rounding bug
+ */
+
+/* Version 3 header layout.  write_cow_header() fills this structure and
+ * writes it to the COW file as-is, storing magic, version, mtime, size,
+ * sectorsize and alignment through htonl()/htonll().
+ */
+struct cow_header_v3 {
+       __u32 magic;
+       __u32 version;
+       time_t mtime;
+       __u64 size;
+       __u32 sectorsize;
+       __u32 alignment;
+       __u32 cow_format;
+       char backing_file[PATH_LEN_V3];
+};
+
+/* COW format definitions - for now, we have only the usual COW bitmap */
+#define COW_BITMAP 0
+
+/* Buffer large enough for any header version; read_cow_header() reads into
+ * it and picks the member matching the version field.
+ */
+union cow_header {
+       struct cow_header_v1 v1;
+       struct cow_header_v2 v2;
+       struct cow_header_v3 v3;
+};
+
+/* Magic number identifying a COW file, and the version written by
+ * write_cow_header().
+ */
+#define COW_MAGIC 0x4f4f4f4d  /* MOOO */
+#define COW_VERSION 3
+
+/* Divide x by len, rounding the quotient up. */
+#define DIV_ROUND(x, len) (((x) + (len) - 1) / (len))
+/* Round x up to a multiple of align.  The whole expansion is parenthesized
+ * so that the macro can be used safely inside a larger expression.
+ */
+#define ROUND_UP(x, align) (DIV_ROUND(x, align) * (align))
+
+/* Compute the bitmap length (one bit per sector) and the offset of the data
+ * area for a COW file of the given header version.  Versions before 3 keep
+ * their original rounding; version 3 and later round the bitmap length up
+ * and align the data offset to 'align'.
+ */
+void cow_sizes(int version, __u64 size, int sectorsize, int align, 
+              int bitmap_offset, unsigned long *bitmap_len_out, 
+              int *data_offset_out)
+{
+       unsigned long bitmap_len;
+       int data_offset;
+
+       if(version >= 3){
+               bitmap_len = DIV_ROUND(size, sectorsize);
+               bitmap_len = DIV_ROUND(bitmap_len, 8);
+
+               data_offset = bitmap_offset + bitmap_len;
+               data_offset = ROUND_UP(data_offset, align);
+       }
+       else {
+               /* Old formats: layout must stay exactly as it was written */
+               bitmap_len = (size + sectorsize - 1) / (8 * sectorsize);
+
+               data_offset = bitmap_offset + bitmap_len;
+               data_offset = (data_offset + sectorsize - 1) / sectorsize;
+               data_offset *= sectorsize;
+       }
+
+       *bitmap_len_out = bitmap_len;
+       *data_offset_out = data_offset;
+}
+
+/* Build an absolute path for 'from' in 'to', which holds 'size' bytes.
+ * When 'from' has a directory part, the function temporarily changes into
+ * that directory and uses getcwd() to resolve it; otherwise the current
+ * directory is prepended.  The original working directory is restored on
+ * every path that changed it.
+ * Returns 0 on success and -1 on failure.
+ */
+static int absolutize(char *to, int size, char *from)
+{
+       char save_cwd[256], *slash;
+       int err = -1;
+
+       if(getcwd(save_cwd, sizeof(save_cwd)) == NULL) {
+               cow_printf("absolutize : unable to get cwd - errno = %d\n", 
+                          errno);
+               return(-1);
+       }
+       slash = strrchr(from, '/');
+       if(slash != NULL){
+               int failed;
+
+               if(slash == from){
+                       /* File directly under the root: cutting the string
+                        * at the slash would leave an empty directory name.
+                        */
+                       failed = chdir("/");
+               }
+               else {
+                       *slash = '\0';
+                       failed = chdir(from);
+                       *slash = '/';
+               }
+               if(failed){
+                       /* cwd is unchanged, nothing to restore */
+                       cow_printf("absolutize : Can't cd to '%s' - " 
+                                  "errno = %d\n", from, errno);
+                       return(-1);
+               }
+               if(getcwd(to, size) == NULL){
+                       cow_printf("absolutize : unable to get cwd of '%s' - "
+                              "errno = %d\n", from, errno);
+                       goto out_restore;
+               }
+               /* The root directory already ends in '/'; drop it so that
+                * appending "/name" does not produce "//name".
+                */
+               if(strcmp(to, "/") == 0)
+                       to[0] = '\0';
+               if(strlen(to) + strlen(slash) + 1 > (size_t) size){
+                       cow_printf("absolutize : unable to fit '%s' into %d "
+                              "chars\n", from, size);
+                       goto out_restore;
+               }
+               strcat(to, slash);
+       }
+       else {
+               if(strlen(save_cwd) + 1 + strlen(from) + 1 > (size_t) size){
+                       cow_printf("absolutize : unable to fit '%s' into %d "
+                              "chars\n", from, size);
+                       return(-1);
+               }
+               strcpy(to, save_cwd);
+               strcat(to, "/");
+               strcat(to, from);
+       }
+       err = 0;
+ out_restore:
+       if(chdir(save_cwd)){
+               cow_printf("absolutize : Can't cd to '%s' - "
+                          "errno = %d\n", save_cwd, errno);
+               err = -1;
+       }
+       return(err);
+}
+
+/* Write a version 3 COW header at the start of fd.
+ *   cow_file     - name of the COW file, used only in messages
+ *   fd           - descriptor of the COW file
+ *   backing_file - backing file name; stored in absolute form
+ *   sectorsize, alignment - stored in the header
+ *   size         - receives the size of the backing file
+ * Returns 0 on success or a negative errno value.
+ */
+int write_cow_header(char *cow_file, int fd, char *backing_file, 
+                    int sectorsize, int alignment, long long *size)
+{
+       struct cow_header_v3 *header;
+       unsigned long modtime;
+       int err;
+
+       err = cow_seek_file(fd, 0);
+       if(err < 0){
+               cow_printf("write_cow_header - lseek failed, err = %d\n", -err);
+               goto out;
+       }
+
+       err = -ENOMEM;
+       header = cow_malloc(sizeof(*header));
+       if(header == NULL){
+               cow_printf("Failed to allocate COW V3 header\n");
+               goto out;
+       }
+       /* The whole structure goes to disk, so make sure padding and the
+        * unused tail of backing_file do not carry stale heap contents.
+        */
+       memset(header, 0, sizeof(*header));
+       header->magic = htonl(COW_MAGIC);
+       header->version = htonl(COW_VERSION);
+
+       err = -EINVAL;
+       if(strlen(backing_file) > sizeof(header->backing_file) - 1){
+               cow_printf("Backing file name \"%s\" is too long - names are "
+                          "limited to %d characters\n", backing_file, 
+                          (int) (sizeof(header->backing_file) - 1));
+               goto out_free;
+       }
+
+       if(absolutize(header->backing_file, sizeof(header->backing_file), 
+                     backing_file))
+               goto out_free;
+
+       err = os_file_modtime(header->backing_file, &modtime);
+       if(err < 0){
+               cow_printf("Backing file '%s' mtime request failed, "
+                          "err = %d\n", header->backing_file, -err);
+               goto out_free;
+       }
+
+       err = cow_file_size(header->backing_file, size);
+       if(err < 0){
+               cow_printf("Couldn't get size of backing file '%s', "
+                          "err = %d\n", header->backing_file, -err);
+               goto out_free;
+       }
+
+       header->mtime = htonl(modtime);
+       header->size = htonll(*size);
+       header->sectorsize = htonl(sectorsize);
+       header->alignment = htonl(alignment);
+       header->cow_format = htonl(COW_BITMAP);
+
+       err = os_write_file(fd, header, sizeof(*header));
+       if(err != sizeof(*header)){
+               /* A short write comes back as a non-negative count; turn it
+                * into an error code so callers never see a positive value.
+                */
+               if(err >= 0)
+                       err = -EINVAL;
+               cow_printf("Write of header to new COW file '%s' failed, "
+                          "err = %d\n", cow_file, -err);
+               goto out_free;
+       }
+       err = 0;
+ out_free:
+       cow_free(header);
+ out:
+       return(err);
+}
+
+/* Reader callback for read_cow_header(): 'arg' points to an int file
+ * descriptor, from which 'len' bytes at 'offset' are read into 'buf' with
+ * pread().  Returns whatever pread() returns.
+ */
+int file_reader(__u64 offset, char *buf, int len, void *arg)
+{
+       int *fd_ptr = (int *) arg;
+
+       return(pread(*fd_ptr, buf, len, offset));
+}
+
+/* XXX Need to sanity-check the values read from the header */
+
+/* Read and decode a COW header of any supported version.
+ *   reader, arg       - callback used to fetch the header bytes from
+ *                       offset 0, and its opaque argument
+ *   version_out       - header version
+ *   backing_file_out  - newly allocated copy of the backing file name
+ *   mtime_out, size_out, sectorsize_out, align_out - header fields
+ *   bitmap_offset_out - offset of the bitmap within the COW file
+ * Returns 0 on success, -ENOMEM on allocation failure, and -EINVAL for a
+ * short read, an unknown magic or version, or a zero alignment.  No message
+ * is printed for an unknown magic.
+ */
+int read_cow_header(int (*reader)(__u64, char *, int, void *), void *arg, 
+                   __u32 *version_out, char **backing_file_out, 
+                   time_t *mtime_out, __u64 *size_out, 
+                   int *sectorsize_out, __u32 *align_out, 
+                   int *bitmap_offset_out)
+{
+       union cow_header *header;
+       char *file;
+       int err, n;
+       unsigned long version, magic;
+
+       header = cow_malloc(sizeof(*header));
+       if(header == NULL){
+               cow_printf("read_cow_header - Failed to allocate header\n");
+               return(-ENOMEM);
+       }
+       err = -EINVAL;
+       n = (*reader)(0, (char *) header, sizeof(*header), arg);
+       /* n is signed: compare as int so that a negative (failed) read is
+        * not converted to a huge unsigned value and accepted.
+        */
+       if((n < 0) || 
+          (n < (int) offsetof(typeof(header->v1), backing_file))){
+               cow_printf("read_cow_header - short header\n");
+               goto out;
+       }
+
+       magic = header->v1.magic;
+       if(magic == COW_MAGIC) {
+               version = header->v1.version;
+       }
+       else if(magic == ntohl(COW_MAGIC)){
+               version = ntohl(header->v1.version);
+       }
+       /* No error printed because the non-COW case comes through here */
+       else goto out;
+
+       *version_out = version;
+
+       if(version == 1){
+               if(n < (int) sizeof(header->v1)){
+                       cow_printf("read_cow_header - failed to read V1 "
+                                  "header\n");
+                       goto out;
+               }
+               *mtime_out = header->v1.mtime;
+               *size_out = header->v1.size;
+               *sectorsize_out = header->v1.sectorsize;
+               *bitmap_offset_out = sizeof(header->v1);
+               *align_out = *sectorsize_out;
+               /* The name comes from the file; force termination */
+               header->v1.backing_file[sizeof(header->v1.backing_file) - 1] =
+                       '\0';
+               file = header->v1.backing_file;
+       }
+       else if(version == 2){
+               if(n < (int) sizeof(header->v2)){
+                       cow_printf("read_cow_header - failed to read V2 "
+                                  "header\n");
+                       goto out;
+               }
+               *mtime_out = ntohl(header->v2.mtime);
+               *size_out = ntohll(header->v2.size);
+               *sectorsize_out = ntohl(header->v2.sectorsize);
+               *bitmap_offset_out = sizeof(header->v2);
+               *align_out = *sectorsize_out;
+               header->v2.backing_file[sizeof(header->v2.backing_file) - 1] =
+                       '\0';
+               file = header->v2.backing_file;
+       }
+       else if(version == 3){
+               if(n < (int) sizeof(header->v3)){
+                       cow_printf("read_cow_header - failed to read V3 "
+                                  "header\n");
+                       goto out;
+               }
+               *mtime_out = ntohl(header->v3.mtime);
+               *size_out = ntohll(header->v3.size);
+               *sectorsize_out = ntohl(header->v3.sectorsize);
+               *align_out = ntohl(header->v3.alignment);
+               /* ROUND_UP divides by the alignment */
+               if(*align_out == 0){
+                       cow_printf("read_cow_header - invalid COW header, "
+                                  "align == 0\n");
+                       goto out;
+               }
+               *bitmap_offset_out = ROUND_UP(sizeof(header->v3), *align_out);
+               header->v3.backing_file[sizeof(header->v3.backing_file) - 1] =
+                       '\0';
+               file = header->v3.backing_file;
+       }
+       else {
+               cow_printf("read_cow_header - invalid COW version\n");
+               goto out;               
+       }
+       err = -ENOMEM;
+       *backing_file_out = cow_strdup(file);
+       if(*backing_file_out == NULL){
+               cow_printf("read_cow_header - failed to allocate backing "
+                          "file\n");
+               goto out;
+       }
+       err = 0;
+ out:
+       cow_free(header);
+       return(err);
+}
+
+/* Initialize a new COW file on fd: write the header, compute the bitmap and
+ * data layout, and write a single zero byte at the last data offset so the
+ * file has its full length.
+ * The layout is returned through bitmap_offset_out, bitmap_len_out and
+ * data_offset_out.  Returns 0 on success, non-zero on failure.
+ */
+int init_cow_file(int fd, char *cow_file, char *backing_file, int sectorsize,
+                 int alignment, int *bitmap_offset_out, 
+                 unsigned long *bitmap_len_out, int *data_offset_out)
+{
+       /* write_cow_header() takes a long long *, so use that type here */
+       long long size;
+       __u64 offset;
+       char zero = 0;
+       int err;
+
+       err = write_cow_header(cow_file, fd, backing_file, sectorsize, 
+                              alignment, &size);
+       if(err) 
+               goto out;
+       
+       *bitmap_offset_out = ROUND_UP(sizeof(struct cow_header_v3), alignment);
+       cow_sizes(COW_VERSION, size, sectorsize, alignment, *bitmap_offset_out,
+                 bitmap_len_out, data_offset_out);
+
+       offset = *data_offset_out + size - sizeof(zero);
+       err = cow_seek_file(fd, offset);
+       if(err < 0){
+               cow_printf("cow bitmap lseek failed : err = %d\n", -err);
+               goto out;
+       }
+
+       /* does not really matter how much we write it is just to set EOF 
+        * this also sets the entire COW bitmap
+        * to zero without having to allocate it 
+        */
+       err = cow_write_file(fd, &zero, sizeof(zero));
+       if(err != sizeof(zero)){
+               cow_printf("Write of bitmap to new COW file '%s' failed, "
+                          "err = %d\n", cow_file, -err);
+               err = -EINVAL;
+               goto out;
+       }
+
+       return(0);
+
+ out:
+       return(err);
+}
+
+/*
+ * ---------------------------------------------------------------------------
+ * Local variables:
+ * c-file-style: "linux"
+ * End:
+ */
diff --git a/arch/um/include/irq_kern.h b/arch/um/include/irq_kern.h
new file mode 100644 (file)
index 0000000..4bcb829
--- /dev/null
@@ -0,0 +1,28 @@
+/* 
+ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com)
+ * Licensed under the GPL
+ */
+
+#ifndef __IRQ_KERN_H__
+#define __IRQ_KERN_H__
+
+#include "linux/interrupt.h"
+
+/* Registers 'handler' for 'irq'.
+ * NOTE(review): fd and type presumably identify the host descriptor that
+ * raises the interrupt and the kind of event to watch - confirm against the
+ * definition, which is not part of this header.
+ */
+extern int um_request_irq(unsigned int irq, int fd, int type,
+                         irqreturn_t (*handler)(int, void *, 
+                                                struct pt_regs *),
+                         unsigned long irqflags,  const char * devname,
+                         void *dev_id);
+
+#endif
+
+/*
+ * Overrides for Emacs so that we follow Linus's tabbing style.
+ * Emacs will notice this stuff at the end of the file and automatically
+ * adjust the settings for this buffer only.  This must remain at the end
+ * of the file.
+ * ---------------------------------------------------------------------------
+ * Local variables:
+ * c-file-style: "linux"
+ * End:
+ */
diff --git a/arch/um/include/mem_kern.h b/arch/um/include/mem_kern.h
new file mode 100644 (file)
index 0000000..b39f03d
--- /dev/null
@@ -0,0 +1,30 @@
+/* 
+ * Copyright (C) 2003 Jeff Dike (jdike@addtoit.com)
+ * Licensed under the GPL
+ */
+
+#ifndef __MEM_KERN_H__
+#define __MEM_KERN_H__
+
+#include "linux/list.h"
+#include "linux/types.h"
+
+/* A remapping callback that can be registered with register_remapper().
+ * NOTE(review): the meaning of proc's arguments and return value is not
+ * visible in this header - confirm against the callers.
+ */
+struct remapper {
+       struct list_head list;  /* list linkage */
+       int (*proc)(int, unsigned long, int, __u64);
+};
+
+extern void register_remapper(struct remapper *info);
+
+#endif
+
+/*
+ * Overrides for Emacs so that we follow Linus's tabbing style.
+ * Emacs will notice this stuff at the end of the file and automatically
+ * adjust the settings for this buffer only.  This must remain at the end
+ * of the file.
+ * ---------------------------------------------------------------------------
+ * Local variables:
+ * c-file-style: "linux"
+ * End:
+ */
diff --git a/arch/um/kernel/physmem.c b/arch/um/kernel/physmem.c
new file mode 100644 (file)
index 0000000..d0e0f50
--- /dev/null
@@ -0,0 +1,468 @@
+/* 
+ * Copyright (C) 2000 - 2003 Jeff Dike (jdike@addtoit.com)
+ * Licensed under the GPL
+ */
+
+#include "linux/mm.h"
+#include "linux/ghash.h"
+#include "linux/slab.h"
+#include "linux/vmalloc.h"
+#include "linux/bootmem.h"
+#include "asm/types.h"
+#include "asm/pgtable.h"
+#include "kern_util.h"
+#include "user_util.h"
+#include "mode_kern.h"
+#include "mem.h"
+#include "mem_user.h"
+#include "os.h"
+#include "kern.h"
+#include "init.h"
+
+/* Disabled (#if 0) sketch of a page-table based lookup for substituted
+ * pages.  It is never compiled; the hash table below is what is used.
+ */
+#if 0
+static pgd_t physmem_pgd[PTRS_PER_PGD];
+
+static struct phys_desc *lookup_mapping(void *addr)
+{
+       pgd = &physmem_pgd[pgd_index(addr)];
+       if(pgd_none(pgd))
+               return(NULL);
+
+       pmd = pmd_offset(pgd, addr);
+       if(pmd_none(pmd))
+               return(NULL);
+
+       pte = pte_offset_kernel(pmd, addr);
+       return((struct phys_desc *) pte_val(pte));
+}
+
+static struct add_mapping(void *addr, struct phys_desc *new)
+{
+}
+#endif
+
+#define PHYS_HASHSIZE (8192)
+
+struct phys_desc;
+
+DEF_HASH_STRUCTS(virtmem, PHYS_HASHSIZE, struct phys_desc);
+
+/* Describes one page whose contents have been substituted by a mapping of
+ * a file descriptor (see physmem_subst_mapping()).
+ */
+struct phys_desc {
+       struct virtmem_ptrs virt_ptrs;  /* hash table linkage */
+       int fd;                         /* descriptor mapped over the page */
+       __u64 offset;                   /* offset within fd */
+       void *virt;                     /* address the hash is keyed on */
+       unsigned long phys;             /* __pa(virt) */
+       struct list_head list;          /* entry in desc_mapping.pages */
+};
+
+struct virtmem_table virtmem_hash;
+
+/* Hash comparison callback: returns 0 when the two addresses are equal and
+ * 1 when they differ.
+ */
+static int virt_cmp(void *virt1, void *virt2)
+{
+       if(virt1 == virt2)
+               return(0);
+       return(1);
+}
+
+/* Hash callback: the page number of the address, reduced modulo the table
+ * size.
+ */
+static int virt_hash(void *virt)
+{
+       unsigned long page_nr = (unsigned long) virt;
+
+       page_nr >>= PAGE_SHIFT;
+       return(page_nr % PHYS_HASHSIZE);
+}
+
+DEF_HASH(static, virtmem, struct phys_desc, virt_ptrs, void *, virt, virt_cmp, 
+        virt_hash);
+
+LIST_HEAD(descriptor_mappings);
+
+/* Per-descriptor record of the pages that currently map that descriptor. */
+struct desc_mapping {
+       int fd;                         /* the descriptor */
+       struct list_head list;          /* entry in descriptor_mappings */
+       struct list_head pages;         /* list of struct phys_desc */
+};
+
+/* Look up the desc_mapping recorded for fd in descriptor_mappings.
+ * Returns NULL when there is none.
+ */
+static struct desc_mapping *find_mapping(int fd)
+{
+       struct list_head *pos;
+
+       list_for_each(pos, &descriptor_mappings){
+               struct desc_mapping *candidate;
+
+               candidate = list_entry(pos, struct desc_mapping, list);
+               if(candidate->fd != fd)
+                       continue;
+               return(candidate);
+       }
+
+       return(NULL);
+}
+
+/* Return the desc_mapping for fd, creating and registering an empty one if
+ * none exists yet.  Returns NULL if the allocation fails.
+ */
+static struct desc_mapping *descriptor_mapping(int fd)
+{
+       struct desc_mapping *desc;
+
+       desc = find_mapping(fd);
+       if(desc != NULL)
+               return(desc);
+
+       desc = kmalloc(sizeof(*desc), GFP_ATOMIC);
+       if(desc == NULL)
+               return(NULL);
+
+       *desc = ((struct desc_mapping) 
+               { .fd =         fd,
+                 .list =       LIST_HEAD_INIT(desc->list),
+                 .pages =      LIST_HEAD_INIT(desc->pages) });
+       list_add(&desc->list, &descriptor_mappings);
+
+       return(desc);
+}
+
+/* Substitute the page containing 'virt' with a mapping of 'fd' at 'offset'.
+ * The substitution is recorded in the virtmem hash and on the descriptor's
+ * page list.  'w' is passed to os_map_memory() as the write permission.
+ * Panics if 'virt' is already substituted.
+ * Returns 0 on success, -ENOMEM if a record cannot be allocated, or the
+ * error returned by os_map_memory().
+ */
+int physmem_subst_mapping(void *virt, int fd, __u64 offset, int w)
+{
+       struct desc_mapping *fd_maps;
+       struct phys_desc *desc;
+       unsigned long phys;
+       int err;
+
+       fd_maps = descriptor_mapping(fd);
+       if(fd_maps == NULL)
+               return(-ENOMEM);
+
+       phys = __pa(virt);
+       if(find_virtmem_hash(&virtmem_hash, virt) != NULL)
+               panic("Address 0x%p is already substituted\n", virt);
+
+       err = -ENOMEM;
+       desc = kmalloc(sizeof(*desc), GFP_ATOMIC);
+       if(desc == NULL)
+               goto out;
+
+       *desc = ((struct phys_desc) 
+               { .virt_ptrs =  { NULL, NULL },
+                 .fd =         fd,
+                 .offset =             offset,
+                 .virt =               virt,
+                 .phys =               phys,
+                 .list =               LIST_HEAD_INIT(desc->list) });
+       insert_virtmem_hash(&virtmem_hash, desc);
+
+       list_add(&desc->list, &fd_maps->pages);
+
+       virt = (void *) ((unsigned long) virt & PAGE_MASK);
+       err = os_map_memory(virt, fd, offset, PAGE_SIZE, 1, w, 0);
+       if(!err)
+               goto out;
+
+       /* Mapping failed: take the record off both the page list and the
+        * hash before freeing it, so that neither keeps a dangling pointer.
+        */
+       list_del(&desc->list);
+       remove_virtmem_hash(&virtmem_hash, desc);
+       kfree(desc);
+ out:
+       return(err);
+}
+
+static int physmem_fd = -1;
+
+/* Undo one substitution: drop the record from the hash and its page list,
+ * free it, and map the page back onto physmem_fd at offset __pa(virt).
+ * Panics if the remapping fails.
+ */
+static void remove_mapping(struct phys_desc *desc)
+{
+       /* saved because desc is freed before the address is needed */
+       void *virt = desc->virt;
+       int err;
+
+       remove_virtmem_hash(&virtmem_hash, desc);
+       list_del(&desc->list);
+       kfree(desc);
+
+       err = os_map_memory(virt, physmem_fd, __pa(virt), PAGE_SIZE, 1, 1, 0);
+       if(err)
+               panic("Failed to unmap block device page from physical memory, "
+                     "errno = %d", -err);
+}
+
+/* Remove the substitution, if any, for the page containing 'virt'.
+ * Returns 1 if a substitution was removed and 0 if there was none.
+ */
+int physmem_remove_mapping(void *virt)
+{
+       unsigned long page_addr = (unsigned long) virt & PAGE_MASK;
+       struct phys_desc *found;
+
+       found = find_virtmem_hash(&virtmem_hash, (void *) page_addr);
+       if(found != NULL){
+               remove_mapping(found);
+               return(1);
+       }
+
+       return(0);
+}
+
+/* Drop every substituted page recorded for 'fd'.  For each page the normal
+ * mapping is restored with remove_mapping(), then fd is positioned at the
+ * page's offset and PAGE_SIZE bytes are read from it into the page's
+ * address.  Finally the descriptor's record is unlinked and freed.
+ * Does nothing if no record exists for fd; panics if a seek or read fails.
+ */
+void physmem_forget_descriptor(int fd)
+{
+       struct desc_mapping *desc;
+       struct phys_desc *page;
+       struct list_head *ele, *next;
+       __u64 offset;
+       void *addr;
+       int err;
+
+       desc = find_mapping(fd);
+       if(desc == NULL)
+               return;
+
+       list_for_each_safe(ele, next, &desc->pages){
+               page = list_entry(ele, struct phys_desc, list);
+               /* remove_mapping() frees page, so copy what is needed first */
+               offset = page->offset;
+               addr = page->virt;
+               remove_mapping(page);
+               err = os_seek_file(fd, offset);
+               if(err)
+                       panic("physmem_forget_descriptor - failed to seek "
+                             "to %lld in fd %d, error = %d\n",
+                             offset, fd, -err);
+               err = os_read_file(fd, addr, PAGE_SIZE);
+               if(err < 0)
+                       panic("physmem_forget_descriptor - failed to read "
+                             "from fd %d to 0x%p, error = %d\n",
+                             fd, addr, -err);
+       }
+
+       list_del(&desc->list);
+       kfree(desc);
+}
+
+/* Called when a block of 2^order pages starting at 'page' is freed: removes
+ * any substitution that is still in place for each page of the block.
+ */
+void arch_free_page(struct page *page, int order)
+{
+       int nr;
+
+       for(nr = 0; nr < (1 << order); nr++)
+               physmem_remove_mapping(__va(page_to_phys(page + nr)));
+}
+
+/* Returns 1 if 'virt' has an entry in the virtmem hash, 0 otherwise. */
+int is_remapped(void *virt)
+{
+       struct phys_desc *entry = find_virtmem_hash(&virtmem_hash, virt);
+
+       return(entry != NULL);
+}
+
+/* Changed during early boot */
+unsigned long high_physmem;
+
+extern unsigned long physmem_size;
+
+/* Convert a physical address to a virtual one by adding it to uml_physmem. */
+void *to_virt(unsigned long phys)
+{
+       void *base = (void *) uml_physmem;
+
+       return(base + phys);
+}
+
+/* Convert a virtual address to a physical one by subtracting uml_physmem. */
+unsigned long to_phys(void *virt)
+{
+       unsigned long addr = (unsigned long) virt;
+
+       return(addr - uml_physmem);
+}
+
+/* Allocate and initialize the page array (mem_map) covering physical
+ * memory, iomem and highmem; the three arguments are sizes in bytes.
+ * Every page starts reserved with a zero count.
+ * Returns 0 on success and -ENOMEM if the array cannot be allocated.
+ */
+int init_maps(unsigned long physmem, unsigned long iomem, unsigned long highmem)
+{
+       struct page *p, *map;
+       unsigned long phys_len, phys_pages, highmem_len, highmem_pages;
+       unsigned long iomem_len, iomem_pages, total_len, total_pages;
+       unsigned long i;
+
+       phys_pages = physmem >> PAGE_SHIFT;
+       phys_len = phys_pages * sizeof(struct page);
+
+       iomem_pages = iomem >> PAGE_SHIFT;
+       iomem_len = iomem_pages * sizeof(struct page);
+
+       highmem_pages = highmem >> PAGE_SHIFT;
+       highmem_len = highmem_pages * sizeof(struct page);
+
+       total_pages = phys_pages + iomem_pages + highmem_pages;
+       /* Sum the byte lengths: adding the iomem page count here would
+        * leave the array too small for the loop below.
+        */
+       total_len = phys_len + iomem_len + highmem_len;
+
+       if(kmalloc_ok){
+               map = kmalloc(total_len, GFP_KERNEL);
+               if(map == NULL) 
+                       map = vmalloc(total_len);
+       }
+       else map = alloc_bootmem_low_pages(total_len);
+
+       if(map == NULL)
+               return(-ENOMEM);
+
+       for(i = 0; i < total_pages; i++){
+               p = &map[i];
+               set_page_count(p, 0);
+               SetPageReserved(p);
+               INIT_LIST_HEAD(&p->lru);
+       }
+
+       mem_map = map;
+       max_mapnr = total_pages;
+       return(0);
+}
+
+/* Return the mem_map entry for the page containing physical address 'phys'. */
+struct page *phys_to_page(const unsigned long phys)
+{
+       const unsigned long pfn = phys >> PAGE_SHIFT;
+
+       return(mem_map + pfn);
+}
+
+/* Return the mem_map entry for the page containing virtual address 'virt'. */
+struct page *__virt_to_page(const unsigned long virt)
+{
+       return(mem_map + (__pa(virt) >> PAGE_SHIFT));
+}
+
+/* Return the physical address of 'page': its index in mem_map shifted left
+ * by PAGE_SHIFT.
+ */
+unsigned long page_to_phys(struct page *page)
+{
+       return((page - mem_map) << PAGE_SHIFT);
+}
+
+/* Build a pte from the physical address of 'page' plus the protection bits
+ * in 'pgprot'.
+ */
+pte_t mk_pte(struct page *page, pgprot_t pgprot)
+{
+       pte_t pte;
+
+       pte_val(pte) = page_to_phys(page) + pgprot_val(pgprot);
+       /* NOTE(review): the result of pte_mknewprot(pte_mknewpage(pte)) is
+        * discarded.  If these helpers return a modified copy instead of
+        * changing their argument in place, this line has no effect -
+        * confirm against their definitions.
+        */
+       if(pte_present(pte)) pte_mknewprot(pte_mknewpage(pte));
+       return(pte);
+}
+
+/* Changed during early boot */
+static unsigned long kmem_top = 0;
+
+/* Return the cached kmem_top, computing it through CHOOSE_MODE on the first
+ * call (while it is still 0).
+ */
+unsigned long get_kmem_end(void)
+{
+       if(kmem_top != 0)
+               return(kmem_top);
+
+       kmem_top = CHOOSE_MODE(kmem_end_tt, kmem_end_skas);
+       return(kmem_top);
+}
+
+/* Map 'len' bytes of the object backing physical address 'phys' at virtual
+ * address 'virt' with the given r/w/x permissions.  The descriptor and
+ * offset come from phys_mapping().  Panics if os_map_memory() fails.
+ */
+void map_memory(unsigned long virt, unsigned long phys, unsigned long len, 
+               int r, int w, int x)
+{
+       __u64 offset;
+       int fd, err;
+
+       fd = phys_mapping(phys, &offset);
+       err = os_map_memory((void *) virt, fd, offset, len, r, w, x);
+       if(err)
+               panic("map_memory(0x%lx, %d, 0x%llx, %ld, %d, %d, %d) failed, "
+                     "err = %d\n", virt, fd, offset, len, r, w, x, err);
+}
+
+#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
+
+/* Create the file backing physical memory (len + highmem bytes), map it
+ * from uml_reserved onwards, and hand the memory after reserve_end to the
+ * bootmem allocator.  Exits the process if the mapping fails.
+ */
+void setup_physmem(unsigned long start, unsigned long reserve_end,
+                  unsigned long len, unsigned long highmem)
+{
+       unsigned long reserve = reserve_end - start;
+       int pfn = PFN_UP(__pa(reserve_end));
+       int delta = (len - reserve) >> PAGE_SHIFT;
+       int err, offset, bootmap_size;
+
+       physmem_fd = create_mem_file(len + highmem);
+
+       offset = uml_reserved - uml_physmem;
+       err = os_map_memory((void *) uml_reserved, physmem_fd, offset, 
+                           len - offset, 1, 1, 0);
+       if(err < 0){
+               os_print_error(err, "Mapping memory");
+               exit(1);
+       }
+
+       /* the bootmem bitmap itself sits right after reserve_end, so the
+        * freed range starts past it
+        */
+       bootmap_size = init_bootmem(pfn, pfn + delta);
+       free_bootmem(__pa(reserve_end) + bootmap_size,
+                    len - bootmap_size - reserve);
+}
+
+/* Find the descriptor and offset that back physical address 'phys'.
+ * Checked in order: a substituted page in the virtmem hash, ordinary
+ * physical memory, an iomem region, then highmem.
+ * Returns the descriptor and sets *offset_out, or returns -1 (leaving
+ * *offset_out untouched) when nothing matches.
+ */
+int phys_mapping(unsigned long phys, __u64 *offset_out)
+{
+       struct phys_desc *desc = find_virtmem_hash(&virtmem_hash, 
+                                                  __va(phys & PAGE_MASK));
+       int fd = -1;
+
+       if(desc != NULL){
+               fd = desc->fd;
+               *offset_out = desc->offset;
+       }
+       else if(phys < physmem_size){
+               fd = physmem_fd;
+               *offset_out = phys;
+       }
+       else if(phys < __pa(end_iomem)){
+               struct iomem_region *region = iomem_regions;
+       
+               while(region != NULL){
+                       if((phys >= region->phys) && 
+                          (phys < region->phys + region->size)){
+                               fd = region->fd;
+                               *offset_out = phys - region->phys;
+                               break;
+                       }
+                       region = region->next;
+               }
+       }
+       else if(phys < __pa(end_iomem) + highmem){
+               fd = physmem_fd;
+               *offset_out = phys - iomem_size;
+       }
+
+       return(fd);
+}
+
+/* Handler for the "mem=" option: parses the size with memparse() and stores
+ * it in physmem_size.  'add' is not used.  Always returns 0.
+ */
+static int __init uml_mem_setup(char *line, int *add)
+{
+       char *retptr;
+       physmem_size = memparse(line,&retptr);
+       return 0;
+}
+__uml_setup("mem=", uml_mem_setup,
+"mem=<Amount of desired ram>\n"
+"    This controls how much \"physical\" memory the kernel allocates\n"
+"    for the system. The size is specified as a number followed by\n"
+"    one of 'k', 'K', 'm', 'M', which have the obvious meanings.\n"
+"    This is not related to the amount of memory in the host.  It can\n"
+"    be more, and the excess, if it's ever used, will just be swapped out.\n"
+"      Example: mem=64M\n\n"
+);
+
+/* Look up the iomem region registered for 'driver'.
+ * On a match, stores the region's size in *len_out and returns its virtual
+ * address.  Returns 0, leaving *len_out untouched, when no region matches.
+ */
+unsigned long find_iomem(char *driver, unsigned long *len_out)
+{
+       struct iomem_region *region;
+
+       /* Advance through the list; without the step this would spin
+        * forever whenever the first region is not the one wanted.
+        */
+       for(region = iomem_regions; region != NULL; region = region->next){
+               if(!strcmp(region->driver, driver)){
+                       *len_out = region->size;
+                       return(region->virt);
+               }
+       }
+
+       return(0);
+}
+
+/* Map every iomem region into the address space, starting one page above
+ * high_physmem and leaving one unmapped page between regions.  A region
+ * that fails to map is reported and skipped; its address range is still
+ * consumed.  Always returns 0.
+ */
+int setup_iomem(void)
+{
+       struct iomem_region *region = iomem_regions;
+       unsigned long iomem_start = high_physmem + PAGE_SIZE;
+       int err;
+
+       while(region != NULL){
+               err = os_map_memory((void *) iomem_start, region->fd, 0, 
+                                   region->size, 1, 1, 0);
+               if(err)
+                       printk("Mapping iomem region for driver '%s' failed, "
+                              "errno = %d\n", region->driver, -err);
+               else {
+                       region->virt = iomem_start;
+                       region->phys = __pa(region->virt);
+               }
+
+               iomem_start += region->size + PAGE_SIZE;
+               region = region->next;
+       }
+
+       return(0);
+}
+
+__initcall(setup_iomem);
+
+/*
+ * Overrides for Emacs so that we follow Linus's tabbing style.
+ * Emacs will notice this stuff at the end of the file and automatically
+ * adjust the settings for this buffer only.  This must remain at the end
+ * of the file.
+ * ---------------------------------------------------------------------------
+ * Local variables:
+ * c-file-style: "linux"
+ * End:
+ */
diff --git a/arch/um/kernel/skas/uaccess.c b/arch/um/kernel/skas/uaccess.c
new file mode 100644 (file)
index 0000000..ea82f19
--- /dev/null
@@ -0,0 +1,219 @@
+/* 
+ * Copyright (C) 2002 - 2003 Jeff Dike (jdike@addtoit.com)
+ * Licensed under the GPL
+ */
+
+#include "linux/stddef.h"
+#include "linux/kernel.h"
+#include "linux/string.h"
+#include "linux/fs.h"
+#include "linux/highmem.h"
+#include "asm/page.h"
+#include "asm/pgtable.h"
+#include "asm/uaccess.h"
+#include "kern_util.h"
+
+extern void *um_virt_to_phys(struct task_struct *task, unsigned long addr, 
+                            pte_t *pte_out);
+
+/*
+ * Translate a user virtual address to a physical address, calling
+ * handle_page_fault() first when the page is not mapped, or not writable
+ * while is_write is set.
+ *
+ * Returns the physical address, or -1 (as unsigned long) on failure.  The
+ * failure value has to be -1 because that is the sentinel do_op() tests
+ * for; returning 0 would make do_op() look up and touch physical page 0.
+ */
+static unsigned long maybe_map(unsigned long virt, int is_write)
+{
+       pte_t pte;
+       int err;
+
+       void *phys = um_virt_to_phys(current, virt, &pte);
+       int dummy_code;
+
+       if(IS_ERR(phys) || (is_write && !pte_write(pte))){
+               err = handle_page_fault(virt, 0, is_write, 0, &dummy_code);
+               if(err)
+                       return(-1UL);
+               phys = um_virt_to_phys(current, virt, NULL);
+       }
+       /* The second lookup can fail too; never hand back an ERR_PTR value */
+       if(IS_ERR(phys))
+               return(-1UL);
+
+       return((unsigned long) phys);
+}
+
+/*
+ * Run 'op' on one chunk of user memory.
+ *
+ * The user address is translated with maybe_map(), the backing page is
+ * kmap'ed, and op is called with the kernel address of the same offset
+ * within that page.  Returns -1 if maybe_map() reports -1, otherwise
+ * whatever op returns.
+ */
+static int do_op(unsigned long addr, int len, int is_write, 
+                int (*op)(unsigned long addr, int len, void *arg), void *arg)
+{
+       unsigned long phys, kaddr;
+       struct page *pg;
+       int ret;
+
+       phys = maybe_map(addr, is_write);
+       if(phys == -1)
+               return(-1);
+
+       pg = phys_to_page(phys);
+       kaddr = (unsigned long) kmap(pg) + (phys & ~PAGE_MASK);
+       ret = (*op)(kaddr, len, arg);
+       kunmap(pg);
+
+       return(ret);
+}
+
+/*
+ * Apply 'op' to the user range starting at addr, len bytes long, in chunks
+ * that never cross a page boundary (do_op() kmaps one page per call).
+ *
+ * The range is handled in three steps: the bytes up to the next page
+ * boundary, then whole pages, then whatever is left.  Every do_op() result
+ * is interpreted the same way: 0 means carry on, a negative value makes
+ * this function return the number of bytes not yet processed, and a
+ * positive value stops the walk with a return of 0.
+ */
+static int buffer_op(unsigned long addr, int len, int is_write,
+                    int (*op)(unsigned long addr, int len, void *arg),
+                    void *arg)
+{
+       /* Bytes up to the next page boundary (0 if already aligned), capped at len */
+       int size = min(PAGE_ALIGN(addr) - addr, (unsigned long) len);
+       int remain = len, n;
+
+       n = do_op(addr, size, is_write, op, arg);
+       if(n != 0)
+               return(n < 0 ? remain : 0);
+
+       addr += size;
+       remain -= size;
+       if(remain == 0) 
+               return(0);
+
+       /* Whole pages, for as long as the end of the range lies in a later page */
+       while(addr < ((addr + remain) & PAGE_MASK)){
+               n = do_op(addr, PAGE_SIZE, is_write, op, arg);
+               if(n != 0)
+                       return(n < 0 ? remain : 0);
+
+               addr += PAGE_SIZE;
+               remain -= PAGE_SIZE;
+       }
+       if(remain == 0)
+               return(0);
+
+       /* Trailing partial page */
+       n = do_op(addr, remain, is_write, op, arg);
+       if(n != 0)
+               return(n < 0 ? remain : 0);
+       return(0);
+}
+
+/*
+ * buffer_op() callback: copy len bytes from 'from' to the destination
+ * address stored in *arg, then advance that stored address by len.
+ * Always returns 0.
+ */
+static int copy_chunk_from_user(unsigned long from, int len, void *arg)
+{
+       unsigned long *dest = arg;
+
+       memcpy((void *) *dest, (void *) from, len);
+       *dest += len;
+       return(0);
+}
+
+/*
+ * copy_from_user() for skas mode.
+ *
+ * When get_fs() equals KERNEL_DS the data is copied directly with memcpy().
+ * Otherwise the source range is checked with access_ok_skas() and copied
+ * chunk by chunk through buffer_op().
+ *
+ * Returns 0 on success, n if the access check fails, or the value reported
+ * by buffer_op().
+ */
+int copy_from_user_skas(void *to, const void *from, int n)
+{
+       if(segment_eq(get_fs(), KERNEL_DS)){
+               memcpy(to, from, n);
+               return(0);
+       }
+
+       if(!access_ok_skas(VERIFY_READ, from, n))
+               return(n);
+
+       return(buffer_op((unsigned long) from, n, 0, copy_chunk_from_user,
+                        &to));
+}
+
+/*
+ * buffer_op() callback: copy len bytes to 'to' from the source address
+ * stored in *arg, then advance that stored address by len.
+ * Always returns 0.
+ */
+static int copy_chunk_to_user(unsigned long to, int len, void *arg)
+{
+       unsigned long *src = arg;
+
+       memcpy((void *) to, (void *) *src, len);
+       *src += len;
+       return(0);
+}
+
+/*
+ * copy_to_user() for skas mode.
+ *
+ * When get_fs() equals KERNEL_DS the data is copied directly with memcpy().
+ * Otherwise the destination range is checked with access_ok_skas() and
+ * written chunk by chunk through buffer_op() with is_write set.
+ *
+ * Returns 0 on success, n if the access check fails, or the value reported
+ * by buffer_op().
+ */
+int copy_to_user_skas(void *to, const void *from, int n)
+{
+       if(segment_eq(get_fs(), KERNEL_DS)){
+               memcpy(to, from, n);
+               return(0);
+       }
+
+       if(!access_ok_skas(VERIFY_WRITE, to, n))
+               return(n);
+
+       return(buffer_op((unsigned long) to, n, 1, copy_chunk_to_user,
+                        &from));
+}
+
+/*
+ * buffer_op() callback: strncpy() up to len bytes from 'from' into the
+ * destination stored in *arg and advance that destination by the number
+ * of non-NUL bytes copied.
+ *
+ * Returns 1 when fewer than len bytes were copied, i.e. the string ended
+ * inside this chunk, and 0 otherwise.
+ */
+static int strncpy_chunk_from_user(unsigned long from, int len, void *arg)
+{
+       char **cursor = arg;
+       char *dst = *cursor;
+       int copied;
+
+       strncpy(dst, (void *) from, len);
+       copied = strnlen(dst, len);
+       *cursor += copied;
+
+       return(copied < len ? 1 : 0);
+}
+
+/*
+ * strncpy_from_user() for skas mode.
+ *
+ * When get_fs() equals KERNEL_DS the string is copied with strncpy().
+ * Otherwise the first source byte is checked with access_ok_skas() and the
+ * string is copied chunk by chunk through buffer_op().
+ *
+ * Returns strnlen(dst, count) on success and -EFAULT if the access check
+ * fails or buffer_op() returns non-zero.
+ */
+int strncpy_from_user_skas(char *dst, const char *src, int count)
+{
+       char *cursor = dst;
+       int err;
+
+       if(segment_eq(get_fs(), KERNEL_DS)){
+               strncpy(dst, src, count);
+       }
+       else {
+               if(!access_ok_skas(VERIFY_READ, src, 1))
+                       return(-EFAULT);
+
+               err = buffer_op((unsigned long) src, count, 0,
+                               strncpy_chunk_from_user, &cursor);
+               if(err != 0)
+                       return(-EFAULT);
+       }
+       return(strnlen(dst, count));
+}
+
+/* buffer_op() callback: zero len bytes at addr.  Always returns 0. */
+static int clear_chunk(unsigned long addr, int len, void *unused)
+{
+       memset((void *) addr, 0, len);
+       return(0);
+}
+
+/*
+ * Zero len bytes at mem through buffer_op(), without the KERNEL_DS
+ * shortcut or the access_ok_skas() check that clear_user_skas() performs.
+ * Returns buffer_op()'s result.
+ */
+int __clear_user_skas(void *mem, int len)
+{
+       return(buffer_op((unsigned long) mem, len, 1, clear_chunk, NULL));
+}
+
+/*
+ * clear_user() for skas mode.
+ *
+ * When get_fs() equals KERNEL_DS the memory is zeroed with memset().
+ * Otherwise the range is checked with access_ok_skas() and then cleared by
+ * __clear_user_skas().
+ *
+ * Returns 0 on success, len if the access check fails, or the value
+ * reported by buffer_op().
+ */
+int clear_user_skas(void *mem, int len)
+{
+       if(segment_eq(get_fs(), KERNEL_DS)){
+               memset(mem, 0, len);
+               return(0);
+       }
+
+       if(!access_ok_skas(VERIFY_WRITE, mem, len))
+               return(len);
+
+       return(__clear_user_skas(mem, len));
+}
+
+/*
+ * buffer_op() callback: add strnlen() of this chunk to the running total
+ * stored in *arg.  Returns 1 when the string ends inside the chunk (the
+ * length found is less than len), 0 otherwise.
+ */
+static int strnlen_chunk(unsigned long str, int len, void *arg)
+{
+       int *total = arg;
+       int found = strnlen((void *) str, len);
+
+       *total += found;
+       return(found < len ? 1 : 0);
+}
+
+/*
+ * strnlen_user() for skas mode.
+ *
+ * Returns the string length plus one.  When get_fs() equals KERNEL_DS the
+ * length comes straight from strnlen(); otherwise it is accumulated chunk
+ * by chunk through buffer_op(), and -EFAULT is returned if buffer_op()
+ * reports a non-zero result.
+ */
+int strnlen_user_skas(const void *str, int len)
+{
+       int total = 0;
+
+       if(segment_eq(get_fs(), KERNEL_DS))
+               return(strnlen(str, len) + 1);
+
+       if(buffer_op((unsigned long) str, len, 0, strnlen_chunk, &total) != 0)
+               return(-EFAULT);
+       return(total + 1);
+}
+
+/*
+ * Overrides for Emacs so that we follow Linus's tabbing style.
+ * Emacs will notice this stuff at the end of the file and automatically
+ * adjust the settings for this buffer only.  This must remain at the end
+ * of the file.
+ * ---------------------------------------------------------------------------
+ * Local variables:
+ * c-file-style: "linux"
+ * End:
+ */
diff --git a/arch/um/kernel/tt/uaccess.c b/arch/um/kernel/tt/uaccess.c
new file mode 100644 (file)
index 0000000..9c84011
--- /dev/null
@@ -0,0 +1,73 @@
+/* 
+ * Copyright (C) 2000 - 2003 Jeff Dike (jdike@addtoit.com)
+ * Licensed under the GPL
+ */
+
+#include "linux/sched.h"
+#include "asm/uaccess.h"
+
+/*
+ * copy_from_user() for tt mode.  Returns n if access_ok_tt() rejects the
+ * source range, otherwise the result of __do_copy_from_user(), which is
+ * given the current thread's fault_addr and fault_catcher.
+ */
+int copy_from_user_tt(void *to, const void *from, int n)
+{
+       return(access_ok_tt(VERIFY_READ, from, n) ?
+              __do_copy_from_user(to, from, n, &current->thread.fault_addr,
+                                  &current->thread.fault_catcher) :
+              n);
+}
+
+/*
+ * copy_to_user() for tt mode.  Returns n if access_ok_tt() rejects the
+ * destination range, otherwise the result of __do_copy_to_user(), which is
+ * given the current thread's fault_addr and fault_catcher.
+ */
+int copy_to_user_tt(void *to, const void *from, int n)
+{
+       return(access_ok_tt(VERIFY_WRITE, to, n) ?
+              __do_copy_to_user(to, from, n, &current->thread.fault_addr,
+                                &current->thread.fault_catcher) :
+              n);
+}
+
+/*
+ * strncpy_from_user() for tt mode.  Only the first source byte is checked
+ * with access_ok_tt().  Returns the result of __do_strncpy_from_user(),
+ * with any negative result mapped to -EFAULT.
+ */
+int strncpy_from_user_tt(char *dst, const char *src, int count)
+{
+       int copied;
+
+       if(!access_ok_tt(VERIFY_READ, src, 1))
+               return(-EFAULT);
+
+       copied = __do_strncpy_from_user(dst, src, count,
+                                       &current->thread.fault_addr,
+                                       &current->thread.fault_catcher);
+       return(copied < 0 ? -EFAULT : copied);
+}
+
+/*
+ * Clear len bytes at mem via __do_clear_user() without the access_ok_tt()
+ * check that clear_user_tt() performs.  Returns __do_clear_user()'s result.
+ */
+int __clear_user_tt(void *mem, int len)
+{
+       return(__do_clear_user(mem, len,
+                              &current->thread.fault_addr,
+                              &current->thread.fault_catcher));
+}
+
+/*
+ * clear_user() for tt mode.  Returns len if access_ok_tt() rejects the
+ * range, otherwise clears it through __clear_user_tt() and returns that
+ * result.
+ */
+int clear_user_tt(void *mem, int len)
+{
+       return(access_ok_tt(VERIFY_WRITE, mem, len) ?
+              __clear_user_tt(mem, len) : len);
+}
+
+/*
+ * strnlen_user() for tt mode: a direct call to __do_strnlen_user() with the
+ * current thread's fault_addr and fault_catcher.  No access_ok_tt() check
+ * is made here.
+ */
+int strnlen_user_tt(const void *str, int len)
+{
+       return(__do_strnlen_user(str, len,
+                                &current->thread.fault_addr,
+                                &current->thread.fault_catcher));
+}
+
+/*
+ * Overrides for Emacs so that we follow Linus's tabbing style.
+ * Emacs will notice this stuff at the end of the file and automatically
+ * adjust the settings for this buffer only.  This must remain at the end
+ * of the file.
+ * ---------------------------------------------------------------------------
+ * Local variables:
+ * c-file-style: "linux"
+ * End:
+ */
diff --git a/arch/um/os-Linux/user_syms.c b/arch/um/os-Linux/user_syms.c
new file mode 100644 (file)
index 0000000..ef0fb71
--- /dev/null
@@ -0,0 +1,88 @@
+#include "linux/types.h"
+#include "linux/module.h"
+
+/* Some of these are builtin functions (some are not, but could be in the
+ * future), so I *must* declare good prototypes for them and then EXPORT them.
+ * The kernel code uses the macros defined by include/linux/string.h,
+ * so I undef those macros; the userspace code does not include that header,
+ * so I add an EXPORT for the glibc version. */
+
+#undef strlen
+#undef strstr
+#undef memcpy
+#undef memset
+
+extern size_t strlen(const char *);
+extern void *memcpy(void *, const void *, size_t);
+extern void *memset(void *, int, size_t);
+extern int printf(const char *, ...);
+
+EXPORT_SYMBOL(strlen);
+EXPORT_SYMBOL(memcpy);
+EXPORT_SYMBOL(memset);
+EXPORT_SYMBOL(printf);
+
+EXPORT_SYMBOL(strstr);
+
+/* Here, instead, I can provide a fake prototype. Yes, someone cares: genksyms.
+ * However, the modules will use the CRC defined *here*, whether or not it is
+ * correct, so the versions of these symbols will always match.
+ */
+/* Declare sym with a placeholder "int sym(void)" prototype and export it. */
+#define EXPORT_SYMBOL_PROTO(sym)       \
+       int sym(void);                  \
+       EXPORT_SYMBOL(sym);
+
+EXPORT_SYMBOL_PROTO(__errno_location);
+
+EXPORT_SYMBOL_PROTO(access);
+EXPORT_SYMBOL_PROTO(open);
+EXPORT_SYMBOL_PROTO(open64);
+EXPORT_SYMBOL_PROTO(close);
+EXPORT_SYMBOL_PROTO(read);
+EXPORT_SYMBOL_PROTO(write);
+EXPORT_SYMBOL_PROTO(dup2);
+EXPORT_SYMBOL_PROTO(__xstat);
+EXPORT_SYMBOL_PROTO(__lxstat);
+EXPORT_SYMBOL_PROTO(__lxstat64);
+EXPORT_SYMBOL_PROTO(lseek);
+EXPORT_SYMBOL_PROTO(lseek64);
+EXPORT_SYMBOL_PROTO(chown);
+EXPORT_SYMBOL_PROTO(truncate);
+EXPORT_SYMBOL_PROTO(utime);
+EXPORT_SYMBOL_PROTO(chmod);
+EXPORT_SYMBOL_PROTO(rename);
+EXPORT_SYMBOL_PROTO(__xmknod);
+
+EXPORT_SYMBOL_PROTO(symlink);
+EXPORT_SYMBOL_PROTO(link);
+EXPORT_SYMBOL_PROTO(unlink);
+EXPORT_SYMBOL_PROTO(readlink);
+
+EXPORT_SYMBOL_PROTO(mkdir);
+EXPORT_SYMBOL_PROTO(rmdir);
+EXPORT_SYMBOL_PROTO(opendir);
+EXPORT_SYMBOL_PROTO(readdir);
+EXPORT_SYMBOL_PROTO(closedir);
+EXPORT_SYMBOL_PROTO(seekdir);
+EXPORT_SYMBOL_PROTO(telldir);
+
+EXPORT_SYMBOL_PROTO(ioctl);
+
+EXPORT_SYMBOL_PROTO(pread64);
+EXPORT_SYMBOL_PROTO(pwrite64);
+
+EXPORT_SYMBOL_PROTO(statfs);
+EXPORT_SYMBOL_PROTO(statfs64);
+
+EXPORT_SYMBOL_PROTO(getuid);
+
+/*
+ * Overrides for Emacs so that we follow Linus's tabbing style.
+ * Emacs will notice this stuff at the end of the file and automatically
+ * adjust the settings for this buffer only.  This must remain at the end
+ * of the file.
+ * ---------------------------------------------------------------------------
+ * Local variables:
+ * c-file-style: "linux"
+ * End:
+ */
diff --git a/fs/hostfs/Makefile b/fs/hostfs/Makefile
new file mode 100644 (file)
index 0000000..794292e
--- /dev/null
@@ -0,0 +1,26 @@
+# 
+# Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
+# Licensed under the GPL
+#
+
+# struct stat64 changed the inode field name between 2.2 and 2.4 from st_ino
+# to __st_ino.  It stayed in the same place, so as long as the correct name
+# is used, hostfs compiled on 2.2 should work on 2.4 and vice versa.
+
+# Expands to __st_ino when the host's bits/stat.h mentions __st_ino,
+# otherwise to st_ino.
+STAT64_INO_FIELD := $(shell grep -q __st_ino /usr/include/bits/stat.h && \
+                               echo __)st_ino
+
+hostfs-objs := hostfs_kern.o hostfs_user.o
+
+obj-y = 
+obj-$(CONFIG_HOSTFS) += hostfs.o
+
+# The <name>-objs lists of every object named in obj-y and obj-m.
+SINGLE_OBJS = $(foreach f,$(patsubst %.o,%,$(obj-y) $(obj-m)),$($(f)-objs))
+
+# Every *_user.o among those objects, prefixed with $(obj)/.
+USER_OBJS := $(filter %_user.o,$(obj-y) $(obj-m) $(SINGLE_OBJS))
+USER_OBJS := $(foreach file,$(USER_OBJS),$(obj)/$(file))
+
+USER_CFLAGS += -DSTAT64_INO_FIELD=$(STAT64_INO_FIELD)
+
+# The *_user.o objects are compiled with USER_CFLAGS plus any per-file
+# CFLAGS_<object> setting.
+$(USER_OBJS) : %.o: %.c
+       $(CC) $(CFLAGS_$(notdir $@)) $(USER_CFLAGS) -c -o $@ $<
diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
new file mode 100644 (file)
index 0000000..d1f6c33
--- /dev/null
@@ -0,0 +1,79 @@
+#ifndef __UM_FS_HOSTFS
+#define __UM_FS_HOSTFS
+
+#include "os.h"
+
+/* These are exactly the same definitions as in fs.h, but the names are 
+ * changed so that this file can be included in both kernel and user files.
+ */
+
+#define HOSTFS_ATTR_MODE       1
+#define HOSTFS_ATTR_UID        2
+#define HOSTFS_ATTR_GID        4
+#define HOSTFS_ATTR_SIZE       8
+#define HOSTFS_ATTR_ATIME      16
+#define HOSTFS_ATTR_MTIME      32
+#define HOSTFS_ATTR_CTIME      64
+#define HOSTFS_ATTR_ATIME_SET  128
+#define HOSTFS_ATTR_MTIME_SET  256
+#define HOSTFS_ATTR_FORCE      512     /* Not a change, but a change it */
+#define HOSTFS_ATTR_ATTR_FLAG  1024
+
+/*
+ * Attribute-change request passed to set_attr().
+ * NOTE(review): presumably ia_valid is a mask of the HOSTFS_ATTR_* bits
+ * above selecting which of the other fields are meaningful - confirm
+ * against set_attr() in hostfs_user.c.
+ */
+struct hostfs_iattr {
+       unsigned int    ia_valid;
+       mode_t          ia_mode;
+       uid_t           ia_uid;
+       gid_t           ia_gid;
+       loff_t          ia_size;
+       struct timespec ia_atime;
+       struct timespec ia_mtime;
+       struct timespec ia_ctime;
+       unsigned int    ia_attr_flags;
+};
+
+extern int stat_file(const char *path, unsigned long long *inode_out, 
+                    int *mode_out, int *nlink_out, int *uid_out, int *gid_out,
+                    unsigned long long *size_out, struct timespec *atime_out, 
+                    struct timespec *mtime_out, struct timespec *ctime_out, 
+                    int *blksize_out, unsigned long long *blocks_out);
+extern int access_file(char *path, int r, int w, int x);
+extern int open_file(char *path, int r, int w, int append);
+extern int file_type(const char *path, int *rdev);
+extern void *open_dir(char *path, int *err_out);
+extern char *read_dir(void *stream, unsigned long long *pos, 
+                     unsigned long long *ino_out, int *len_out);
+extern void close_file(void *stream);
+extern void close_dir(void *stream);
+extern int read_file(int fd, unsigned long long *offset, char *buf, int len);
+extern int write_file(int fd, unsigned long long *offset, const char *buf,
+                     int len);
+extern int lseek_file(int fd, long long offset, int whence);
+extern int file_create(char *name, int ur, int uw, int ux, int gr, 
+                      int gw, int gx, int or, int ow, int ox);
+extern int set_attr(const char *file, struct hostfs_iattr *attrs);
+extern int make_symlink(const char *from, const char *to);
+extern int unlink_file(const char *file);
+extern int do_mkdir(const char *file, int mode);
+extern int do_rmdir(const char *file);
+extern int do_mknod(const char *file, int mode, int dev);
+extern int link_file(const char *from, const char *to);
+extern int do_readlink(char *file, char *buf, int size);
+extern int rename_file(char *from, char *to);
+extern int do_statfs(char *root, long *bsize_out, long long *blocks_out, 
+                    long long *bfree_out, long long *bavail_out, 
+                    long long *files_out, long long *ffree_out, 
+                    void *fsid_out, int fsid_size, long *namelen_out, 
+                    long *spare_out);
+
+#endif
+
+/*
+ * Overrides for Emacs so that we follow Linus's tabbing style.
+ * Emacs will notice this stuff at the end of the file and automatically
+ * adjust the settings for this buffer only.  This must remain at the end
+ * of the file.
+ * ---------------------------------------------------------------------------
+ * Local variables:
+ * c-file-style: "linux"
+ * End:
+ */
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
new file mode 100644 (file)
index 0000000..ef5d5d1
--- /dev/null
@@ -0,0 +1,1008 @@
+/* 
+ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
+ * Licensed under the GPL
+ *
+ * Ported the filesystem routines to 2.5.
+ * 2003-02-10 Petr Baudis <pasky@ucw.cz>
+ */
+
+#include <linux/stddef.h>
+#include <linux/fs.h>
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/blkdev.h>
+#include <linux/list.h>
+#include <linux/buffer_head.h>
+#include <linux/root_dev.h>
+#include <linux/statfs.h>
+#include <asm/uaccess.h>
+#include "hostfs.h"
+#include "kern_util.h"
+#include "kern.h"
+#include "user_util.h"
+#include "2_5compat.h"
+#include "init.h"
+
+/*
+ * Per-inode hostfs state.  The VFS inode is embedded as vfs_inode and
+ * HOSTFS_I() converts from that embedded inode back to this structure.
+ */
+struct hostfs_inode_info {
+       char *host_filename;    /* freed in hostfs_destroy_inode(); NULL if unset */
+       int fd;                 /* host file descriptor, -1 while nothing is open */
+       int mode;               /* FMODE_READ/FMODE_WRITE bits recorded by open/create */
+       struct inode vfs_inode;
+};
+
+/*
+ * Return the hostfs_inode_info that embeds 'inode' as its vfs_inode member.
+ * NOTE(review): list_entry() is used here purely as a member-to-container
+ * conversion; no list is involved.
+ */
+static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode)
+{
+       return(list_entry(inode, struct hostfs_inode_info, vfs_inode));
+}
+
+/* hostfs_inode_info of the inode behind a struct file */
+#define FILE_HOSTFS_I(file) HOSTFS_I((file)->f_dentry->d_inode)
+
+/*
+ * d_delete hook for hostfs dentries; unconditionally returns 1.
+ * NOTE(review): presumably this asks the dcache not to keep unused
+ * dentries around - confirm against the VFS d_delete contract.
+ */
+int hostfs_d_delete(struct dentry *dentry)
+{
+       return(1);
+}
+
+/* Dentry operations installed on each dentry by hostfs_lookup() */
+struct dentry_operations hostfs_dentry_ops = {
+       .d_delete               = hostfs_d_delete,
+};
+
+/* Changed in hostfs_args before the kernel starts running */
+static char *root_ino = "/";
+static int append = 0;
+
+#define HOSTFS_SUPER_MAGIC 0x00c0ffee
+
+static struct inode_operations hostfs_iops;
+static struct inode_operations hostfs_dir_iops;
+static struct address_space_operations hostfs_link_aops;
+
+/*
+ * Parse the "hostfs=" command line value, modifying it in place.
+ *
+ * The text before the first comma becomes root_ino unless it is empty.
+ * Every following comma-separated item is a flag: "append" sets the
+ * append switch, anything else is reported with printf() and ignored.
+ * Empty items are skipped.  'add' is not used.  Always returns 0.
+ */
+static int __init hostfs_args(char *options, int *add)
+{
+       char *next, *flag;
+
+       next = strchr(options, ',');
+       if(next != NULL)
+               *next++ = '\0';
+       if(*options != '\0')
+               root_ino = options;
+
+       for(flag = next; flag != NULL; flag = next){
+               next = strchr(flag, ',');
+               if(next != NULL)
+                       *next++ = '\0';
+               if(*flag == '\0')
+                       continue;
+
+               if(strcmp(flag, "append") == 0)
+                       append = 1;
+               else printf("hostfs_args - unsupported option - %s\n",
+                           flag);
+       }
+       return(0);
+}
+
+__uml_setup("hostfs=", hostfs_args,
+"hostfs=<root dir>,<flags>,...\n"
+"    This is used to set hostfs parameters.  The root directory argument\n"
+"    is used to confine all hostfs mounts to within the specified directory\n"
+"    tree on the host.  If this isn't specified, then a user inside UML can\n"
+"    mount anything on the host that's accessible to the user that's running\n"
+"    it.\n"
+"    The only flag currently supported is 'append', which specifies that all\n"
+"    files opened by hostfs will be opened in append mode.\n\n"
+);
+
+/*
+ * Build the host path for a dentry: the host_filename of the filesystem
+ * root's inode followed by "/<name>" for each dentry from the root down
+ * to 'dentry'.
+ *
+ * The buffer is kmalloc'ed with room for the path, its terminating NUL and
+ * 'extra' additional bytes, so a caller can append to the result.  Returns
+ * NULL if the allocation fails; otherwise the caller must kfree() the
+ * result.
+ */
+static char *dentry_name(struct dentry *dentry, int extra)
+{
+       struct dentry *parent;
+       char *root, *name;
+       int len;
+
+       /* First pass: add up "/name" for every component below the root */
+       len = 0;
+       parent = dentry;
+       while(parent->d_parent != parent){
+               len += parent->d_name.len + 1;
+               parent = parent->d_parent;
+       }
+       
+       root = HOSTFS_I(parent->d_inode)->host_filename;
+       len += strlen(root);
+       name = kmalloc(len + extra + 1, GFP_KERNEL);
+       if(name == NULL) return(NULL);
+
+       /* Second pass: fill the buffer from the end back towards the root */
+       name[len] = '\0';
+       parent = dentry;
+       while(parent->d_parent != parent){
+               len -= parent->d_name.len + 1;
+               name[len] = '/';
+               strncpy(&name[len + 1], parent->d_name.name, 
+                       parent->d_name.len);
+               parent = parent->d_parent;
+       }
+       /* Exactly strlen(root) bytes, so no NUL lands on the '/' that follows */
+       strncpy(name, root, strlen(root));
+       return(name);
+}
+
+/*
+ * Host path of an inode, built by dentry_name() from the first dentry on
+ * the inode's i_dentry list.  The list is not checked for emptiness here.
+ * Same return and ownership rules as dentry_name().
+ */
+static char *inode_name(struct inode *ino, int extra)
+{
+       struct dentry *alias = list_entry(ino->i_dentry.next, struct dentry,
+                                         d_alias);
+
+       return(dentry_name(alias, extra));
+}
+
+/*
+ * Fill in an inode's attributes from the host file 'name' using
+ * stat_file().  Returns 0 on success or stat_file()'s error, in which case
+ * the locally staged fields are not copied into the inode.
+ */
+static int read_name(struct inode *ino, char *name)
+{
+       /* The non-int inode fields are copied into ints by stat_file and
+        * then copied into the inode because passing the actual pointers
+        * in and having them treated as int * breaks on big-endian machines
+        */
+       int err;
+       int i_mode, i_nlink, i_blksize;
+       unsigned long long i_size;
+       unsigned long long i_ino;
+       unsigned long long i_blocks;
+
+       err = stat_file(name, &i_ino, &i_mode, &i_nlink, &ino->i_uid, 
+                       &ino->i_gid, &i_size, &ino->i_atime, &ino->i_mtime, 
+                       &ino->i_ctime, &i_blksize, &i_blocks);
+       if(err) 
+               return(err);
+
+       ino->i_ino = i_ino;
+       ino->i_mode = i_mode;
+       ino->i_nlink = i_nlink;
+       ino->i_size = i_size;
+       ino->i_blksize = i_blksize;
+       ino->i_blocks = i_blocks;
+       /* On the root device, files owned by the uid running UML show up as
+        * owned by uid 0
+        */
+       if((ino->i_sb->s_dev == ROOT_DEV) && (ino->i_uid == getuid()))
+               ino->i_uid = 0;
+       return(0);
+}
+
+/*
+ * Resolve the symlink at host path 'link' one level.
+ *
+ * The link target is read with do_readlink() into a buffer that is doubled
+ * until the result is shorter than the buffer.  An absolute target is
+ * returned as is; a relative target is appended to the directory part of
+ * 'link' (everything up to and including its last '/').
+ *
+ * Ownership: 'link' must be kmalloc'ed and is always consumed - it is
+ * freed on every return path, including errors.  The caller gets back a
+ * newly allocated string to kfree(), or an ERR_PTR() carrying -ENOMEM or
+ * the do_readlink() error.
+ */
+static char *follow_link(char *link)
+{
+       int len, n;
+       char *name, *resolved, *end;
+
+       len = 64;
+       while(1){
+               n = -ENOMEM;
+               name = kmalloc(len, GFP_KERNEL);
+               if(name == NULL)
+                       goto out;
+
+               n = do_readlink(link, name, len);
+               if(n < len)
+                       break;
+               /* Target may have been truncated; retry with a bigger buffer */
+               len *= 2;
+               kfree(name);
+       }
+       if(n < 0)
+               goto out_free;
+
+       if(*name == '/')
+               goto out_name;
+
+       end = strrchr(link, '/');
+       if(end == NULL)
+               goto out_name;
+
+       /* Cut 'link' after its last '/' to leave the containing directory */
+       *(end + 1) = '\0';
+       len = strlen(link) + strlen(name) + 1;
+
+       resolved = kmalloc(len, GFP_KERNEL);
+       if(resolved == NULL){
+               n = -ENOMEM;
+               goto out_free;
+       }
+
+       sprintf(resolved, "%s%s", link, name);
+       kfree(name);
+       kfree(link);
+       return(resolved);
+
+ out_name:
+       /* The target itself is the answer; the original path is not needed */
+       kfree(link);
+       return(name);
+
+ out_free:
+       kfree(name);
+ out:
+       kfree(link);
+       return(ERR_PTR(n));
+}
+
+/*
+ * Refresh an inode's attributes from the host.
+ *
+ * Does nothing and returns 0 when the inode has no dentry yet.  Otherwise
+ * the host path is built with inode_name(); if it names a symlink the path
+ * is first replaced by the result of follow_link().  Returns -ENOMEM if
+ * the path cannot be allocated, the follow_link() error, or read_name()'s
+ * result.
+ */
+static int read_inode(struct inode *ino)
+{
+       char *name;
+       int err = 0;
+
+       /* Unfortunately, we are called from iget() when we don't have a dentry
+        * allocated yet.
+        */
+       if(list_empty(&ino->i_dentry))
+               goto out;
+       err = -ENOMEM;
+       name = inode_name(ino, 0);
+       if(name == NULL) 
+               goto out;
+
+       if(file_type(name, NULL) == OS_TYPE_SYMLINK){
+               /* The old pointer is overwritten here, so from this point the
+                * original buffer can only be released by follow_link()
+                */
+               name = follow_link(name);
+               if(IS_ERR(name)){
+                       err = PTR_ERR(name);
+                       goto out;
+               }
+       }
+       
+       err = read_name(ino, name);
+       kfree(name);
+ out:
+       return(err);
+}
+
+/*
+ * statfs for hostfs: query the host with do_statfs() for the root inode's
+ * host_filename and copy the results into *sf.
+ *
+ * The block and file counts are fetched into long long temporaries and
+ * assigned afterwards; f_type is set to HOSTFS_SUPER_MAGIC.  Returns 0, or
+ * do_statfs()'s error, in which case the counts and f_type are not stored.
+ */
+int hostfs_statfs(struct super_block *sb, struct kstatfs *sf)
+{
+       long long blocks, bfree, bavail, files, ffree;
+       int ret;
+
+       ret = do_statfs(HOSTFS_I(sb->s_root->d_inode)->host_filename,
+                       &sf->f_bsize, &blocks, &bfree, &bavail, &files,
+                       &ffree, &sf->f_fsid, sizeof(sf->f_fsid),
+                       &sf->f_namelen, sf->f_spare);
+       if(ret)
+               return(ret);
+
+       sf->f_blocks = blocks;
+       sf->f_bfree = bfree;
+       sf->f_bavail = bavail;
+       sf->f_files = files;
+       sf->f_ffree = ffree;
+       sf->f_type = HOSTFS_SUPER_MAGIC;
+       return(0);
+}
+
+/*
+ * alloc_inode super operation: allocate a hostfs_inode_info, zero it, mark
+ * it as having no host file open (fd == -1), run inode_init_once() on the
+ * embedded inode and return that inode.  Returns NULL if kmalloc() fails.
+ */
+static struct inode *hostfs_alloc_inode(struct super_block *sb)
+{
+       struct hostfs_inode_info *info = kmalloc(sizeof(*info), GFP_KERNEL);
+
+       if(info == NULL)
+               return(NULL);
+
+       /* Everything starts out zero/NULL except the descriptor */
+       memset(info, 0, sizeof(*info));
+       info->fd = -1;
+
+       inode_init_once(&info->vfs_inode);
+       return(&info->vfs_inode);
+}
+
+/*
+ * destroy_inode super operation: free the stored host filename, close the
+ * host descriptor if one is open, then free the hostfs_inode_info that
+ * embeds the inode.
+ */
+static void hostfs_destroy_inode(struct inode *inode)
+{
+       struct hostfs_inode_info *info = HOSTFS_I(inode);
+
+       if(info->host_filename != NULL)
+               kfree(info->host_filename);
+
+       if(info->fd != -1)
+               close_file(&info->fd);
+
+       kfree(info);
+}
+
+/*
+ * read_inode super operation: a void wrapper around read_inode(); the
+ * error code read_inode() returns is discarded.
+ */
+static void hostfs_read_inode(struct inode *inode)
+{
+       read_inode(inode);
+}
+
+/* Superblock operations: inode allocation/teardown, read_inode and statfs */
+static struct super_operations hostfs_sbops = { 
+       .alloc_inode    = hostfs_alloc_inode,
+       .destroy_inode  = hostfs_destroy_inode,
+       .read_inode     = hostfs_read_inode,
+       .statfs         = hostfs_statfs,
+};
+
+/*
+ * readdir for hostfs directories.
+ *
+ * Opens the host directory behind 'file' and feeds entries to filldir
+ * starting at file->f_pos.  f_pos is advanced only after filldir accepts
+ * an entry, so an entry that was refused is offered again on the next
+ * call.
+ *
+ * Returns -ENOMEM if the path cannot be built, the negated error from
+ * open_dir() if the directory cannot be opened, and 0 otherwise.
+ */
+int hostfs_readdir(struct file *file, void *ent, filldir_t filldir)
+{
+       unsigned long long pos, inode_no;
+       char *path, *entry;
+       void *stream;
+       int err, namelen;
+
+       path = dentry_name(file->f_dentry, 0);
+       if(path == NULL)
+               return(-ENOMEM);
+
+       stream = open_dir(path, &err);
+       kfree(path);
+       if(stream == NULL)
+               return(-err);
+
+       pos = file->f_pos;
+       for(;;){
+               entry = read_dir(stream, &pos, &inode_no, &namelen);
+               if(entry == NULL)
+                       break;
+
+               err = (*filldir)(ent, entry, namelen, file->f_pos,
+                                inode_no, DT_UNKNOWN);
+               if(err)
+                       break;
+               file->f_pos = pos;
+       }
+       close_dir(stream);
+       return(0);
+}
+
+/*
+ * open for hostfs regular files.
+ *
+ * One host descriptor is kept per inode, together with the FMODE_* bits it
+ * was opened with.  If the access this open asks for is already covered,
+ * nothing is done.  Otherwise the host file is opened again with the union
+ * of the old and new access (write access always implies read) and the
+ * previous descriptor is replaced.
+ *
+ * The new descriptor is opened before the old one is closed and the inode
+ * state is updated only on success, so a failed open leaves the existing
+ * descriptor and mode intact rather than recording access bits for a
+ * descriptor that does not exist.
+ *
+ * Returns 0 on success, -ENOMEM if the path cannot be built, or the error
+ * returned by open_file().
+ */
+int hostfs_file_open(struct inode *ino, struct file *file)
+{
+       struct hostfs_inode_info *hi = HOSTFS_I(ino);
+       char *name;
+       int mode, new_mode, r = 0, w = 0, fd;
+
+       mode = file->f_mode & (FMODE_READ | FMODE_WRITE);
+       if((mode & hi->mode) == mode)
+               return(0);
+
+       /* The file may already have been opened, but with the wrong access,
+        * so this reopens the file with the old and new access combined.
+        */
+       new_mode = hi->mode | mode;
+       if(new_mode & FMODE_READ)
+               r = 1;
+       if(new_mode & FMODE_WRITE)
+               w = 1;
+       if(w)
+               r = 1;
+
+       name = dentry_name(file->f_dentry, 0);
+       if(name == NULL)
+               return(-ENOMEM);
+
+       fd = open_file(name, r, w, append);
+       kfree(name);
+       if(fd < 0)
+               return(fd);
+
+       /* Only now is it safe to drop the old descriptor */
+       if(hi->fd != -1)
+               close_file(&hi->fd);
+       hi->fd = fd;
+       hi->mode = new_mode;
+
+       return(0);
+}
+
+/* fsync for hostfs files: does nothing and reports success. */
+int hostfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+       return(0);
+}
+
+/*
+ * File operations for non-directory inodes: the generic_file_* helpers for
+ * llseek/read/write/mmap plus the hostfs open and fsync routines.
+ */
+static struct file_operations hostfs_file_fops = {
+       .llseek         = generic_file_llseek,
+       .read           = generic_file_read,
+       .write          = generic_file_write,
+       .mmap           = generic_file_mmap,
+       .open           = hostfs_file_open,
+       .release        = NULL,
+       .fsync          = hostfs_fsync,
+};
+
+/* File operations for directory inodes */
+static struct file_operations hostfs_dir_fops = {
+       .readdir        = hostfs_readdir,
+       .read           = generic_read_dir,
+};
+
+/*
+ * writepage: write one page cache page to the host file with write_file().
+ *
+ * A full page is written, except for a page at or beyond the index holding
+ * the end of file, where only i_size modulo the page size bytes are
+ * written.  The page is unlocked on every path.  Returns 0 when
+ * write_file() wrote exactly the requested count; otherwise the page is
+ * marked not up to date and write_file()'s return value is passed back.
+ *
+ * NOTE(review): end_index is an int, so i_size >> PAGE_CACHE_SHIFT is
+ * narrowed for very large files - confirm this is acceptable.
+ */
+int hostfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+       struct address_space *mapping = page->mapping;
+       struct inode *inode = mapping->host;
+       char *buffer;
+       unsigned long long base;
+       int count = PAGE_CACHE_SIZE;
+       int end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+       int err;
+
+       /* Last (partial) page: only write the bytes below i_size */
+       if (page->index >= end_index)
+               count = inode->i_size & (PAGE_CACHE_SIZE-1);
+
+       buffer = kmap(page);
+       base = ((unsigned long long) page->index) << PAGE_CACHE_SHIFT;
+
+       err = write_file(HOSTFS_I(inode)->fd, &base, buffer, count);
+       if(err != count){
+               ClearPageUptodate(page);
+               goto out;
+       }
+
+       /* NOTE(review): presumably write_file() advances base past the bytes
+        * written, which is what lets this grow i_size - confirm in
+        * hostfs_user.c
+        */
+       if (base > inode->i_size)
+               inode->i_size = base;
+
+       if (PageError(page))
+               ClearPageError(page);   
+       err = 0;
+
+ out:  
+       kunmap(page);
+
+       unlock_page(page);
+       return err; 
+}
+
+/*
+ * readpage: fill one page cache page from the host file.
+ *
+ * Reads up to a full page at the page's file offset with read_file(),
+ * zeroes whatever part of the page was not filled, and marks the page up
+ * to date.  The page is unmapped and unlocked on every path.  Returns 0 on
+ * success or the negative value returned by read_file().
+ */
+int hostfs_readpage(struct file *file, struct page *page)
+{
+       long long offset;
+       char *kaddr;
+       int ret;
+
+       offset = (long long) page->index << PAGE_CACHE_SHIFT;
+       kaddr = kmap(page);
+       ret = read_file(FILE_HOSTFS_I(file)->fd, &offset, kaddr,
+                       PAGE_CACHE_SIZE);
+       if(ret >= 0){
+               memset(&kaddr[ret], 0, PAGE_CACHE_SIZE - ret);
+
+               flush_dcache_page(page);
+               SetPageUptodate(page);
+               if(PageError(page))
+                       ClearPageError(page);
+               ret = 0;
+       }
+
+       kunmap(page);
+       unlock_page(page);
+       return(ret);
+}
+
+/*
+ * prepare_write: before bytes from..to of a page are overwritten, load the
+ * parts of the page outside that range from the host file - the first
+ * 'from' bytes when from is non-zero, and the bytes from 'to' to the end
+ * of the page when to is not the page size.
+ *
+ * Returns 0 on success or the negative value returned by read_file().
+ */
+int hostfs_prepare_write(struct file *file, struct page *page, 
+                        unsigned int from, unsigned int to)
+{
+       long long page_off, off;
+       char *kaddr;
+       int ret = 0;
+
+       page_off = (long long) page->index << PAGE_CACHE_SHIFT;
+       kaddr = kmap(page);
+
+       if(from != 0){
+               off = page_off;
+               ret = read_file(FILE_HOSTFS_I(file)->fd, &off, kaddr, from);
+               if(ret < 0)
+                       goto unmap;
+       }
+       if(to != PAGE_CACHE_SIZE){
+               off = page_off + to;
+               ret = read_file(FILE_HOSTFS_I(file)->fd, &off, kaddr + to,
+                               PAGE_CACHE_SIZE - to);
+               if(ret < 0)
+                       goto unmap;
+       }
+       ret = 0;
+
+ unmap:
+       kunmap(page);
+       return(ret);
+}
+
+/*
+ * commit_write: write bytes from..to of the page to the host file at the
+ * matching file offset and grow i_size if the write went past it.
+ *
+ * Any positive result from write_file() is treated as success.  Returns 0
+ * on success, otherwise write_file()'s non-positive result.
+ */
+int hostfs_commit_write(struct file *file, struct page *page, unsigned from,
+                unsigned to)
+{
+       struct address_space *mapping = page->mapping;
+       struct inode *inode = mapping->host;
+       char *buffer;
+       long long start;
+       int err = 0;
+
+       /* Widen the index before shifting; shifting first would truncate
+        * the offset where unsigned long is 32 bits
+        */
+       start = (((long long) page->index) << PAGE_CACHE_SHIFT) + from;
+       buffer = kmap(page);
+       err = write_file(FILE_HOSTFS_I(file)->fd, &start, buffer + from, 
+                        to - from);
+       if(err > 0) err = 0;
+       if(!err && (start > inode->i_size))
+               inode->i_size = start;
+
+       kunmap(page);
+       return(err);
+}
+
+/*
+ * Address space operations for everything except symlinks (init_inode()
+ * gives symlinks hostfs_link_aops instead).
+ */
+static struct address_space_operations hostfs_aops = {
+       .writepage      = hostfs_writepage,
+       .readpage       = hostfs_readpage,
+/*     .set_page_dirty = __set_page_dirty_nobuffers, */
+       .prepare_write  = hostfs_prepare_write,
+       .commit_write   = hostfs_commit_write
+};
+
+/*
+ * Install the inode, file and address space operations that match the type
+ * of the host file behind 'dentry'.  A NULL dentry is treated as a
+ * directory.  Device nodes, FIFOs and sockets are additionally passed to
+ * init_special_inode().
+ *
+ * Returns 0, or -ENOMEM if the host path cannot be allocated.
+ */
+static int init_inode(struct inode *inode, struct dentry *dentry)
+{
+       char *name;
+       int type, rdev;
+
+       if(dentry == NULL)
+               type = OS_TYPE_DIR;
+       else {
+               name = dentry_name(dentry, 0);
+               if(name == NULL)
+                       return(-ENOMEM);
+               type = file_type(name, &rdev);
+               kfree(name);
+       }
+
+       if(type == OS_TYPE_SYMLINK){
+               inode->i_op = &page_symlink_inode_operations;
+               inode->i_fop = &hostfs_file_fops;
+               inode->i_mapping->a_ops = &hostfs_link_aops;
+       }
+       else if(type == OS_TYPE_DIR){
+               inode->i_op = &hostfs_dir_iops;
+               inode->i_fop = &hostfs_dir_fops;
+               inode->i_mapping->a_ops = &hostfs_aops;
+       }
+       else {
+               inode->i_op = &hostfs_iops;
+               inode->i_fop = &hostfs_file_fops;
+               inode->i_mapping->a_ops = &hostfs_aops;
+       }
+
+       switch (type) {
+       case OS_TYPE_CHARDEV:
+               init_special_inode(inode, S_IFCHR, rdev);
+               break;
+       case OS_TYPE_BLOCKDEV:
+               init_special_inode(inode, S_IFBLK, rdev);
+               break;
+       case OS_TYPE_FIFO:
+               init_special_inode(inode, S_IFIFO, 0);
+               break;
+       case OS_TYPE_SOCK:
+               init_special_inode(inode, S_IFSOCK, 0);
+               break;
+       }
+
+       return(0);
+}
+
+/*
+ * create: make a new regular file on the host and instantiate 'dentry'
+ * with an inode for it.
+ *
+ * The permission bits of 'mode' are handed to file_create() one by one.
+ * On success the descriptor returned by file_create() is kept in the inode
+ * with read/write access recorded.  If the attributes of the new file
+ * cannot be read afterwards, that descriptor is closed again so it does
+ * not leak.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or the error from
+ * init_inode(), file_create() or read_name().
+ */
+int hostfs_create(struct inode *dir, struct dentry *dentry, int mode, 
+                 struct nameidata *nd)
+{
+       struct inode *inode;
+       char *name;
+       int error, fd;
+
+       error = -ENOMEM;
+       inode = iget(dir->i_sb, 0);
+       if(inode == NULL) goto out;
+
+       error = init_inode(inode, dentry);
+       if(error) 
+               goto out_put;
+       
+       error = -ENOMEM;
+       name = dentry_name(dentry, 0);
+       if(name == NULL)
+               goto out_put;
+
+       fd = file_create(name, 
+                        mode & S_IRUSR, mode & S_IWUSR, mode & S_IXUSR, 
+                        mode & S_IRGRP, mode & S_IWGRP, mode & S_IXGRP, 
+                        mode & S_IROTH, mode & S_IWOTH, mode & S_IXOTH);
+       if(fd < 0) 
+               error = fd;
+       else {
+               error = read_name(inode, name);
+               /* Nobody will own the descriptor if we bail out below */
+               if(error)
+                       close_file(&fd);
+       }
+
+       kfree(name);
+       if(error)
+               goto out_put;
+
+       HOSTFS_I(inode)->fd = fd;
+       HOSTFS_I(inode)->mode = FMODE_READ | FMODE_WRITE;
+       d_instantiate(dentry, inode);
+       return(0);
+
+ out_put:
+       iput(inode);
+ out:
+       return(error);
+}
+
+/*
+ * Look up 'dentry' on the host.  A host file that does not exist
+ * (read_name() returning -ENOENT) yields a negative dentry; any other
+ * failure is returned as an ERR_PTR.  Returns NULL once the dentry has
+ * been added.
+ */
+struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry, 
+                            struct nameidata *nd)
+{
+       struct inode *inode;
+       char *path;
+       int err;
+
+       inode = iget(ino->i_sb, 0);
+       if(inode == NULL)
+               return(ERR_PTR(-ENOMEM));
+
+       err = init_inode(inode, dentry);
+       if(err)
+               goto fail;
+
+       path = dentry_name(dentry, 0);
+       if(path == NULL){
+               err = -ENOMEM;
+               goto fail;
+       }
+
+       err = read_name(inode, path);
+       kfree(path);
+       if((err != 0) && (err != -ENOENT))
+               goto fail;
+
+       if(err == -ENOENT){
+               /* No such host file: drop the inode, add a negative dentry */
+               iput(inode);
+               inode = NULL;
+       }
+
+       d_add(dentry, inode);
+       dentry->d_op = &hostfs_dentry_ops;
+       return(NULL);
+
+ fail:
+       iput(inode);
+       return(ERR_PTR(err));
+}
+
+/*
+ * Build "<path of ino>/<name of dentry>" in a buffer obtained from
+ * inode_name(), which is asked for enough extra room for the separator
+ * and the name.  Returns NULL if inode_name() fails; the caller releases
+ * the result with kfree().
+ */
+static char *inode_dentry_name(struct inode *ino, struct dentry *dentry)
+{
+       char *path;
+       int dir_len;
+
+       path = inode_name(ino, dentry->d_name.len + 1);
+       if(path == NULL)
+               return(NULL);
+
+       strcat(path, "/");
+       dir_len = strlen(path);
+       strncat(path, dentry->d_name.name, dentry->d_name.len);
+       path[dir_len + dentry->d_name.len] = '\0';
+       return(path);
+}
+
+/*
+ * Hard-link: passes the host path of 'to' and the path of 'from' inside
+ * directory 'ino' to link_file().  Returns its result, or -ENOMEM if
+ * either path could not be allocated.
+ */
+int hostfs_link(struct dentry *to, struct inode *ino, struct dentry *from)
+{
+       char *from_name, *to_name;
+       int err;
+
+       from_name = inode_dentry_name(ino, from);
+       if(from_name == NULL)
+               return(-ENOMEM);
+
+       to_name = dentry_name(to, 0);
+       if(to_name == NULL){
+               err = -ENOMEM;
+               goto out_from;
+       }
+
+       err = link_file(to_name, from_name);
+       kfree(to_name);
+ out_from:
+       kfree(from_name);
+       return(err);
+}
+
+/*
+ * Remove the host file named by 'dentry' inside directory 'ino'.
+ * Refused with -EPERM when the 'append' flag is set.  The flag is tested
+ * before the path is allocated so that the refusal path has nothing to
+ * free.
+ *
+ * Returns 0, -EPERM, -ENOMEM, or the error from unlink_file().
+ */
+int hostfs_unlink(struct inode *ino, struct dentry *dentry)
+{
+       char *file;
+       int err;
+
+       if(append)
+               return(-EPERM);
+
+       if((file = inode_dentry_name(ino, dentry)) == NULL) return(-ENOMEM);
+
+       err = unlink_file(file);
+       kfree(file);
+       return(err);
+}
+
+/*
+ * Create a host symlink at the path of 'dentry' in directory 'ino',
+ * with 'to' as the link contents.  Returns make_symlink()'s result or
+ * -ENOMEM.
+ */
+int hostfs_symlink(struct inode *ino, struct dentry *dentry, const char *to)
+{
+       char *path;
+       int err;
+
+       path = inode_dentry_name(ino, dentry);
+       if(path == NULL)
+               return(-ENOMEM);
+
+       err = make_symlink(path, to);
+       kfree(path);
+       return(err);
+}
+
+/*
+ * Create a host directory at the path of 'dentry' in directory 'ino'
+ * with permissions 'mode'.  Returns do_mkdir()'s result or -ENOMEM.
+ */
+int hostfs_mkdir(struct inode *ino, struct dentry *dentry, int mode)
+{
+       char *path;
+       int err;
+
+       path = inode_dentry_name(ino, dentry);
+       if(path == NULL)
+               return(-ENOMEM);
+
+       err = do_mkdir(path, mode);
+       kfree(path);
+       return(err);
+}
+
+/*
+ * Remove the host directory at the path of 'dentry' in directory 'ino'.
+ * Returns do_rmdir()'s result or -ENOMEM.
+ */
+int hostfs_rmdir(struct inode *ino, struct dentry *dentry)
+{
+       char *path;
+       int err;
+
+       path = inode_dentry_name(ino, dentry);
+       if(path == NULL)
+               return(-ENOMEM);
+
+       err = do_rmdir(path);
+       kfree(path);
+       return(err);
+}
+
+/*
+ * Create a special file on the host for 'dentry' and instantiate the
+ * dentry with a new inode describing it.  Returns 0 or a negative errno;
+ * the inode reference is dropped on every failure.
+ */
+int hostfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
+{
+       struct inode *inode;
+       char *path;
+       int err;
+
+       inode = iget(dir->i_sb, 0);
+       if(inode == NULL)
+               return(-ENOMEM);
+
+       err = init_inode(inode, dentry);
+       if(err)
+               goto fail;
+
+       path = dentry_name(dentry, 0);
+       if(path == NULL){
+               err = -ENOMEM;
+               goto fail;
+       }
+
+       init_special_inode(inode, mode, dev);
+       err = do_mknod(path, mode, dev);
+       if(!err)
+               err = read_name(inode, path);
+       kfree(path);
+       if(err)
+               goto fail;
+
+       d_instantiate(dentry, inode);
+       return(0);
+
+ fail:
+       iput(inode);
+       return(err);
+}
+
+/*
+ * Rename on the host: both paths are built from their parent inode and
+ * dentry name and handed to rename_file().  Returns its result, or
+ * -ENOMEM if either path could not be allocated.
+ */
+int hostfs_rename(struct inode *from_ino, struct dentry *from,
+                 struct inode *to_ino, struct dentry *to)
+{
+       char *src, *dst;
+       int err;
+
+       src = inode_dentry_name(from_ino, from);
+       if(src == NULL)
+               return(-ENOMEM);
+
+       dst = inode_dentry_name(to_ino, to);
+       if(dst == NULL){
+               err = -ENOMEM;
+               goto out_src;
+       }
+
+       err = rename_file(src, dst);
+       kfree(dst);
+ out_src:
+       kfree(src);
+       return(err);
+}
+
+/* Truncate hook: does nothing except call not_implemented(). */
+void hostfs_truncate(struct inode *ino)
+{
+       not_implemented();
+}
+
+/*
+ * Permission check: first asks the host (access_file()) whether the
+ * requested read/write/execute access is allowed on the inode's host
+ * path, and only if that succeeds applies vfs_permission().
+ * Returns 0, -ENOMEM, or the first failing check's error.
+ */
+int hostfs_permission(struct inode *ino, int desired, struct nameidata *nd)
+{
+       char *path;
+       int want_r, want_w, want_x, err;
+
+       want_r = (desired & MAY_READ) ? 1 : 0;
+       want_w = (desired & MAY_WRITE) ? 1 : 0;
+       want_x = (desired & MAY_EXEC) ? 1 : 0;
+
+       path = inode_name(ino, 0);
+       if(path == NULL)
+               return(-ENOMEM);
+
+       err = access_file(path, want_r, want_w, want_x);
+       kfree(path);
+       if(err)
+               return(err);
+
+       return(vfs_permission(ino, desired));
+}
+
+/*
+ * Apply attribute changes to the host file behind 'dentry'.
+ *
+ * The requested fields in 'attr' are translated into a struct
+ * hostfs_iattr, applied on the host with set_attr(), and only if that
+ * succeeds are they applied to the in-kernel inode with inode_setattr().
+ * Returns 0, -ENOMEM, or the error from set_attr()/inode_setattr().
+ */
+int hostfs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+       struct hostfs_iattr attrs;
+       char *name;
+       int err;
+       
+       /* With 'append' set, size changes are silently dropped */
+       if(append) 
+               attr->ia_valid &= ~ATTR_SIZE;
+
+       attrs.ia_valid = 0;
+       if(attr->ia_valid & ATTR_MODE){
+               attrs.ia_valid |= HOSTFS_ATTR_MODE;
+               attrs.ia_mode = attr->ia_mode;
+       }
+       if(attr->ia_valid & ATTR_UID){
+               /* On the root device a request for uid 0 is rewritten to
+                * the value returned by getuid() */
+               if((dentry->d_inode->i_sb->s_dev == ROOT_DEV) && 
+                  (attr->ia_uid == 0))
+                       attr->ia_uid = getuid();
+               attrs.ia_valid |= HOSTFS_ATTR_UID;
+               attrs.ia_uid = attr->ia_uid;
+       }
+       if(attr->ia_valid & ATTR_GID){
+               /* NOTE(review): the gid is rewritten with getuid(), not
+                * getgid() - looks like a copy of the uid case above;
+                * confirm which is intended */
+               if((dentry->d_inode->i_sb->s_dev == ROOT_DEV) && 
+                  (attr->ia_gid == 0))
+                       attr->ia_gid = getuid();
+               attrs.ia_valid |= HOSTFS_ATTR_GID;
+               attrs.ia_gid = attr->ia_gid;
+       }
+       if(attr->ia_valid & ATTR_SIZE){
+               attrs.ia_valid |= HOSTFS_ATTR_SIZE;
+               attrs.ia_size = attr->ia_size;
+       }
+       if(attr->ia_valid & ATTR_ATIME){
+               attrs.ia_valid |= HOSTFS_ATTR_ATIME;
+               attrs.ia_atime = attr->ia_atime;
+       }
+       if(attr->ia_valid & ATTR_MTIME){
+               attrs.ia_valid |= HOSTFS_ATTR_MTIME;
+               attrs.ia_mtime = attr->ia_mtime;
+       }
+       if(attr->ia_valid & ATTR_CTIME){
+               attrs.ia_valid |= HOSTFS_ATTR_CTIME;
+               attrs.ia_ctime = attr->ia_ctime;
+       }
+       /* The *_SET flags carry no value of their own, only the flag */
+       if(attr->ia_valid & ATTR_ATIME_SET){
+               attrs.ia_valid |= HOSTFS_ATTR_ATIME_SET;
+       }
+       if(attr->ia_valid & ATTR_MTIME_SET){
+               attrs.ia_valid |= HOSTFS_ATTR_MTIME_SET;
+       }
+       name = dentry_name(dentry, 0);
+       if(name == NULL) return(-ENOMEM);
+       err = set_attr(name, &attrs);
+       kfree(name);
+       if(err)
+               return(err);
+
+       return(inode_setattr(dentry->d_inode, attr));
+}
+
+/*
+ * Fill 'stat' from the in-kernel inode of 'dentry' using
+ * generic_fillattr().  Always returns 0.
+ */
+int hostfs_getattr(struct vfsmount *mnt, struct dentry *dentry, 
+          struct kstat *stat)
+{
+       generic_fillattr(dentry->d_inode, stat);
+       return(0);
+}
+
+/*
+ * Inode operations that init_inode() installs on inodes which are
+ * neither symlinks nor directories.  Same table as hostfs_dir_iops
+ * except that it has no .lookup.
+ */
+static struct inode_operations hostfs_iops = {
+       .create         = hostfs_create,
+       .link           = hostfs_link,
+       .unlink         = hostfs_unlink,
+       .symlink        = hostfs_symlink,
+       .mkdir          = hostfs_mkdir,
+       .rmdir          = hostfs_rmdir,
+       .mknod          = hostfs_mknod,
+       .rename         = hostfs_rename,
+       .truncate       = hostfs_truncate,
+       .permission     = hostfs_permission,
+       .setattr        = hostfs_setattr,
+       .getattr        = hostfs_getattr,
+};
+
+/*
+ * Inode operations that init_inode() installs on directory inodes;
+ * adds .lookup to the set used for other inodes.
+ */
+static struct inode_operations hostfs_dir_iops = {
+       .create         = hostfs_create,
+       .lookup         = hostfs_lookup,
+       .link           = hostfs_link,
+       .unlink         = hostfs_unlink,
+       .symlink        = hostfs_symlink,
+       .mkdir          = hostfs_mkdir,
+       .rmdir          = hostfs_rmdir,
+       .mknod          = hostfs_mknod,
+       .rename         = hostfs_rename,
+       .truncate       = hostfs_truncate,
+       .permission     = hostfs_permission,
+       .setattr        = hostfs_setattr,
+       .getattr        = hostfs_getattr,
+};
+
+/*
+ * readpage for symlink inodes: reads the host link target into 'page'.
+ *
+ * The page is mapped with kmap() for the duration of the call and is
+ * always unmapped and unlocked before returning, including when the
+ * host path cannot be allocated.
+ *
+ * Returns 0 when the target was read, -E2BIG when do_readlink() filled
+ * the whole PAGE_CACHE_SIZE buffer, -ENOMEM on allocation failure, or
+ * the (non-positive) value returned by do_readlink().
+ */
+int hostfs_link_readpage(struct file *file, struct page *page)
+{
+       char *buffer, *name;
+       int err;
+
+       buffer = kmap(page);
+       name = inode_name(page->mapping->host, 0);
+       if(name == NULL){
+               err = -ENOMEM;
+               goto out;
+       }
+       err = do_readlink(name, buffer, PAGE_CACHE_SIZE);
+       kfree(name);
+       if(err == PAGE_CACHE_SIZE)
+               err = -E2BIG;
+       else if(err > 0){
+               flush_dcache_page(page);
+               SetPageUptodate(page);
+               if (PageError(page)) ClearPageError(page);
+               err = 0;
+       }
+ out:
+       kunmap(page);
+       unlock_page(page);
+       return(err);
+}
+
+/* Address-space operations that init_inode() installs on symlink inodes */
+static struct address_space_operations hostfs_link_aops = {
+       .readpage       = hostfs_link_readpage,
+};
+
+/*
+ * Fill in a hostfs superblock.
+ *
+ * 'd' is the mount data string naming the host directory to expose; when
+ * it is NULL or empty, root_ino is used instead.  A copy of that string
+ * is stored as the root inode's host_filename.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
+{
+       struct inode *root_inode;
+       char *name, *data = d;
+       int err;
+
+       sb->s_blocksize = 1024;
+       sb->s_blocksize_bits = 10;
+       sb->s_magic = HOSTFS_SUPER_MAGIC;
+       sb->s_op = &hostfs_sbops;
+
+       if((data == NULL) || (*data == '\0')) 
+               data = root_ino;
+
+       err = -ENOMEM;
+       name = kmalloc(strlen(data) + 1, GFP_KERNEL);
+       if(name == NULL) 
+               goto out;
+
+       strcpy(name, data);
+
+       root_inode = iget(sb, 0);
+       if(root_inode == NULL)
+               goto out_free;
+
+       err = init_inode(root_inode, NULL);
+       if(err)
+               goto out_put;
+
+       HOSTFS_I(root_inode)->host_filename = name;
+
+       err = -ENOMEM;
+       sb->s_root = d_alloc_root(root_inode);
+       if(sb->s_root == NULL)
+               goto out_put;
+
+       err = read_inode(root_inode);
+       if(err){
+               /* The root dentry now holds the inode reference, so it is
+                * released through dput() rather than iput().  s_root is
+                * cleared so superblock teardown does not dput() it a
+                * second time.
+                */
+               dput(sb->s_root);
+               sb->s_root = NULL;
+               goto out_free;
+       }
+
+       return(0);
+
+ out_put:
+       iput(root_inode);
+ out_free:
+       /* NOTE(review): 'name' may already be stored in host_filename when
+        * we get here; confirm inode teardown does not free it as well.
+        */
+       kfree(name);
+ out:
+       return(err);
+}
+
+/*
+ * get_sb hook: delegates to get_sb_nodev() with hostfs_fill_sb_common()
+ * as the fill callback.  'dev_name' is not used.
+ */
+static struct super_block *hostfs_read_sb(struct file_system_type *type,
+                                            int flags, const char *dev_name,
+                                            void *data)
+{
+       return(get_sb_nodev(type, flags, data, hostfs_fill_sb_common));
+}
+
+/* Filesystem type descriptor registered under the name "hostfs" */
+static struct file_system_type hostfs_type = {
+       .owner          = THIS_MODULE,
+       .name           = "hostfs",
+       .get_sb         = hostfs_read_sb,
+       .kill_sb        = kill_anon_super,
+       .fs_flags       = 0,
+};
+
+/* Module init: registers hostfs_type; returns register_filesystem()'s result */
+static int __init init_hostfs(void)
+{
+       return(register_filesystem(&hostfs_type));
+}
+
+/* Module exit: unregisters hostfs_type */
+static void __exit exit_hostfs(void)
+{
+       unregister_filesystem(&hostfs_type);
+}
+
+/* Hook the init/exit functions into the module machinery */
+module_init(init_hostfs)
+module_exit(exit_hostfs)
+MODULE_LICENSE("GPL");
+
+/*
+ * Overrides for Emacs so that we follow Linus's tabbing style.
+ * Emacs will notice this stuff at the end of the file and automatically
+ * adjust the settings for this buffer only.  This must remain at the end
+ * of the file.
+ * ---------------------------------------------------------------------------
+ * Local variables:
+ * c-file-style: "linux"
+ * End:
+ */
diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c
new file mode 100644 (file)
index 0000000..c406266
--- /dev/null
@@ -0,0 +1,361 @@
+/* 
+ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
+ * Licensed under the GPL
+ */
+
+#include <unistd.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <dirent.h>
+#include <errno.h>
+#include <utime.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/vfs.h>
+#include "hostfs.h"
+#include "kern_util.h"
+#include "user.h"
+
+/*
+ * lstat64() the host file 'path' and copy the requested fields out.
+ * Every output pointer is optional: a NULL pointer means the caller does
+ * not want that field.  Timestamps are reported with whole-second
+ * resolution (tv_nsec is always set to 0).
+ *
+ * Returns 0 on success or -errno from lstat64().
+ */
+int stat_file(const char *path, unsigned long long *inode_out, int *mode_out,
+             int *nlink_out, int *uid_out, int *gid_out, 
+             unsigned long long *size_out, struct timespec *atime_out,
+             struct timespec *mtime_out, struct timespec *ctime_out,
+             int *blksize_out, unsigned long long *blocks_out)
+{
+       struct stat64 st;
+
+       if(lstat64(path, &st) < 0)
+               return(-errno);
+
+       /* See the Makefile for why STAT64_INO_FIELD is passed in
+        * by the build
+        */
+       if(inode_out != NULL)
+               *inode_out = st.STAT64_INO_FIELD;
+       if(mode_out != NULL)
+               *mode_out = st.st_mode;
+       if(nlink_out != NULL)
+               *nlink_out = st.st_nlink;
+       if(uid_out != NULL)
+               *uid_out = st.st_uid;
+       if(gid_out != NULL)
+               *gid_out = st.st_gid;
+       if(size_out != NULL)
+               *size_out = st.st_size;
+
+       if(atime_out != NULL){
+               atime_out->tv_sec = st.st_atime;
+               atime_out->tv_nsec = 0;
+       }
+       if(mtime_out != NULL){
+               mtime_out->tv_sec = st.st_mtime;
+               mtime_out->tv_nsec = 0;
+       }
+       if(ctime_out != NULL){
+               ctime_out->tv_sec = st.st_ctime;
+               ctime_out->tv_nsec = 0;
+       }
+
+       if(blksize_out != NULL)
+               *blksize_out = st.st_blksize;
+       if(blocks_out != NULL)
+               *blocks_out = st.st_blocks;
+       return(0);
+}
+
+/*
+ * Classify the host file 'path' (without following a final symlink).
+ * If 'rdev' is non-NULL it receives st_rdev.
+ *
+ * Returns one of the OS_TYPE_* values, or -errno if lstat64() fails.
+ */
+int file_type(const char *path, int *rdev)
+{
+       struct stat64 st;
+
+       if(lstat64(path, &st) < 0)
+               return(-errno);
+
+       if(rdev != NULL)
+               *rdev = st.st_rdev;
+
+       switch(st.st_mode & S_IFMT){
+       case S_IFDIR:
+               return(OS_TYPE_DIR);
+       case S_IFLNK:
+               return(OS_TYPE_SYMLINK);
+       case S_IFCHR:
+               return(OS_TYPE_CHARDEV);
+       case S_IFBLK:
+               return(OS_TYPE_BLOCKDEV);
+       case S_IFIFO:
+               return(OS_TYPE_FIFO);
+       case S_IFSOCK:
+               return(OS_TYPE_SOCK);
+       default:
+               return(OS_TYPE_FILE);
+       }
+}
+
+/*
+ * Check with access() whether 'path' may be read, written and/or
+ * executed; each of r, w, x selects the corresponding check when
+ * non-zero.  Returns 0 if allowed, -errno otherwise.
+ */
+int access_file(char *path, int r, int w, int x)
+{
+       int mode;
+
+       mode = (r ? R_OK : 0) | (w ? W_OK : 0) | (x ? X_OK : 0);
+       return((access(path, mode) != 0) ? -errno : 0);
+}
+
+/*
+ * Open the existing host file 'path' for reading and/or writing,
+ * optionally with O_APPEND.  Asking for neither read nor write is a
+ * panic.  Returns the descriptor, or -errno on failure.
+ */
+int open_file(char *path, int r, int w, int append)
+{
+       int flags = 0, fd;
+
+       if(r)
+               flags = w ? O_RDWR : O_RDONLY;
+       else if(w)
+               flags = O_WRONLY;
+       else panic("Impossible mode in open_file");
+
+       if(append)
+               flags |= O_APPEND;
+
+       fd = open64(path, flags);
+       return((fd < 0) ? -errno : fd);
+}
+
+/*
+ * Open the host directory 'path'.  Returns the DIR stream as an opaque
+ * pointer, or NULL on failure.  *err_out receives the current value of
+ * errno (positive) in both cases, so it is only meaningful when NULL is
+ * returned.
+ */
+void *open_dir(char *path, int *err_out)
+{
+       DIR *stream = opendir(path);
+
+       *err_out = errno;
+       return(stream);
+}
+
+/*
+ * Read the directory entry at position *pos from 'stream' (a DIR *
+ * obtained from open_dir()).  On success returns the entry name and
+ * stores its length in *len_out, its inode number in *ino_out and the
+ * position of the following entry in *pos.  Returns NULL when readdir()
+ * yields nothing, leaving the outputs untouched.
+ *
+ * The returned name points into the dirent returned by readdir().
+ */
+char *read_dir(void *stream, unsigned long long *pos, 
+              unsigned long long *ino_out, int *len_out)
+{
+       DIR *dir = stream;
+       struct dirent *entry;
+
+       seekdir(dir, *pos);
+       entry = readdir(dir);
+       if(entry == NULL)
+               return(NULL);
+
+       *len_out = strlen(entry->d_name);
+       *ino_out = entry->d_ino;
+       *pos = telldir(dir);
+       return(entry->d_name);
+}
+
+/*
+ * pread64() up to 'len' bytes from 'fd' at *offset into 'buf'.
+ * On success returns the byte count and advances *offset by it;
+ * on failure returns -errno and leaves *offset alone.
+ */
+int read_file(int fd, unsigned long long *offset, char *buf, int len)
+{
+       int got = pread64(fd, buf, len, *offset);
+
+       if(got < 0)
+               return(-errno);
+
+       *offset += got;
+       return(got);
+}
+
+/*
+ * pwrite64() 'len' bytes from 'buf' to 'fd' at *offset.
+ * On success returns the byte count and advances *offset by it;
+ * on failure returns -errno and leaves *offset alone.
+ */
+int write_file(int fd, unsigned long long *offset, const char *buf, int len)
+{
+       int put = pwrite64(fd, buf, len, *offset);
+
+       if(put < 0)
+               return(-errno);
+
+       *offset += put;
+       return(put);
+}
+
+/*
+ * Reposition 'fd' with lseek64().  Returns 0 on success or -errno.
+ *
+ * The result of lseek64() is kept in a 64-bit variable: storing it in an
+ * int would truncate offsets of 2GB and above, which can come out
+ * negative and be misreported as a failure.
+ */
+int lseek_file(int fd, long long offset, int whence)
+{
+       long long ret;
+
+       ret = lseek64(fd, offset, whence);
+       if(ret < 0) return(-errno);
+       return(0);
+}
+
+/*
+ * Close a host file.  'stream' points at the int holding the descriptor,
+ * not at the descriptor value itself.
+ */
+void close_file(void *stream)
+{
+       int *fd_ptr = stream;
+
+       close(*fd_ptr);
+}
+
+/* Close a directory stream; 'stream' is passed straight to closedir() */
+void close_dir(void *stream)
+{
+       closedir(stream);
+}
+
+/*
+ * Create (if necessary) and open 'name' read/write on the host.  The
+ * nine flags select the user/group/other read/write/execute permission
+ * bits for a newly created file; any non-zero value turns the bit on.
+ * Returns the descriptor, or -errno on failure.
+ */
+int file_create(char *name, int ur, int uw, int ux, int gr, 
+               int gw, int gx, int or, int ow, int ox)
+{
+       int perms = 0, fd;
+
+       if(ur) perms |= S_IRUSR;
+       if(uw) perms |= S_IWUSR;
+       if(ux) perms |= S_IXUSR;
+       if(gr) perms |= S_IRGRP;
+       if(gw) perms |= S_IWGRP;
+       if(gx) perms |= S_IXGRP;
+       if(or) perms |= S_IROTH;
+       if(ow) perms |= S_IWOTH;
+       if(ox) perms |= S_IXOTH;
+
+       fd = open64(name, O_CREAT | O_RDWR, perms);
+       return((fd < 0) ? -errno : fd);
+}
+
+/*
+ * Apply the attributes selected in attrs->ia_valid to the host file
+ * 'file', one host call per attribute, stopping at the first failure.
+ *
+ * Returns 0 on success, -errno from the failing host call, or the error
+ * returned by stat_file().  Attributes applied before a failure are not
+ * rolled back.
+ */
+int set_attr(const char *file, struct hostfs_iattr *attrs)
+{
+       struct utimbuf buf;
+       int err, ma;
+
+       if(attrs->ia_valid & HOSTFS_ATTR_MODE){
+               if(chmod(file, attrs->ia_mode) != 0) return(-errno);
+       }
+       /* uid and gid are changed by separate chown() calls, passing -1
+        * for the id that is not being set */
+       if(attrs->ia_valid & HOSTFS_ATTR_UID){
+               if(chown(file, attrs->ia_uid, -1)) return(-errno);
+       }
+       if(attrs->ia_valid & HOSTFS_ATTR_GID){
+               if(chown(file, -1, attrs->ia_gid)) return(-errno);
+       }
+       if(attrs->ia_valid & HOSTFS_ATTR_SIZE){
+               if(truncate(file, attrs->ia_size)) return(-errno);
+       }
+       ma = HOSTFS_ATTR_ATIME_SET | HOSTFS_ATTR_MTIME_SET;
+       if((attrs->ia_valid & ma) == ma){
+               /* Both times given explicitly: one utime() call */
+               buf.actime = attrs->ia_atime.tv_sec;
+               buf.modtime = attrs->ia_mtime.tv_sec;
+               if(utime(file, &buf) != 0) return(-errno);
+       }
+       else {
+               struct timespec ts;
+
+               /* utime() always writes both times, so when only one is
+                * being set the other is first read back from the host
+                * (into ts) and written unchanged */
+               if(attrs->ia_valid & HOSTFS_ATTR_ATIME_SET){
+                       err = stat_file(file, NULL, NULL, NULL, NULL, NULL, 
+                                       NULL, NULL, &ts, NULL, NULL, NULL);
+                       if(err != 0) 
+                               return(err);
+                       buf.actime = attrs->ia_atime.tv_sec;
+                       buf.modtime = ts.tv_sec;
+                       if(utime(file, &buf) != 0) 
+                               return(-errno);
+               }
+               if(attrs->ia_valid & HOSTFS_ATTR_MTIME_SET){
+                       err = stat_file(file, NULL, NULL, NULL, NULL, NULL, 
+                                       NULL, &ts, NULL, NULL, NULL, NULL);
+                       if(err != 0) 
+                               return(err);
+                       buf.actime = ts.tv_sec;
+                       buf.modtime = attrs->ia_mtime.tv_sec;
+                       if(utime(file, &buf) != 0) 
+                               return(-errno);
+               }
+       }
+       /* HOSTFS_ATTR_CTIME is tested but nothing is done for it */
+       if(attrs->ia_valid & HOSTFS_ATTR_CTIME) ;
+       /* When ATIME or MTIME was requested, read the host's current
+        * atime and mtime back into attrs */
+       if(attrs->ia_valid & (HOSTFS_ATTR_ATIME | HOSTFS_ATTR_MTIME)){
+               err = stat_file(file, NULL, NULL, NULL, NULL, NULL, NULL, 
+                               &attrs->ia_atime, &attrs->ia_mtime, NULL, 
+                               NULL, NULL);
+               if(err != 0) return(err);
+       }
+       return(0);
+}
+
+/*
+ * Create the symlink 'from' on the host with contents 'to'.
+ * Returns 0 or -errno.
+ */
+int make_symlink(const char *from, const char *to)
+{
+       if(symlink(to, from))
+               return(-errno);
+       return(0);
+}
+
+/* unlink() the host file 'file'.  Returns 0 or -errno. */
+int unlink_file(const char *file)
+{
+       if(unlink(file))
+               return(-errno);
+       return(0);
+}
+
+/* mkdir() 'file' on the host with mode 'mode'.  Returns 0 or -errno. */
+int do_mkdir(const char *file, int mode)
+{
+       if(mkdir(file, mode))
+               return(-errno);
+       return(0);
+}
+
+/* rmdir() 'file' on the host.  Returns 0 or -errno. */
+int do_rmdir(const char *file)
+{
+       if(rmdir(file))
+               return(-errno);
+       return(0);
+}
+
+/*
+ * mknod() 'file' on the host with the given mode and device number.
+ * Returns 0 or -errno.
+ */
+int do_mknod(const char *file, int mode, int dev)
+{
+       if(mknod(file, mode, dev))
+               return(-errno);
+       return(0);
+}
+
+/*
+ * Hard-link on the host: 'to' is the existing file, 'from' is the new
+ * name (the arguments are passed to link() in that order).
+ * Returns 0 or -errno.
+ */
+int link_file(const char *to, const char *from)
+{
+       if(link(to, from))
+               return(-errno);
+       return(0);
+}
+
+/*
+ * Read the target of symlink 'file' into 'buf' (capacity 'size').
+ * Returns the number of bytes stored, or -errno.  The result is
+ * NUL-terminated only when it is shorter than 'size'; a return value
+ * equal to 'size' means the buffer was filled with no terminator.
+ */
+int do_readlink(char *file, char *buf, int size)
+{
+       int used = readlink(file, buf, size);
+
+       if(used < 0)
+               return(-errno);
+
+       if(used < size)
+               buf[used] = '\0';
+       return(used);
+}
+
+/* rename() 'from' to 'to' on the host.  Returns 0 or -errno. */
+int rename_file(char *from, char *to)
+{
+       if(rename(from, to) < 0)
+               return(-errno);
+       return(0);
+}
+
+/*
+ * statfs64() the host filesystem containing 'root' and copy the result
+ * into the caller's output variables.
+ *
+ * fsid_out receives at most fsid_size bytes of f_fsid.  spare_out must
+ * have room for six longs.
+ *
+ * Returns 0 on success or -errno; on failure no output is written.
+ */
+int do_statfs(char *root, long *bsize_out, long long *blocks_out, 
+             long long *bfree_out, long long *bavail_out, 
+             long long *files_out, long long *ffree_out,
+             void *fsid_out, int fsid_size, long *namelen_out, 
+             long *spare_out)
+{
+       struct statfs64 buf;
+       int err;
+
+       err = statfs64(root, &buf);
+       if(err < 0) return(-errno);
+       *bsize_out = buf.f_bsize;
+       *blocks_out = buf.f_blocks;
+       *bfree_out = buf.f_bfree;
+       *bavail_out = buf.f_bavail;
+       *files_out = buf.f_files;
+       *ffree_out = buf.f_ffree;
+       /* Copy the smaller of the caller's buffer and the host's f_fsid */
+       memcpy(fsid_out, &buf.f_fsid, 
+              sizeof(buf.f_fsid) > fsid_size ? fsid_size : 
+              sizeof(buf.f_fsid));
+       *namelen_out = buf.f_namelen;
+       /* NOTE(review): this reads f_spare[0..5]; confirm the host libc's
+        * struct statfs64 really has at least six f_spare elements */
+       spare_out[0] = buf.f_spare[0];
+       spare_out[1] = buf.f_spare[1];
+       spare_out[2] = buf.f_spare[2];
+       spare_out[3] = buf.f_spare[3];
+       spare_out[4] = buf.f_spare[4];
+       spare_out[5] = buf.f_spare[5];
+       return(0);
+}
+
+/*
+ * Overrides for Emacs so that we follow Linus's tabbing style.
+ * Emacs will notice this stuff at the end of the file and automatically
+ * adjust the settings for this buffer only.  This must remain at the end
+ * of the file.
+ * ---------------------------------------------------------------------------
+ * Local variables:
+ * c-file-style: "linux"
+ * End:
+ */
diff --git a/fs/hppfs/Makefile b/fs/hppfs/Makefile
new file mode 100644 (file)
index 0000000..e67f038
--- /dev/null
@@ -0,0 +1,19 @@
+# 
+# Copyright (C) 2002, 2003 Jeff Dike (jdike@karaya.com)
+# Licensed under the GPL
+#
+
+# hppfs.o is built from a single object
+hppfs-objs := hppfs_kern.o
+
+# Start with an empty obj-y; hppfs.o goes onto the obj-$(CONFIG_HPPFS) list
+obj-y = 
+obj-$(CONFIG_HPPFS) += hppfs.o
+
+# The targets below have no recipes
+clean:
+
+modules:
+
+fastdep:
+
+dep:
+
+# archmrproper only depends on clean
+archmrproper: clean
diff --git a/fs/hppfs/hppfs_kern.c b/fs/hppfs/hppfs_kern.c
new file mode 100644 (file)
index 0000000..ebf08cb
--- /dev/null
@@ -0,0 +1,811 @@
+/* 
+ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
+ * Licensed under the GPL
+ */
+
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/dcache.h>
+#include <linux/statfs.h>
+#include <asm/uaccess.h>
+#include <asm/fcntl.h>
+#include "os.h"
+
+static int init_inode(struct inode *inode, struct dentry *dentry);
+
+/*
+ * One chunk of buffered file contents.  Chunks are chained through
+ * 'list'; 'contents' is sized so that the list node plus the payload add
+ * up to PAGE_SIZE bytes.
+ */
+struct hppfs_data {
+       struct list_head list;
+       char contents[PAGE_SIZE - sizeof(struct list_head)];
+};
+
+/*
+ * Per-open-file state, kept in file->private_data.  hppfs_read() serves
+ * data from 'contents' when it is non-NULL, otherwise from 'host_fd'
+ * when that is not -1, otherwise from 'proc_file'.
+ */
+struct hppfs_private {
+       struct file proc_file;          /* file read/written via read_proc()/hppfs_write() */
+       int host_fd;                    /* host descriptor, or -1 */
+       loff_t len;                     /* end-of-data position when 'contents' is in use */
+       struct hppfs_data *contents;    /* first chunk of buffered data, or NULL */
+};
+
+/*
+ * hppfs inode: the VFS inode is embedded in 'vfs_inode', and
+ * 'proc_dentry' is the dentry whose inode supplies the attributes copied
+ * by hppfs_read_inode().
+ */
+struct hppfs_inode_info {
+        struct dentry *proc_dentry;
+       struct inode vfs_inode;
+};
+
+/*
+ * Map a VFS inode back to the hppfs_inode_info that embeds it
+ * (list_entry is used here purely as a container-of operation).
+ */
+static inline struct hppfs_inode_info *HPPFS_I(struct inode *inode)
+{
+       return(list_entry(inode, struct hppfs_inode_info, vfs_inode));
+}
+
+#define HPPFS_SUPER_MAGIC 0xb00000ee
+
+static struct super_operations hppfs_sbops;
+
+/*
+ * Returns 1 if 'dentry' belongs to an hppfs superblock, sits directly
+ * under that superblock's root, and has a name made up only of decimal
+ * digits; returns 0 otherwise.
+ */
+static int is_pid(struct dentry *dentry)
+{
+       struct super_block *sb = dentry->d_sb;
+       int i;
+
+       if(sb->s_op != &hppfs_sbops)
+               return(0);
+       if(dentry->d_parent != sb->s_root)
+               return(0);
+
+       i = 0;
+       while(i < dentry->d_name.len){
+               if(!isdigit(dentry->d_name.name[i]))
+                       return(0);
+               i++;
+       }
+       return(1);
+}
+
+/*
+ * Build the path "proc/<seg>/<seg>/..." for 'dentry', where each
+ * component for which is_pid() is true is replaced by the literal "pid".
+ *
+ * The buffer is kmalloc'ed with 'extra' spare bytes beyond the string
+ * and its terminator, so the caller can append to it.  Returns NULL on
+ * allocation failure.
+ */
+static char *dentry_name(struct dentry *dentry, int extra)
+{
+       struct dentry *parent;
+       char *root, *name;
+       const char *seg_name;
+       int len, seg_len;
+
+       /* First pass: add up the length of every "/<component>" up to
+        * (not including) the dentry that is its own parent */
+       len = 0;
+       parent = dentry;
+       while(parent->d_parent != parent){
+               if(is_pid(parent))
+                       len += strlen("pid") + 1;
+               else len += parent->d_name.len + 1;
+               parent = parent->d_parent;
+       }
+       
+       root = "proc";
+       len += strlen(root);
+       name = kmalloc(len + extra + 1, GFP_KERNEL);
+       if(name == NULL) return(NULL);
+
+       /* Second pass: terminate the string, then fill the components in
+        * from the end of the buffer towards the front */
+       name[len] = '\0';
+       parent = dentry;
+       while(parent->d_parent != parent){
+               if(is_pid(parent)){
+                       seg_name = "pid";
+                       seg_len = strlen("pid");
+               }
+               else {
+                       seg_name = parent->d_name.name;
+                       seg_len = parent->d_name.len;
+               }
+
+               len -= seg_len + 1;
+               name[len] = '/';
+               strncpy(&name[len + 1], seg_name, seg_len);
+               parent = parent->d_parent;
+       }
+       /* Exactly strlen(root) bytes are copied, so no NUL is written
+        * over the '/' that follows the prefix */
+       strncpy(name, root, strlen(root));
+       return(name);
+}
+
+/* Dentry operations for hppfs; no hooks are set */
+struct dentry_operations hppfs_dentry_ops = {
+};
+
+/*
+ * Check whether a "remove" marker can be opened on the host at
+ * "<path of dentry>[/<file>]/remove".
+ *
+ * Returns 1 if the marker could be opened, 0 if not, or -ENOMEM if the
+ * path could not be allocated.  Any non-negative descriptor counts as a
+ * successful open (0 is a valid descriptor) and is closed again before
+ * returning.
+ */
+static int file_removed(struct dentry *dentry, const char *file)
+{
+       char *host_file;
+       int extra, fd;
+
+       extra = 0;
+       if(file != NULL) extra += strlen(file) + 1;
+
+       host_file = dentry_name(dentry, extra + strlen("/remove"));
+       if(host_file == NULL){
+               printk("file_removed : allocation failed\n");
+               return(-ENOMEM);
+       }
+
+       if(file != NULL){
+               strcat(host_file, "/");
+               strcat(host_file, file);
+       }
+       strcat(host_file, "/remove");
+
+       fd = os_open_file(host_file, of_read(OPENFLAGS()), 0);
+       kfree(host_file);
+       if(fd >= 0){
+               os_close_file(fd);
+               return(1);
+       }
+       return(0);
+}
+
+/*
+ * Copy ownership, timestamps, inode number, mode, link count and size
+ * information from the inode of the associated proc dentry into 'ino'.
+ * Does nothing if the inode has no proc dentry.
+ */
+static void hppfs_read_inode(struct inode *ino)
+{
+       struct dentry *proc_dentry = HPPFS_I(ino)->proc_dentry;
+       struct inode *src;
+
+       if(proc_dentry == NULL)
+               return;
+
+       src = proc_dentry->d_inode;
+
+       ino->i_uid = src->i_uid;
+       ino->i_gid = src->i_gid;
+
+       ino->i_atime = src->i_atime;
+       ino->i_mtime = src->i_mtime;
+       ino->i_ctime = src->i_ctime;
+
+       ino->i_ino = src->i_ino;
+       ino->i_mode = src->i_mode;
+       ino->i_nlink = src->i_nlink;
+
+       ino->i_size = src->i_size;
+       ino->i_blksize = src->i_blksize;
+       ino->i_blocks = src->i_blocks;
+}
+
+/*
+ * Look up 'dentry' in the hppfs directory 'ino'.
+ *
+ * A name that has a host-side "remove" marker (see file_removed()) is
+ * reported as -ENOENT.  Otherwise the name is looked up under the
+ * parent's proc dentry, and a new hppfs inode is created around the
+ * result.
+ *
+ * Returns NULL once the dentry has been added, or an ERR_PTR on failure.
+ */
+static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry, 
+                                  struct nameidata *nd)
+{
+       struct dentry *proc_dentry, *new, *parent;
+       struct inode *inode;
+       int err, deleted;
+
+       deleted = file_removed(dentry, NULL);
+       if(deleted < 0)
+               return(ERR_PTR(deleted));
+       else if(deleted)
+               return(ERR_PTR(-ENOENT));
+
+       err = -ENOMEM;
+       parent = HPPFS_I(ino)->proc_dentry;
+       /* The parent proc inode's i_sem is held across the lookup */
+       down(&parent->d_inode->i_sem);
+       proc_dentry = d_lookup(parent, &dentry->d_name);
+       if(proc_dentry == NULL){
+               /* Not in the dcache: allocate a dentry and call the
+                * parent inode's own lookup method on it */
+               proc_dentry = d_alloc(parent, &dentry->d_name);
+               if(proc_dentry == NULL){
+                       up(&parent->d_inode->i_sem);
+                       goto out;
+               }
+               new = (*parent->d_inode->i_op->lookup)(parent->d_inode, 
+                                                      proc_dentry, NULL);
+               /* A non-NULL result (which may be an ERR_PTR, checked
+                * below) replaces the dentry allocated above */
+               if(new){
+                       dput(proc_dentry);
+                       proc_dentry = new;
+               }
+       }
+       up(&parent->d_inode->i_sem);
+
+       if(IS_ERR(proc_dentry))
+               return(proc_dentry);
+
+       /* err is still -ENOMEM here */
+       inode = iget(ino->i_sb, 0);
+       if(inode == NULL) 
+               goto out_dput;
+
+       err = init_inode(inode, proc_dentry);
+       if(err) 
+               goto out_put;
+       
+       hppfs_read_inode(inode);
+
+       d_add(dentry, inode);
+       dentry->d_op = &hppfs_dentry_ops;
+       return(NULL);
+
+ out_put:
+       iput(inode);
+ out_dput:
+       dput(proc_dentry);
+ out:
+       return(ERR_PTR(err));
+}
+
+/* Inode operations for hppfs files; no hooks are set */
+static struct inode_operations hppfs_file_iops = {
+};
+
+/*
+ * Read from 'file' by calling the read method of its inode's file
+ * operations, using and updating file->f_pos as the position.
+ *
+ * When 'is_user' is zero, 'buf' is a kernel buffer: the address limit is
+ * switched to KERNEL_DS around the call and set to USER_DS afterwards
+ * (the previous limit is not saved).  If 'ppos' is non-NULL it receives
+ * the file position after the read.
+ *
+ * Returns whatever the underlying read method returns.
+ */
+static ssize_t read_proc(struct file *file, char *buf, ssize_t count, 
+                        loff_t *ppos, int is_user)
+{
+       ssize_t (*read)(struct file *, char *, size_t, loff_t *);
+       ssize_t n;
+
+       read = file->f_dentry->d_inode->i_fop->read;
+
+       if(!is_user)
+               set_fs(KERNEL_DS);
+               
+       n = (*read)(file, buf, count, &file->f_pos);
+
+       if(!is_user)
+               set_fs(USER_DS);
+
+       if(ppos) *ppos = file->f_pos;
+       return(n);
+}
+
+/*
+ * Read up to 'count' bytes from host descriptor 'fd' into the user
+ * buffer 'buf', going through a PAGE_SIZE kernel bounce buffer.
+ *
+ * Each chunk is copied to the position in 'buf' following the previous
+ * chunk.  Reading stops early at end of file.
+ *
+ * Returns the number of bytes copied, -ENOMEM if the bounce buffer
+ * cannot be allocated, -EFAULT if a copy to user space fails, or the
+ * error from os_read_file().
+ */
+static ssize_t hppfs_read_file(int fd, char *buf, ssize_t count)
+{
+       ssize_t n;
+       int cur, err;
+       char *new_buf;
+
+       n = -ENOMEM;
+       new_buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+       if(new_buf == NULL){
+               printk("hppfs_read_file : kmalloc failed\n");
+               goto out;
+       }
+       n = 0;
+       while(count > 0){
+               cur = min_t(ssize_t, count, PAGE_SIZE);
+               err = os_read_file(fd, new_buf, cur);
+               if(err < 0){
+                       /* Report the error code, not the remaining count */
+                       printk("hppfs_read : read failed, errno = %d\n",
+                              err);
+                       n = err;
+                       goto out_free;
+               }
+               else if(err == 0)
+                       break;
+
+               if(copy_to_user(buf, new_buf, err)){
+                       n = -EFAULT;
+                       goto out_free;
+               }
+               /* Advance the destination so the next chunk does not
+                * overwrite this one */
+               buf += err;
+               n += err;
+               count -= err;
+       }
+ out_free:
+       kfree(new_buf);
+ out:
+       return(n);
+}
+
+/*
+ * read() for hppfs files.  Data comes from one of three places:
+ *  - the buffered chunk list in hppfs->contents, if present;
+ *  - the host descriptor hppfs->host_fd, if it is not -1;
+ *  - otherwise the file in hppfs->proc_file.
+ *
+ * Returns the number of bytes read (possibly fewer than requested), 0 at
+ * end of data, or a negative errno.  *ppos is advanced only by the
+ * number of bytes actually delivered.
+ */
+static ssize_t hppfs_read(struct file *file, char *buf, size_t count, 
+                         loff_t *ppos)
+{
+       struct hppfs_private *hppfs = file->private_data;
+       struct hppfs_data *data;
+       loff_t off;
+       ssize_t n;
+       int err;
+
+       if(hppfs->contents != NULL){
+               if(*ppos >= hppfs->len) return(0);
+
+               /* Never hand out more than is left before hppfs->len */
+               if(count > hppfs->len - *ppos)
+                       count = hppfs->len - *ppos;
+
+               /* Walk to the chunk holding *ppos; 'off' ends up as the
+                * offset inside that chunk */
+               data = hppfs->contents;
+               off = *ppos;
+               while(off >= sizeof(data->contents)){
+                       data = list_entry(data->list.next, struct hppfs_data,
+                                         list);
+                       off -= sizeof(data->contents);
+               }
+
+               /* One call copies from a single chunk only; a request
+                * crossing a chunk boundary gets a short read */
+               if(count > sizeof(data->contents) - off)
+                       count = sizeof(data->contents) - off;
+
+               if(copy_to_user(buf, &data->contents[off], count))
+                       return(-EFAULT);
+               *ppos += count;
+               n = count;
+       }
+       else if(hppfs->host_fd != -1){
+               err = os_seek_file(hppfs->host_fd, *ppos);
+               if(err){
+                       printk("hppfs_read : seek failed, errno = %d\n", err);
+                       return(err);
+               }
+               /* Keep the result signed so that an error is not added to
+                * the file position */
+               n = hppfs_read_file(hppfs->host_fd, buf, count);
+               if(n > 0)
+                       *ppos += n;
+       }
+       else n = read_proc(&hppfs->proc_file, buf, count, ppos, 1);
+
+       return(n);
+}
+
+/*
+ * write() for hppfs files: forwards the write to the write method of the
+ * file held in the private data, keeping the two file positions in step.
+ * Returns whatever that write method returns.
+ */
+static ssize_t hppfs_write(struct file *file, const char *buf, size_t len, 
+                          loff_t *ppos)
+{
+       struct hppfs_private *priv = file->private_data;
+       struct file *target = &priv->proc_file;
+       ssize_t (*proc_write)(struct file *, const char *, size_t, loff_t *);
+       int ret;
+
+       proc_write = target->f_dentry->d_inode->i_fop->write;
+
+       /* Start where the hppfs file is, then copy the new position back */
+       target->f_pos = file->f_pos;
+       ret = (*proc_write)(target, buf, len, &target->f_pos);
+       file->f_pos = target->f_pos;
+
+       return(ret);
+}
+
+/*
+ * Connect to the host socket for a directory: "<host_file>/rw" is tried
+ * first (*filter_out = 1), then "<host_file>/r" (*filter_out = 0).  The
+ * suffix is written in place, so host_file needs room for "/rw" after
+ * its current contents.  Returns whatever os_connect_socket() returned
+ * for the attempt that was used.
+ */
+static int open_host_sock(char *host_file, int *filter_out)
+{
+       char *suffix = host_file + strlen(host_file);
+       int sock;
+
+       *filter_out = 1;
+       strcpy(suffix, "/rw");
+       sock = os_connect_socket(host_file);
+       if(sock <= 0){
+               *filter_out = 0;
+               strcpy(suffix, "/r");
+               sock = os_connect_socket(host_file);
+       }
+
+       return(sock);
+}
+
+/*
+ * Free a chunk list built by hppfs_get_data(): every chunk linked after
+ * head, then head itself.  A NULL head is accepted and ignored.
+ */
+static void free_contents(struct hppfs_data *head)
+{
+       struct list_head *pos, *following;
+
+       if(head != NULL){
+               /* following is fetched before pos is freed */
+               list_for_each_safe(pos, following, &head->list)
+                       kfree(list_entry(pos, struct hppfs_data, list));
+
+               kfree(head);
+       }
+}
+
+/*
+ * Read everything available on fd into a list of hppfs_data chunks.
+ *
+ * When filter is set, the proc file is first copied to fd and the socket
+ * is shut down with os_shutdown_socket(fd, 0, 1) before reading back.
+ * Reading stops at the first empty or short read.  *size_out receives
+ * the total number of bytes stored.
+ *
+ * Returns the head of the chunk list, or an ERR_PTR() on failure.
+ */
+static struct hppfs_data *hppfs_get_data(int fd, int filter,
+                                        struct file *proc_file,
+                                        struct file *hppfs_file,
+                                        loff_t *size_out)
+{
+       struct hppfs_data *head, *cur, *next;
+       int n, err;
+
+       head = kmalloc(sizeof(*head), GFP_KERNEL);
+       if(head == NULL){
+               printk("hppfs_get_data : head allocation failed\n");
+               return(ERR_PTR(-ENOMEM));
+       }
+
+       INIT_LIST_HEAD(&head->list);
+       cur = head;
+       *size_out = 0;
+
+       if(filter){
+               /* The head chunk doubles as the copy buffer here */
+               for(;;){
+                       n = read_proc(proc_file, cur->contents,
+                                     sizeof(cur->contents), NULL, 0);
+                       if(n <= 0)
+                               break;
+                       os_write_file(fd, cur->contents, n);
+               }
+
+               err = os_shutdown_socket(fd, 0, 1);
+               if(err){
+                       printk("hppfs_get_data : failed to shut down "
+                              "socket\n");
+                       goto failed_free;
+               }
+       }
+
+       for(;;){
+               n = os_read_file(fd, cur->contents, sizeof(cur->contents));
+               if(n < 0){
+                       err = n;
+                       printk("hppfs_get_data : read failed, errno = %d\n",
+                              err);
+                       goto failed_free;
+               }
+               if(n == 0)
+                       break;
+
+               *size_out += n;
+
+               /* A partly filled chunk is treated as the last one */
+               if(n < sizeof(cur->contents))
+                       break;
+
+               next = kmalloc(sizeof(*next), GFP_KERNEL);
+               if(next == NULL){
+                       printk("hppfs_get_data : data allocation failed\n");
+                       err = -ENOMEM;
+                       goto failed_free;
+               }
+
+               /* Link after the current tail so chunks stay in order */
+               INIT_LIST_HEAD(&next->list);
+               list_add(&next->list, &cur->list);
+               cur = next;
+       }
+       return(head);
+
+ failed_free:
+       free_contents(head);
+       return(ERR_PTR(err));
+}
+
+/*
+ * Allocate the per-open private structure with no host fd (-1), a length
+ * of -1 and no cached contents.  Returns NULL if the allocation fails.
+ */
+static struct hppfs_private *hppfs_data(void)
+{
+       struct hppfs_private *priv = kmalloc(sizeof(*priv), GFP_KERNEL);
+
+       if(priv != NULL)
+               *priv = ((struct hppfs_private ) { .contents    = NULL,
+                                                  .len         = -1,
+                                                  .host_fd     = -1 } );
+       return(priv);
+}
+
+/*
+ * Translate FMODE_READ/FMODE_WRITE bits into the matching O_* access
+ * mode.  Any other value yields 0.
+ */
+static int file_mode(int fmode)
+{
+       switch(fmode){
+       case FMODE_READ | FMODE_WRITE:
+               return(O_RDWR);
+       case FMODE_READ:
+               return(O_RDONLY);
+       case FMODE_WRITE:
+               return(O_WRONLY);
+       default:
+               return(0);
+       }
+}
+
+/*
+ * Open an hppfs file.
+ *
+ * The underlying proc file is always opened.  If the host side provides
+ * a regular file of the same name, it is opened and reads come from it.
+ * If the host side provides a directory, a socket inside it is connected
+ * and its output is captured with hppfs_get_data().  A failure on the
+ * host side is not fatal: the open still succeeds and hppfs_read() falls
+ * back to the proc file.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int hppfs_open(struct inode *inode, struct file *file)
+{
+       struct hppfs_private *data;
+       struct dentry *proc_dentry;
+       char *host_file;
+       int err, fd, type, filter;
+
+       err = -ENOMEM;
+       data = hppfs_data();
+       if(data == NULL)
+               goto out;
+
+       /* Extra room is requested for the suffix open_host_sock() appends */
+       host_file = dentry_name(file->f_dentry, strlen("/rw"));
+       if(host_file == NULL)
+               goto out_free2;
+
+       proc_dentry = HPPFS_I(inode)->proc_dentry;
+
+       /* XXX This isn't closed anywhere */
+       err = open_private_file(&data->proc_file, proc_dentry,
+                               file_mode(file->f_mode));
+       if(err)
+               goto out_free1;
+
+       type = os_file_type(host_file);
+       if(type == OS_TYPE_FILE){
+               fd = os_open_file(host_file, of_read(OPENFLAGS()), 0);
+               if(fd >= 0)
+                       data->host_fd = fd;
+               else printk("hppfs_open : failed to open '%s', errno = %d\n",
+                           host_file, -fd);
+
+               data->contents = NULL;
+       }
+       else if(type == OS_TYPE_DIR){
+               fd = open_host_sock(host_file, &filter);
+               if(fd > 0){
+                       data->contents = hppfs_get_data(fd, filter,
+                                                       &data->proc_file,
+                                                       file, &data->len);
+                       if(!IS_ERR(data->contents))
+                               data->host_fd = fd;
+                       else {
+                               /* hppfs_get_data() already logged the
+                                * failure.  Drop the error pointer so that
+                                * hppfs_read() does not dereference it, and
+                                * close the socket that would otherwise
+                                * leak.
+                                */
+                               data->contents = NULL;
+                               data->len = -1;
+                               os_close_file(fd);
+                       }
+               }
+               else printk("hppfs_open : failed to open a socket in "
+                           "'%s', errno = %d\n", host_file, -fd);
+       }
+       kfree(host_file);
+
+       file->private_data = data;
+       return(0);
+
+ out_free1:
+       kfree(host_file);
+ out_free2:
+       free_contents(data->contents);
+       kfree(data);
+ out:
+       return(err);
+}
+
+/*
+ * Open an hppfs directory: allocate the private data and open the
+ * matching proc directory with the same access mode.  Returns 0 on
+ * success, -ENOMEM, or the error from open_private_file().
+ */
+static int hppfs_dir_open(struct inode *inode, struct file *file)
+{
+       struct hppfs_private *priv;
+       int ret;
+
+       priv = hppfs_data();
+       if(priv == NULL)
+               return(-ENOMEM);
+
+       ret = open_private_file(&priv->proc_file,
+                               HPPFS_I(inode)->proc_dentry,
+                               file_mode(file->f_mode));
+       if(ret){
+               kfree(priv);
+               return(ret);
+       }
+
+       file->private_data = priv;
+       return(0);
+}
+
+/*
+ * Seek on an hppfs file.  If the proc file has an llseek operation it is
+ * called first and a negative result is returned as is; the hppfs file
+ * itself is then positioned with default_llseek().
+ */
+static loff_t hppfs_llseek(struct file *file, loff_t off, int where)
+{
+       struct hppfs_private *priv = file->private_data;
+       struct file *proc = &priv->proc_file;
+       loff_t (*proc_seek)(struct file *, loff_t, int) =
+               proc->f_dentry->d_inode->i_fop->llseek;
+
+       if(proc_seek != NULL){
+               loff_t pos = (*proc_seek)(proc, off, where);
+
+               if(pos < 0)
+                       return(pos);
+       }
+
+       return(default_llseek(file, off, where));
+}
+
+/*
+ * File operations for hppfs inodes that are not directories; installed
+ * by init_inode().
+ */
+static struct file_operations hppfs_file_fops = {
+       .owner          = NULL,
+       .llseek         = hppfs_llseek,
+       .read           = hppfs_read,
+       .write          = hppfs_write,
+       .open           = hppfs_open,
+};
+
+/*
+ * Context handed to hppfs_filldir() while hppfs_readdir() runs: the
+ * caller's original filldir callback and its argument, plus the dentry
+ * of the directory being listed.
+ */
+struct hppfs_dirent {
+       void *vfs_dirent;       /* opaque argument for the caller's filldir */
+       filldir_t filldir;      /* the caller's filldir callback */
+       struct dentry *dentry;  /* hppfs directory being read */
+};
+
+/*
+ * filldir callback used by hppfs_readdir().  Entries for which
+ * file_removed() is true are skipped (0 is returned); all others are
+ * passed on to the caller's own filldir.
+ */
+static int hppfs_filldir(void *d, const char *name, int size,
+                        loff_t offset, ino_t inode, unsigned int type)
+{
+       struct hppfs_dirent *ctx = d;
+
+       if(!file_removed(ctx->dentry, name))
+               return((*ctx->filldir)(ctx->vfs_dirent, name, size, offset,
+                                      inode, type));
+
+       return(0);
+}
+
+/*
+ * List an hppfs directory by running readdir on the underlying proc
+ * directory with hppfs_filldir() in front of the caller's callback.
+ * f_pos is copied to the proc file before the call and back afterwards.
+ */
+static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir)
+{
+       struct hppfs_private *priv = file->private_data;
+       struct file *proc = &priv->proc_file;
+       struct hppfs_dirent ctx;
+       int ret;
+
+       ctx.vfs_dirent = ent;
+       ctx.filldir = filldir;
+       ctx.dentry = file->f_dentry;
+
+       proc->f_pos = file->f_pos;
+       ret = (*proc->f_dentry->d_inode->i_fop->readdir)(proc, &ctx,
+                                                        hppfs_filldir);
+       file->f_pos = proc->f_pos;
+
+       return(ret);
+}
+
+/* fsync for hppfs directories: does nothing and always returns 0. */
+static int hppfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+       return(0);
+}
+
+/* File operations for hppfs directories; installed by init_inode(). */
+static struct file_operations hppfs_dir_fops = {
+       .owner          = NULL,
+       .readdir        = hppfs_readdir,
+       .open           = hppfs_dir_open,
+       .fsync          = hppfs_fsync,
+};
+
+/*
+ * statfs for hppfs: reports the hppfs magic number and zero for every
+ * block and file count.  Always returns 0.
+ */
+static int hppfs_statfs(struct super_block *sb, struct kstatfs *sf)
+{
+       sf->f_type = HPPFS_SUPER_MAGIC;
+
+       /* No storage of its own, so every count is zero */
+       sf->f_blocks = 0;
+       sf->f_bfree = 0;
+       sf->f_bavail = 0;
+
+       sf->f_files = 0;
+       sf->f_ffree = 0;
+
+       return(0);
+}
+
+/*
+ * Allocate an hppfs_inode_info with no proc dentry attached and return
+ * the VFS inode embedded in it, or NULL if the allocation fails.
+ */
+static struct inode *hppfs_alloc_inode(struct super_block *sb)
+{
+       struct hppfs_inode_info *info = kmalloc(sizeof(*info), GFP_KERNEL);
+
+       if(info != NULL){
+               *info = ((struct hppfs_inode_info) { .proc_dentry = NULL });
+               inode_init_once(&info->vfs_inode);
+               return(&info->vfs_inode);
+       }
+
+       return(NULL);
+}
+
+/* delete_inode super operation: just calls clear_inode() on the inode. */
+void hppfs_delete_inode(struct inode *ino)
+{
+       clear_inode(ino);
+}
+
+/*
+ * destroy_inode super operation: frees what HPPFS_I(inode) returns,
+ * presumably the hppfs_inode_info allocated by hppfs_alloc_inode()
+ * (HPPFS_I is defined outside this section - confirm).
+ */
+static void hppfs_destroy_inode(struct inode *inode)
+{
+       kfree(HPPFS_I(inode));
+}
+
+/* Superblock operations; installed on sb->s_op by hppfs_fill_super(). */
+static struct super_operations hppfs_sbops = { 
+       .alloc_inode    = hppfs_alloc_inode,
+       .destroy_inode  = hppfs_destroy_inode,
+       .read_inode     = hppfs_read_inode,
+       .delete_inode   = hppfs_delete_inode,
+       .statfs         = hppfs_statfs,
+};
+
+/*
+ * readlink for hppfs symlinks: opens the underlying proc dentry, calls
+ * its readlink operation and closes it again.  Returns that operation's
+ * result, or the error from open_private_file().
+ */
+static int hppfs_readlink(struct dentry *dentry, char *buffer, int buflen)
+{
+       struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry;
+       struct file proc_file;
+       int ret;
+
+       ret = open_private_file(&proc_file, proc_dentry, O_RDONLY);
+       if(ret)
+               return(ret);
+
+       ret = (*proc_dentry->d_inode->i_op->readlink)(proc_dentry, buffer,
+                                                     buflen);
+       close_private_file(&proc_file);
+
+       return(ret);
+}
+
+/*
+ * follow_link for hppfs symlinks: opens the underlying proc dentry,
+ * calls its follow_link operation and closes it again.  Returns that
+ * operation's result, or the error from open_private_file().
+ */
+static int hppfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+       struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry;
+       struct file proc_file;
+       int ret;
+
+       ret = open_private_file(&proc_file, proc_dentry, O_RDONLY);
+       if(ret)
+               return(ret);
+
+       ret = (*proc_dentry->d_inode->i_op->follow_link)(proc_dentry, nd);
+       close_private_file(&proc_file);
+
+       return(ret);
+}
+
+/* Inode operations for hppfs directories; installed by init_inode(). */
+static struct inode_operations hppfs_dir_iops = {
+       .lookup         = hppfs_lookup,
+};
+
+/* Inode operations for hppfs symlinks; installed by init_inode(). */
+static struct inode_operations hppfs_link_iops = {
+       .readlink       = hppfs_readlink,
+       .follow_link    = hppfs_follow_link,
+};
+
+/*
+ * Choose the inode and file operations for an hppfs inode from the type
+ * of the proc dentry it shadows, and remember that dentry in the inode.
+ * Always returns 0.
+ */
+static int init_inode(struct inode *inode, struct dentry *dentry)
+{
+       if(S_ISDIR(dentry->d_inode->i_mode)){
+               inode->i_op = &hppfs_dir_iops;
+               inode->i_fop = &hppfs_dir_fops;
+       }
+       else {
+               /* Symlinks and everything else share the file fops */
+               inode->i_fop = &hppfs_file_fops;
+               if(S_ISLNK(dentry->d_inode->i_mode))
+                       inode->i_op = &hppfs_link_iops;
+               else
+                       inode->i_op = &hppfs_file_iops;
+       }
+
+       HPPFS_I(inode)->proc_dentry = dentry;
+
+       return(0);
+}
+
+/*
+ * Fill in an hppfs superblock.  The first superblock registered for the
+ * "proc" filesystem type is looked up, and its root dentry becomes the
+ * proc dentry shadowed by the hppfs root inode.
+ *
+ * Returns 0 on success; -ENOENT if proc is not registered, has no
+ * superblock, or the root inode cannot be obtained; -ENOMEM if the root
+ * dentry cannot be allocated; or the error from init_inode().
+ */
+static int hppfs_fill_super(struct super_block *sb, void *d, int silent)
+{
+       struct file_system_type *procfs;
+       struct super_block *proc_sb;
+       struct inode *root;
+       int err;
+
+       procfs = get_fs_type("proc");
+       if(procfs == NULL || list_empty(&procfs->fs_supers))
+               return(-ENOENT);
+
+       proc_sb = list_entry(procfs->fs_supers.next, struct super_block,
+                            s_instances);
+
+       sb->s_blocksize = 1024;
+       sb->s_blocksize_bits = 10;
+       sb->s_magic = HPPFS_SUPER_MAGIC;
+       sb->s_op = &hppfs_sbops;
+
+       root = iget(sb, 0);
+       if(root == NULL)
+               return(-ENOENT);
+
+       err = init_inode(root, proc_sb->s_root);
+       if(err)
+               goto out_put;
+
+       sb->s_root = d_alloc_root(root);
+       if(sb->s_root == NULL){
+               err = -ENOMEM;
+               goto out_put;
+       }
+
+       hppfs_read_inode(root);
+
+       return(0);
+
+ out_put:
+       iput(root);
+       return(err);
+}
+
+/*
+ * get_sb callback: hands off to get_sb_nodev() with hppfs_fill_super()
+ * as the fill routine.  dev_name is not used.
+ */
+static struct super_block *hppfs_read_super(struct file_system_type *type,
+                                            int flags, const char *dev_name,
+                                            void *data)
+{
+       return(get_sb_nodev(type, flags, data, hppfs_fill_super));
+}
+
+/* Filesystem type registered by init_hppfs() under the name "hppfs". */
+static struct file_system_type hppfs_type = {
+       .owner          = THIS_MODULE,
+       .name           = "hppfs",
+       .get_sb         = hppfs_read_super,
+       .kill_sb        = kill_anon_super,
+       .fs_flags       = 0,
+};
+
+/* Module init: registers hppfs and returns register_filesystem()'s result. */
+static int __init init_hppfs(void)
+{
+       return(register_filesystem(&hppfs_type));
+}
+
+/* Module exit: unregisters the hppfs filesystem type. */
+static void __exit exit_hppfs(void)
+{
+       unregister_filesystem(&hppfs_type);
+}
+
+module_init(init_hppfs)
+module_exit(exit_hppfs)
+MODULE_LICENSE("GPL");
+
+/*
+ * Overrides for Emacs so that we follow Linus's tabbing style.
+ * Emacs will notice this stuff at the end of the file and automatically
+ * adjust the settings for this buffer only.  This must remain at the end
+ * of the file.
+ * ---------------------------------------------------------------------------
+ * Local variables:
+ * c-file-style: "linux"
+ * End:
+ */
diff --git a/fs/relayfs/Makefile b/fs/relayfs/Makefile
new file mode 100644 (file)
index 0000000..09f098a
--- /dev/null
@@ -0,0 +1,8 @@
+#
+# relayfs Makefile
+#
+
+obj-$(CONFIG_RELAYFS_FS) += relayfs.o
+
+relayfs-y := relay.o relay_lockless.o relay_locking.o inode.o resize.o
+relayfs-$(CONFIG_KLOG_CHANNEL) += klog.o
diff --git a/fs/relayfs/inode.c b/fs/relayfs/inode.c
new file mode 100644 (file)
index 0000000..6e87360
--- /dev/null
@@ -0,0 +1,629 @@
+/*
+ * VFS-related code for RelayFS, a high-speed data relay filesystem.
+ *
+ * Copyright (C) 2003 - Tom Zanussi <zanussi@us.ibm.com>, IBM Corp
+ * Copyright (C) 2003 - Karim Yaghmour <karim@opersys.com>
+ *
+ * Based on ramfs, Copyright (C) 2002 - Linus Torvalds
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include <linux/namei.h>
+#include <linux/poll.h>
+#include <asm/uaccess.h>
+#include <asm/relay.h>
+
+#define RELAYFS_MAGIC                  0x26F82121
+
+static struct super_operations         relayfs_ops;
+static struct address_space_operations relayfs_aops;
+static struct inode_operations         relayfs_file_inode_operations;
+static struct file_operations          relayfs_file_operations;
+static struct inode_operations         relayfs_dir_inode_operations;
+
+static struct vfsmount *               relayfs_mount;
+static int                             relayfs_mount_count;
+
+/*
+ * Backing device info attached to the mapping of every inode created by
+ * relayfs_get_inode().
+ */
+static struct backing_dev_info         relayfs_backing_dev_info = {
+       .ra_pages       = 0,    /* No readahead */
+       .memory_backed  = 1,    /* Does not contribute to dirty memory */
+};
+
+/*
+ * Allocate a new relayfs inode on sb and initialise it for the file
+ * type encoded in mode.  Ownership comes from the current task's
+ * fsuid/fsgid.  Returns NULL if new_inode() fails.
+ */
+static struct inode *
+relayfs_get_inode(struct super_block *sb, int mode, dev_t dev)
+{
+       struct inode *ino = new_inode(sb);
+
+       if (ino == NULL)
+               return NULL;
+
+       ino->i_mode = mode;
+       ino->i_uid = current->fsuid;
+       ino->i_gid = current->fsgid;
+       ino->i_blksize = PAGE_CACHE_SIZE;
+       ino->i_blocks = 0;
+       ino->i_mapping->a_ops = &relayfs_aops;
+       ino->i_mapping->backing_dev_info = &relayfs_backing_dev_info;
+       ino->i_atime = ino->i_mtime = ino->i_ctime = CURRENT_TIME;
+
+       switch (mode & S_IFMT) {
+       case S_IFREG:
+               ino->i_op = &relayfs_file_inode_operations;
+               ino->i_fop = &relayfs_file_operations;
+               break;
+       case S_IFDIR:
+               ino->i_op = &relayfs_dir_inode_operations;
+               ino->i_fop = &simple_dir_operations;
+
+               /* a directory starts with one extra link for its "." entry */
+               ino->i_nlink++;
+               break;
+       case S_IFLNK:
+               ino->i_op = &page_symlink_inode_operations;
+               break;
+       default:
+               init_special_inode(ino, mode, dev);
+               break;
+       }
+
+       return ino;
+}
+
+/*
+ * Node creation: allocate an inode, bind it to the dentry and take an
+ * extra dentry reference to pin it in core.  Returns 0, or -ENOSPC when
+ * no inode could be allocated.
+ */
+/* SMP-safe */
+static int 
+relayfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
+{
+       struct inode *ino = relayfs_get_inode(dir->i_sb, mode, dev);
+
+       if (ino == NULL)
+               return -ENOSPC;
+
+       d_instantiate(dentry, ino);
+       dget(dentry);   /* Extra count - pin the dentry in core */
+
+       return 0;
+}
+
+/*
+ * Create a directory through relayfs_mknod() and, on success, bump the
+ * parent's link count.
+ */
+static int 
+relayfs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
+{
+       int err = relayfs_mknod(dir, dentry, mode | S_IFDIR, 0);
+
+       if (err)
+               return err;
+
+       dir->i_nlink++;
+       return 0;
+}
+
+/*
+ * Create a regular file: relayfs_mknod() with S_IFREG added to the
+ * mode.  The nameidata argument is not used.
+ */
+static int 
+relayfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd)
+{
+       return relayfs_mknod(dir, dentry, mode | S_IFREG, 0);
+}
+
+/*
+ * Create a symlink whose target is symname.  Returns 0 on success,
+ * -ENOSPC if no inode could be allocated, or the error from
+ * page_symlink(), in which case the new inode is released again.
+ */
+static int 
+relayfs_symlink(struct inode * dir, struct dentry *dentry, const char * symname)
+{
+       struct inode *ino;
+       int err;
+
+       ino = relayfs_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0);
+       if (ino == NULL)
+               return -ENOSPC;
+
+       /* the length passed on includes the terminating NUL */
+       err = page_symlink(ino, symname, strlen(symname)+1);
+       if (err) {
+               iput(ino);
+               return err;
+       }
+
+       d_instantiate(dentry, ino);
+       dget(dentry);
+       return 0;
+}
+
+/**
+ *     relayfs_create_entry - create a relayfs directory or file
+ *     @name: the name of the file to create
+ *     @parent: parent directory, or NULL for the root of the relayfs mount
+ *     @dentry: result dentry
+ *     @entry_type: type of file to create (S_IFREG, S_IFDIR)
+ *     @mode: mode
+ *     @data: data to associate with the file
+ *
+ *     Creates a file or directory with the specified permissions.
+ *
+ *     The relayfs mount is pinned for the duration of the call.  For a
+ *     successfully created regular file the pin is kept; it is dropped by
+ *     relayfs_remove_file().  In every other case it is dropped before
+ *     returning.
+ *
+ *     Returns 0 on success or a negative errno.  *dentry is set to the
+ *     result of the lookup whenever the lookup was attempted, including
+ *     on failure.
+ */
+static int 
+relayfs_create_entry(const char * name, struct dentry * parent, struct dentry **dentry, int entry_type, int mode, void * data)
+{
+       struct qstr qname;
+       struct dentry * d;
+       
+       int error = 0;
+
+       error = simple_pin_fs("relayfs", &relayfs_mount, &relayfs_mount_count);
+       if (error) {
+               printk(KERN_ERR "Couldn't mount relayfs: errcode %d\n", error);
+               return error;
+       }
+
+       qname.name = name;
+       qname.len = strlen(name);
+       qname.hash = full_name_hash(name, qname.len);
+
+       if (parent == NULL)
+               if (relayfs_mount && relayfs_mount->mnt_sb)
+                       parent = relayfs_mount->mnt_sb->s_root;
+
+       if (parent == NULL) {
+               simple_release_fs(&relayfs_mount, &relayfs_mount_count);
+               return -EINVAL;
+       }
+
+       parent = dget(parent);
+       down(&parent->d_inode->i_sem);
+       d = lookup_hash(&qname, parent);
+       if (IS_ERR(d)) {
+               error = PTR_ERR(d);
+               goto release_mount;
+       }
+       
+       if (d->d_inode) {
+               error = -EEXIST;
+               goto release_mount;
+       }
+
+       if (entry_type == S_IFREG)
+               error = relayfs_create(parent->d_inode, d, entry_type | mode, NULL);
+       else
+               error = relayfs_mkdir(parent->d_inode, d, entry_type | mode);
+       if (error)
+               goto release_mount;
+
+       if (entry_type == S_IFREG) {
+               if (data)
+                       d->d_inode->u.generic_ip = data;
+               /*
+                * Keep the mount pinned for every regular file, with or
+                * without data: relayfs_remove_file() releases it for any
+                * regular file, so dropping it here as well would
+                * unbalance the pin count.
+                */
+               goto exit;
+       }
+
+release_mount:
+       simple_release_fs(&relayfs_mount, &relayfs_mount_count);
+exit:  
+       *dentry = d;
+       up(&parent->d_inode->i_sem);
+       dput(parent);
+
+       return error;
+}
+
+/**
+ *     relayfs_create_file - create a file in the relay filesystem
+ *     @name: the name of the file to create
+ *     @parent: parent directory
+ *     @dentry: result dentry
+ *     @data: data to associate with the file
+ *     @mode: mode, if not specified the default perms are used
+ *
+ *     A mode of 0 selects the default, read/write for the owner only.
+ *     Returns the result of relayfs_create_entry().
+ */
+int 
+relayfs_create_file(const char * name, struct dentry * parent, struct dentry **dentry, void * data, int mode)
+{
+       int perms = mode ? mode : (S_IRUSR | S_IWUSR);
+
+       return relayfs_create_entry(name, parent, dentry, S_IFREG,
+                                   perms, data);
+}
+
+/**
+ *     relayfs_create_dir - create a directory in the relay filesystem
+ *     @name: the name of the directory to create
+ *     @parent: parent directory
+ *     @dentry: result dentry
+ *
+ *     The directory is created with rwx permission for the owner and
+ *     read/search permission for group and others (S_IRWXU | S_IRUGO |
+ *     S_IXUGO).  Returns the result of relayfs_create_entry().
+ */
+int 
+relayfs_create_dir(const char * name, struct dentry * parent, struct dentry **dentry)
+{
+       return relayfs_create_entry(name, parent, dentry, S_IFDIR,
+                                   S_IRWXU | S_IRUGO | S_IXUGO, NULL);
+}
+
+/**
+ *     relayfs_remove_file - remove a file in the relay filesystem
+ *     @dentry: file dentry
+ *
+ *     Remove a file previously created by relayfs_create_file.  For a
+ *     regular file the pin on the relayfs mount is released as well.
+ *
+ *     Returns 0 on success, -EINVAL if @dentry is NULL or has no parent.
+ */
+int 
+relayfs_remove_file(struct dentry *dentry)
+{
+       struct dentry *parent;
+       int is_reg = 0;
+       
+       if (dentry == NULL)
+               return -EINVAL;
+
+       parent = dentry->d_parent;
+       if (parent == NULL)
+               return -EINVAL;
+
+       parent = dget(parent);
+       down(&parent->d_inode->i_sem);
+       if (dentry->d_inode) {
+               /*
+                * Look at the file type only once the inode is known to
+                * exist, and before the unlink below can detach it.
+                */
+               is_reg = S_ISREG(dentry->d_inode->i_mode);
+               simple_unlink(parent->d_inode, dentry);
+               d_delete(dentry);
+       }
+       dput(dentry);
+       up(&parent->d_inode->i_sem);
+       dput(parent);
+
+       if(is_reg)
+               simple_release_fs(&relayfs_mount, &relayfs_mount_count);
+
+       return 0;
+}
+
+/**
+ *     relayfs_open - open file op for relayfs files
+ *     @inode: the inode
+ *     @filp: the file
+ *
+ *     If the inode has a channel attached, registers an auto-consuming
+ *     reader for this file, notifies the channel's fileop_notify
+ *     callback and, if the callback accepts the open, takes a reference
+ *     on the channel.  Inodes without a channel open without further
+ *     action.
+ *
+ *     Returns 0 on success, -ENOMEM if no reader could be added, -EPERM
+ *     if the callback rejected the open.
+ */
+int
+relayfs_open(struct inode *inode, struct file *filp)
+{
+       struct rchan *rchan = (struct rchan *)inode->u.generic_ip;
+       struct rchan_reader *reader;
+
+       if (rchan == NULL)
+               return 0;
+
+       reader = __add_rchan_reader(rchan, filp, 1, 0);
+       if (reader == NULL)
+               return -ENOMEM;
+       filp->private_data = reader;
+
+       if (rchan->callbacks->fileop_notify(rchan->id, filp,
+                                           RELAY_FILE_OPEN) != 0) {
+               __remove_rchan_reader(reader);
+               return -EPERM;
+       }
+
+       /* Inc relay channel refcount for file */
+       rchan_get(rchan->id);
+
+       return 0;
+}
+
+/**
+ *     relayfs_mmap - mmap file op for relayfs files
+ *     @filp: the file
+ *     @vma: the vma describing what to map
+ *
+ *     Calls upon relay_mmap_buffer to map the file into user space.
+ *
+ *     Returns -EPERM if the file has no reader or channel attached,
+ *     otherwise the result of __relay_mmap_buffer().
+ */
+int 
+relayfs_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+       struct rchan_reader *reader;
+
+       /*
+        * relayfs_open() attaches a reader only when the inode carries a
+        * channel, so private_data may be unset; check it as the write and
+        * ioctl ops do.
+        */
+       reader = (struct rchan_reader *)filp->private_data;
+       if (reader == NULL || reader->rchan == NULL)
+               return -EPERM;
+
+       return __relay_mmap_buffer(reader->rchan, vma);
+}
+
+/**
+ *     relayfs_file_read - read file op for relayfs files
+ *     @filp: the file
+ *     @buf: user buf to read into
+ *     @count: bytes requested
+ *     @offset: offset into file
+ *
+ *     Reads count bytes from the channel, or as much as is available within
+ *     the sub-buffer currently being read.  Reads are 'auto-consuming'.
+ *     See relay_read() for details.
+ *
+ *     Returns bytes read on success, 0 or -EAGAIN if nothing available,
+ *     -ESPIPE for positioned reads, -EPERM if the file has no reader
+ *     attached, negative otherwise.
+ */
+ssize_t 
+relayfs_file_read(struct file *filp, char * buf, size_t count, loff_t *offset)
+{
+       ssize_t read_count;     /* signed: relay_read() may report an error */
+       struct rchan_reader *reader;
+       u32 dummy; /* all VFS readers are auto-consuming */
+
+       if (offset != &filp->f_pos) /* pread, seeking not supported */
+               return -ESPIPE;
+
+       if (count == 0)
+               return 0;
+
+       /*
+        * relayfs_open() attaches a reader only when the inode carries a
+        * channel, so private_data may be unset; check it as the write and
+        * ioctl ops do.
+        */
+       reader = (struct rchan_reader *)filp->private_data;
+       if (reader == NULL)
+               return -EPERM;
+
+       /* block (last-but-one argument 1) unless the file is non-blocking */
+       read_count = relay_read(reader, buf, count,
+               filp->f_flags & (O_NDELAY | O_NONBLOCK) ? 0 : 1, &dummy);
+
+       return read_count;
+}
+
+/**
+ *     relayfs_file_write - write file op for relayfs files
+ *     @filp: the file
+ *     @buf: user buf to write from
+ *     @count: bytes to write
+ *     @offset: offset into file
+ *
+ *     Reserves a slot in the relay buffer and writes count bytes
+ *     into it.  The current limit for a single write is 2 pages
+ *     worth.  The user_deliver() channel callback will be invoked
+ *     after a successful write.
+ *
+ *     Returns bytes written on success, -EAGAIN if nothing could be
+ *     written without blocking, -EPERM if the file has no reader or
+ *     channel, -EINVAL if count exceeds the limit, -ENOMEM, -EFAULT,
+ *     or the error from an interrupted wait.
+ */
+ssize_t 
+relayfs_file_write(struct file *filp, const char *buf, size_t count, loff_t *offset)
+{
+       int write_count = 0;
+       char * write_buf;
+       struct rchan *rchan;
+       int err = 0;
+       void *wrote_pos = NULL;
+       struct rchan_reader *reader;
+       ssize_t ret;
+
+       reader = (struct rchan_reader *)filp->private_data;
+       if (reader == NULL)
+               return -EPERM;
+
+       rchan = reader->rchan;
+       if (rchan == NULL)
+               return -EPERM;
+
+       if (count == 0)
+               return 0;
+
+       /* Change this if need to write more than 2 pages at once */
+       if (count > 2 * PAGE_SIZE)
+               return -EINVAL;
+       
+       /* order 1 == two pages, matching the limit checked above */
+       write_buf = (char *)__get_free_pages(GFP_KERNEL, 1);
+       if (write_buf == NULL)
+               return -ENOMEM;
+
+       if (copy_from_user(write_buf, buf, count)) {
+               ret = -EFAULT;
+               goto out_free;
+       }
+
+       if (filp->f_flags & (O_NDELAY | O_NONBLOCK)) {
+               write_count = relay_write(rchan->id, write_buf, count, -1, &wrote_pos);
+               if (write_count == 0) {
+                       ret = -EAGAIN;
+                       goto out_free;
+               }
+       } else {
+               /* retry the write each time the channel wakes writers */
+               err = wait_event_interruptible(rchan->write_wait,
+                (write_count = relay_write(rchan->id, write_buf, count, -1, &wrote_pos)));
+               if (err) {
+                       ret = err;
+                       goto out_free;
+               }
+       }
+
+       ret = write_count;
+
+out_free:
+       /* the bounce buffer is released on every path, errors included */
+       free_pages((unsigned long)write_buf, 1);
+
+       /* only tell the client about data that actually went in */
+       if (ret > 0)
+               rchan->callbacks->user_deliver(rchan->id, wrote_pos, write_count);
+
+       return ret;
+}
+
+/**
+ *     relayfs_ioctl - ioctl file op for relayfs files
+ *     @inode: the inode
+ *     @filp: the file
+ *     @cmd: the command
+ *     @arg: command arg
+ *
+ *     Hands cmd/arg to the ioctl callback of the channel's kernel
+ *     client.  arg may point to user-space data; copying it to or from
+ *     user space, and choosing a meaningful return value, is the
+ *     client's job.
+ *
+ *     Returns result of relay channel callback, -EPERM if the file has
+ *     no reader or channel.
+ */
+int
+relayfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg)
+{
+       struct rchan_reader *reader = (struct rchan_reader *)filp->private_data;
+       struct rchan *rchan = reader ? reader->rchan : NULL;
+
+       if (rchan == NULL)
+               return -EPERM;
+
+       return rchan->callbacks->ioctl(rchan->id, cmd, arg);
+}
+
+/**
+ *     relayfs_poll - poll file op for relayfs files
+ *     @filp: the file
+ *     @wait: poll table
+ *
+ *     Poll implementation.  Reports POLLERR when the file has no reader
+ *     or channel, or when the channel has been finalized.  Otherwise
+ *     reports readable when the channel is not empty and writable when
+ *     it is not full, for the access modes the file was opened with.
+ */
+static unsigned int
+relayfs_poll(struct file *filp, poll_table *wait)
+{
+       struct rchan_reader *reader;
+       unsigned int mask = 0;
+       
+       /*
+        * relayfs_open() attaches a reader only when the inode carries a
+        * channel, so private_data may be unset.
+        */
+       reader = (struct rchan_reader *)filp->private_data;
+       if (reader == NULL || reader->rchan == NULL)
+               return POLLERR;
+
+       if (reader->rchan->finalized)
+               return POLLERR;
+
+       if (filp->f_mode & FMODE_READ) {
+               poll_wait(filp, &reader->rchan->read_wait, wait);
+               if (!rchan_empty(reader))
+                       mask |= POLLIN | POLLRDNORM;
+       }
+       
+       if (filp->f_mode & FMODE_WRITE) {
+               poll_wait(filp, &reader->rchan->write_wait, wait);
+               if (!rchan_full(reader))
+                       mask |= POLLOUT | POLLWRNORM;
+       }
+       
+       return mask;
+}
+
+/**
+ *     relayfs_release - release file op for relayfs files
+ *     @inode: the inode
+ *     @filp: the file
+ *
+ *     Notifies the channel's fileop_notify callback of the close,
+ *     removes this file's reader and drops the channel reference.
+ *     Does nothing if the file has no reader or channel.  Always
+ *     returns 0.
+ */
+int
+relayfs_release(struct inode *inode, struct file *filp)
+{
+       struct rchan_reader *reader = (struct rchan_reader *)filp->private_data;
+
+       if (reader != NULL && reader->rchan != NULL) {
+               struct rchan *rchan = reader->rchan;
+
+               rchan->callbacks->fileop_notify(rchan->id, filp,
+                                               RELAY_FILE_CLOSE);
+               __remove_rchan_reader(reader);
+               /* The channel is no longer in use as far as this file is concerned */
+               rchan_put(rchan);
+       }
+
+       return 0;
+}
+
+/*
+ * Address space operations set on every relayfs inode mapping by
+ * relayfs_get_inode(); all entries are the generic simple_* helpers.
+ */
+static struct address_space_operations relayfs_aops = {
+       .readpage       = simple_readpage,
+       .prepare_write  = simple_prepare_write,
+       .commit_write   = simple_commit_write
+};
+
+/* File operations for regular relayfs files (set in relayfs_get_inode()). */
+static struct file_operations relayfs_file_operations = {
+       .open           = relayfs_open,
+       .read           = relayfs_file_read,
+       .write          = relayfs_file_write,
+       .ioctl          = relayfs_ioctl,
+       .poll           = relayfs_poll,
+       .mmap           = relayfs_mmap,
+       .fsync          = simple_sync_file,
+       .release        = relayfs_release,
+};
+
+/* Inode operations for regular relayfs files (set in relayfs_get_inode()). */
+static struct inode_operations relayfs_file_inode_operations = {
+       .getattr        = simple_getattr,
+};
+
+/*
+ * Inode operations for relayfs directories (set in relayfs_get_inode()).
+ * Creation goes through the relayfs_* helpers defined in this file; the
+ * rest are the generic simple_* helpers.
+ */
+static struct inode_operations relayfs_dir_inode_operations = {
+       .create         = relayfs_create,
+       .lookup         = simple_lookup,
+       .link           = simple_link,
+       .unlink         = simple_unlink,
+       .symlink        = relayfs_symlink,
+       .mkdir          = relayfs_mkdir,
+       .rmdir          = simple_rmdir,
+       .mknod          = relayfs_mknod,
+       .rename         = simple_rename,
+};
+
+/* Superblock operations; installed on sb->s_op by relayfs_fill_super(). */
+static struct super_operations relayfs_ops = {
+       .statfs         = simple_statfs,
+       .drop_inode     = generic_delete_inode,
+};
+
+/*
+ * Fill in a relayfs superblock and create its root directory with mode
+ * 0755.  Returns 0 on success, or -ENOMEM if the root inode or the root
+ * dentry cannot be allocated.
+ */
+static int 
+relayfs_fill_super(struct super_block * sb, void * data, int silent)
+{
+       struct inode *root_inode;
+       struct dentry *root_dentry;
+
+       sb->s_blocksize = PAGE_CACHE_SIZE;
+       sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+       sb->s_magic = RELAYFS_MAGIC;
+       sb->s_op = &relayfs_ops;
+
+       root_inode = relayfs_get_inode(sb, S_IFDIR | 0755, 0);
+       if (root_inode == NULL)
+               return -ENOMEM;
+
+       root_dentry = d_alloc_root(root_inode);
+       if (root_dentry != NULL) {
+               sb->s_root = root_dentry;
+               return 0;
+       }
+
+       /* no dentry took ownership of the inode, so release it here */
+       iput(root_inode);
+       return -ENOMEM;
+}
+
+/*
+ * get_sb callback: hands off to get_sb_single() with
+ * relayfs_fill_super() as the fill routine.  dev_name is not used.
+ */
+static struct super_block *
+relayfs_get_sb(struct file_system_type *fs_type,
+       int flags, const char *dev_name, void *data)
+{
+       return get_sb_single(fs_type, flags, data, relayfs_fill_super);
+}
+
+/* Filesystem type registered by init_relayfs_fs() under the name "relayfs". */
+static struct file_system_type relayfs_fs_type = {
+       .owner          = THIS_MODULE,
+       .name           = "relayfs",
+       .get_sb         = relayfs_get_sb,
+       .kill_sb        = kill_litter_super,
+};
+
+/*
+ * Module init: register the relayfs filesystem type and, when
+ * CONFIG_KLOG_CHANNEL is set and registration succeeded, create the
+ * klog channel.  Returns the result of register_filesystem().
+ */
+static int __init 
+init_relayfs_fs(void)
+{
+       int err;
+
+       err = register_filesystem(&relayfs_fs_type);
+       if (err)
+               return err;
+
+#ifdef CONFIG_KLOG_CHANNEL
+       create_klog_channel();
+#endif
+       return 0;
+}
+
+/*
+ * Module exit: remove the klog channel (only when CONFIG_KLOG_CHANNEL
+ * is set), then unregister the relayfs filesystem type.
+ */
+static void __exit 
+exit_relayfs_fs(void)
+{
+#ifdef CONFIG_KLOG_CHANNEL
+       remove_klog_channel();
+#endif
+       unregister_filesystem(&relayfs_fs_type);
+}
+
+module_init(init_relayfs_fs)
+module_exit(exit_relayfs_fs)
+
+MODULE_AUTHOR("Tom Zanussi <zanussi@us.ibm.com> and Karim Yaghmour <karim@opersys.com>");
+MODULE_DESCRIPTION("Relay Filesystem");
+MODULE_LICENSE("GPL");
+
diff --git a/fs/relayfs/relay.c b/fs/relayfs/relay.c
new file mode 100644 (file)
index 0000000..11f4636
--- /dev/null
@@ -0,0 +1,1911 @@
+/*
+ * Public API and common code for RelayFS.
+ *
+ * Please see Documentation/filesystems/relayfs.txt for API description.
+ * 
+ * Copyright (C) 2002, 2003 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
+ * Copyright (C) 1999, 2000, 2001, 2002 - Karim Yaghmour (karim@opersys.com)
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/time.h>
+#include <linux/page-flags.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/delay.h>
+
+#include <asm/io.h>
+#include <asm/current.h>
+#include <asm/uaccess.h>
+#include <asm/bitops.h>
+#include <asm/pgtable.h>
+#include <asm/relay.h>
+#include <asm/hardirq.h>
+
+#include "relay_lockless.h"
+#include "relay_locking.h"
+#include "resize.h"
+
+/*
+ * Relay channel table, indexed by channel id.  A NULL slot is a free id.
+ * rchan_table_lock guards the table: lookups take it for reading, id
+ * allocation and release take it for writing.
+ */
+static struct rchan *  rchan_table[RELAY_MAX_CHANNELS];
+static rwlock_t                rchan_table_lock = RW_LOCK_UNLOCKED;
+
+/*
+ * Relay operation structs, one per scheme.  rchan_create() points
+ * rchan->relay_ops at one of them according to the channel flags.
+ */
+
+/* Operations used when the channel has RELAY_SCHEME_LOCKLESS set. */
+static struct relay_ops lockless_ops = {
+       .reserve = lockless_reserve,
+       .commit = lockless_commit,
+       .get_offset = lockless_get_offset,
+       .finalize = lockless_finalize,
+       .reset = lockless_reset,
+       .reset_index = lockless_reset_index
+};
+
+/* Operations used when the channel does not have RELAY_SCHEME_LOCKLESS set. */
+static struct relay_ops locking_ops = {
+       .reserve = locking_reserve,
+       .commit = locking_commit,
+       .get_offset = locking_get_offset,
+       .finalize = locking_finalize,
+       .reset = locking_reset,
+       .reset_index = locking_reset_index
+};
+
+/*
+ * Low-level relayfs kernel API.  These functions should not normally be 
+ * used by clients.  See high-level kernel API below.
+ */
+
+/**
+ *     rchan_get - look up a channel by id and take a reference on it
+ *     @rchan_id: the channel id
+ *
+ *     Returns the channel with its refcount incremented, or NULL if the
+ *     id is out of range or the table slot is empty.  The lookup and the
+ *     increment both happen under the table read lock.
+ */
+struct rchan *
+rchan_get(int rchan_id)
+{
+       struct rchan *found = NULL;
+
+       if (rchan_id >= 0 && rchan_id < RELAY_MAX_CHANNELS) {
+               read_lock(&rchan_table_lock);
+               found = rchan_table[rchan_id];
+               if (found != NULL)
+                       atomic_inc(&found->refcount);
+               read_unlock(&rchan_table_lock);
+       }
+
+       return found;
+}
+
+/**
+ *     clear_readers - detach non-VFS readers from a channel
+ *     @rchan: the channel
+ *
+ *     Walks the channel's open_readers list under open_readers_lock
+ *     (taken for reading) and sets the rchan pointer of every reader
+ *     that is not a VFS reader to NULL.  VFS readers are left untouched.
+ */
+static inline void
+clear_readers(struct rchan *rchan)
+{
+       struct list_head *pos;
+
+       read_lock(&rchan->open_readers_lock);
+       list_for_each(pos, &rchan->open_readers) {
+               struct rchan_reader *cur;
+
+               cur = list_entry(pos, struct rchan_reader, list);
+               if (cur->vfs_reader)
+                       continue;
+               cur->rchan = NULL;
+       }
+       read_unlock(&rchan->open_readers_lock);
+}
+
+/**
+ *     rchan_alloc_id - claim the first free channel id for a channel
+ *     @rchan: the channel
+ *
+ *     Scans the channel table under the write lock, stores @rchan in the
+ *     first empty slot, records the slot index in rchan->id and takes a
+ *     reference on the channel.
+ *
+ *     Returns the channel id, or -1 if @rchan is NULL or the table is full.
+ */
+static inline int
+rchan_alloc_id(struct rchan *rchan)
+{
+       int slot;
+
+       if (!rchan)
+               return -1;
+
+       write_lock(&rchan_table_lock);
+       for (slot = 0; slot < RELAY_MAX_CHANNELS; slot++) {
+               if (rchan_table[slot] != NULL)
+                       continue;
+
+               rchan_table[slot] = rchan;
+               rchan->id = slot;
+               atomic_inc(&rchan->refcount);
+               break;
+       }
+       write_unlock(&rchan_table_lock);
+
+       /* The loop ran off the end only if no slot was free. */
+       return (slot < RELAY_MAX_CHANNELS) ? slot : -1;
+}
+
+/**
+ *     rchan_free_id - revoke a channel id and remove associated channel
+ *     @rchan_id: the channel id
+ *
+ *     Clears the channel table slot for @rchan_id under the table write
+ *     lock.  Out-of-range ids are ignored.  The channel itself and its
+ *     refcount are not touched here.
+ */
+static inline void
+rchan_free_id(int rchan_id)
+{
+       if ((rchan_id < 0) || (rchan_id >= RELAY_MAX_CHANNELS))
+               return;
+
+       write_lock(&rchan_table_lock);
+       rchan_table[rchan_id] = NULL;
+       write_unlock(&rchan_table_lock);
+}
+
+/**
+ *     rchan_destroy_buf - free the channel buffer if relayfs allocated it
+ *     @rchan: the channel
+ *
+ *     Does nothing when the channel has no buffer or is still using a
+ *     caller-supplied init_buf; otherwise hands the buffer and its page
+ *     array to free_rchan_buf().
+ */
+static inline void
+rchan_destroy_buf(struct rchan *rchan)
+{
+       if (rchan->buf == NULL || rchan->init_buf != NULL)
+               return;
+
+       free_rchan_buf(rchan->buf,
+                      rchan->buf_page_array,
+                      rchan->buf_page_count);
+}
+
+/**
+ *     relay_release - tear down a channel whose last reference is gone
+ *     @rchan: the channel
+ *
+ *     Returns 0 if successful, -EBADF if @rchan is NULL.
+ *
+ *     Releases the channel buffer, destroys the channel, and removes the
+ *     relay file from the relayfs filesystem.  Should only be called from 
+ *     rchan_put().  If we're here, it means by definition refcount is 0.
+ */
+static int 
+relay_release(struct rchan *rchan)
+{
+       if (rchan == NULL)
+               return -EBADF;
+
+       /* Free the buffer (unless it is a client init_buf), then the id slot */
+       rchan_destroy_buf(rchan);
+       rchan_free_id(rchan->id);
+       relayfs_remove_file(rchan->dentry);
+       /* Non-VFS readers must not keep a pointer to the freed channel */
+       clear_readers(rchan);
+       kfree(rchan);
+
+       return 0;
+}
+
+/**
+ *     rchan_put - decrement channel refcount, releasing it if 0
+ *     @rchan: the channel; must not be NULL (it is dereferenced
+ *             unconditionally)
+ *
+ *     If the refcount reaches 0, the channel will be destroyed via
+ *     relay_release().
+ */
+void 
+rchan_put(struct rchan *rchan)
+{
+       if (atomic_dec_and_test(&rchan->refcount))
+               relay_release(rchan);
+}
+
+/**
+ *     relay_reserve -  reserve a slot in the channel buffer
+ *     @rchan: the channel
+ *     @len: the length of the slot to reserve
+ *     @ts: timeval handed through to the scheme's reserve operation
+ *          (presumably receives the event timestamp -- confirm against
+ *          the scheme implementations)
+ *     @td: the time delta between buffer start and current write, or TSC
+ *     @err: receives the result flags
+ *     @interrupting: 1 if interrupting previous, used only in locking scheme;
+ *                    must not be NULL, it is cleared to 0 before the
+ *                    scheme's reserve operation runs
+ *
+ *     Returns pointer to the beginning of the reserved slot, NULL if error
+ *     (including a NULL @rchan).
+ *
+ *     The value stored through @err contains the result flags and is an
+ *     ORed combination of the following:
+ *
+ *     RELAY_BUFFER_SWITCH_NONE - no buffer switch occurred
+ *     RELAY_EVENT_DISCARD_NONE - event should not be discarded
+ *     RELAY_BUFFER_SWITCH - buffer switch occurred
+ *     RELAY_EVENT_DISCARD - event should be discarded (all buffers are full)
+ *     RELAY_EVENT_TOO_LONG - event won't fit into even an empty buffer
+ *
+ *     buffer_start and buffer_end callbacks are triggered at this point
+ *     if applicable.
+ */
+char *
+relay_reserve(struct rchan *rchan,
+             u32 len,
+             struct timeval *ts,
+             u32 *td,
+             int *err,
+             int *interrupting)
+{
+       if (rchan == NULL)
+               return NULL;
+       
+       *interrupting = 0;
+
+       /* Dispatch to the lockless or locking implementation */
+       return rchan->relay_ops->reserve(rchan, len, ts, td, err, interrupting);
+}
+
+
+/**
+ *     wakeup_readers - work function that wakes sleeping VFS readers
+ *     @private: the channel, passed as an opaque pointer
+ *
+ *     Waking is done from a work item rather than directly from commit
+ *     because calling directly from commit causes problems if you're
+ *     writing from say the scheduler.
+ */
+static void 
+wakeup_readers(void *private)
+{
+       struct rchan *chan = private;
+
+       wake_up_interruptible(&chan->read_wait);
+}
+
+
+/**
+ *     relay_commit - commit a reserved slot in the buffer
+ *     @rchan: the channel; a NULL channel is silently ignored
+ *     @from: commit the length starting here
+ *     @len: length committed
+ *     @reserve_code: result flags in the form produced by relay_reserve();
+ *                    only RELAY_BUFFER_SWITCH is examined here
+ *     @interrupting: 1 if interrupting previous, used only in locking scheme
+ *
+ *      After the write into the reserved buffer has been completed, this
+ *      function must be called in order for the relay to determine whether 
+ *      buffers are complete and to wake up VFS readers.
+ *
+ *     delivery callback is triggered at this point if applicable.
+ */
+void
+relay_commit(struct rchan *rchan,
+            char *from,
+            u32 len,
+            int reserve_code,
+            int interrupting)
+{
+       int deliver;
+
+       if (rchan == NULL)
+               return;
+       
+       /* Deliver on every commit in packet mode, or when a buffer switched */
+       deliver = packet_delivery(rchan) || 
+                  (reserve_code & RELAY_BUFFER_SWITCH);
+
+       rchan->relay_ops->commit(rchan, from, len, deliver, interrupting);
+
+       /* The params are always the same, so no worry about re-queuing */
+       if (deliver &&  waitqueue_active(&rchan->read_wait)) {
+               /* Defer the wakeup to a work item; see wakeup_readers() */
+               PREPARE_WORK(&rchan->wake_readers, wakeup_readers, rchan);
+               schedule_delayed_work(&rchan->wake_readers, 1);
+       }
+}
+
+/**
+ *     relay_get_offset - get current and max channel buffer offsets
+ *     @rchan: the channel; must not be NULL (no check is made here)
+ *     @max_offset: passed to the scheme's get_offset operation, which is
+ *                  expected to store the maximum channel offset through it
+ *
+ *     Returns whatever the scheme's get_offset operation returns
+ *     (the current channel buffer offset).
+ */
+u32
+relay_get_offset(struct rchan *rchan, u32 *max_offset)
+{
+       return rchan->relay_ops->get_offset(rchan, max_offset);
+}
+
+/**
+ *     reset_index - try once to reset the current channel index
+ *     @rchan: the channel; must not be NULL (no check is made here)
+ *     @old_index: the index read before reset
+ *
+ *     Attempts to reset the channel index to 0.  It tries once, and
+ *     if it fails, returns negative, 0 otherwise.  The work is done by
+ *     the scheme-specific reset_index operation.
+ */
+int
+reset_index(struct rchan *rchan, u32 old_index)
+{
+       return rchan->relay_ops->reset_index(rchan, old_index);
+}
+
+/*
+ * close() vm_op for relayfs file mappings.  Drops the channel's mapped
+ * count and tells the client, through fileop_notify, that the file was
+ * unmapped.  The reader is taken from the file's private_data.
+ */
+static void
+relay_file_mmap_close(struct vm_area_struct *vma)
+{
+       struct file *filp = vma->vm_file;
+       struct rchan_reader *reader = (struct rchan_reader *)filp->private_data;
+       struct rchan *chan = reader->rchan;
+
+       atomic_dec(&chan->mapped);
+       chan->callbacks->fileop_notify(chan->id, filp, RELAY_FILE_UNMAP);
+}
+
+/*
+ * vm_ops for relay file mappings.  Installed on the vma by
+ * __relay_mmap_buffer() once the buffer has been remapped; only close
+ * is provided.
+ */
+static struct vm_operations_struct relay_file_mmap_ops = {
+       .close = relay_file_mmap_close
+};
+
+/* \begin{Code inspired from BTTV driver} */
+/*
+ * Translate an address inside a vmalloc'ed area to a physical address:
+ * look up the backing page, take its kernel address, add back the
+ * offset within the page, and convert with __pa().
+ */
+static inline unsigned long 
+kvirt_to_pa(unsigned long adr)
+{
+       unsigned long page_kva;
+       unsigned long in_page;
+
+       page_kva = (unsigned long) page_address(vmalloc_to_page((void *) adr));
+       in_page = adr & (PAGE_SIZE - 1);
+
+       return __pa(page_kva | in_page);
+}
+
+/*
+ * Map a channel buffer into a user vma one page at a time.
+ *
+ * @vma:       the vma being populated
+ * @adr:       user virtual address at which mapping starts
+ * @start_pos: kernel virtual address of the start of the buffer
+ * @size:      number of bytes to map
+ *
+ * Each page is translated with kvirt_to_pa() and remapped individually
+ * with PAGE_SHARED protection.
+ *
+ * Returns 0 on success, -EAGAIN if remapping any page fails.
+ */
+static int
+relay_mmap_region(struct vm_area_struct *vma,
+                 const char *adr,
+                 const char *start_pos,
+                 unsigned long size)
+{
+       unsigned long start = (unsigned long) adr;
+       unsigned long page, pos;
+
+       pos = (unsigned long) start_pos;
+
+       while (size > 0) {
+               page = kvirt_to_pa(pos);
+               if (remap_page_range(vma, start, page, PAGE_SIZE, PAGE_SHARED))
+                       return -EAGAIN;
+               start += PAGE_SIZE;
+               pos += PAGE_SIZE;
+               /*
+                * size is unsigned: a plain subtraction would wrap around
+                * when size is not a multiple of PAGE_SIZE and the loop
+                * would keep mapping past the end of the buffer.
+                */
+               if (size > PAGE_SIZE)
+                       size -= PAGE_SIZE;
+               else
+                       size = 0;
+       }
+
+       return 0;
+}
+/* \end{Code inspired from BTTV driver} */
+
+/**
+ *     __relay_mmap_buffer: - mmap buffer to process address space
+ *     @rchan: the relay channel whose buffer is to be mapped
+ *     @vma: vm_area_struct describing memory to be mapped
+ *
+ *     Returns:
+ *     0 if ok
+ *     -EBADF, when @rchan is NULL
+ *     -EPERM, when the channel is still using a client-supplied init_buf
+ *     -EINVAL, invalid requested length (must equal the channel alloc_size)
+ *     -EAGAIN, when remap failed
+ *     otherwise the non-zero result of the fileop_notify callback
+ *
+ *     Caller should already have grabbed mmap_sem.
+ */
+int 
+__relay_mmap_buffer(struct rchan *rchan,
+                   struct vm_area_struct *vma)
+{
+       int err = 0;
+       unsigned long length = vma->vm_end - vma->vm_start;
+       struct file *filp = vma->vm_file;
+
+       if (rchan == NULL) {
+               err = -EBADF;
+               goto exit;
+       }
+
+       if (rchan->init_buf) {
+               err = -EPERM;
+               goto exit;
+       }
+       
+       /* Only whole-buffer mappings are supported */
+       if (length != (unsigned long)rchan->alloc_size) {
+               err = -EINVAL;
+               goto exit;
+       }
+
+       err = relay_mmap_region(vma,
+                               (char *)vma->vm_start,
+                               rchan->buf,
+                               rchan->alloc_size);
+
+       if (err == 0) {
+               vma->vm_ops = &relay_file_mmap_ops;
+               err = rchan->callbacks->fileop_notify(rchan->id, filp,
+                                                     RELAY_FILE_MAP);
+               /* The mapping is counted only if the client accepted it */
+               if (err == 0)
+                       atomic_inc(&rchan->mapped);
+       }
+exit:  
+       return err;
+}
+
+/*
+ * High-level relayfs kernel API.  See Documentation/filesystems/relayfs.txt.
+ */
+
+/*
+ * rchan_callback implementations defining default channel behavior.  Used
+ * in place of corresponding NULL values in client callback struct.
+ */
+
+/*
+ * buffer_end() default callback, installed by relay_open() when the
+ * client does not supply one.  Ignores its arguments and returns 0.
+ */
+static int 
+buffer_end_default_callback(int rchan_id,
+                           char *current_write_pos,
+                           char *end_of_buffer,
+                           struct timeval end_time,
+                           u32 end_tsc,
+                           int using_tsc) 
+{
+       return 0;
+}
+
+/*
+ * buffer_start() default callback, installed by relay_open() when the
+ * client does not supply one.  Ignores its arguments and returns 0.
+ */
+static int 
+buffer_start_default_callback(int rchan_id,
+                             char *current_write_pos,
+                             u32 buffer_id,
+                             struct timeval start_time,
+                             u32 start_tsc,
+                             int using_tsc)
+{
+       return 0;
+}
+
+/*
+ * deliver() default callback, installed by relay_open() when the client
+ * does not supply one.  Intentionally empty.
+ */
+static void 
+deliver_default_callback(int rchan_id, char *from, u32 len)
+{
+}
+
+/*
+ * user_deliver() default callback, installed by relay_open() when the
+ * client does not supply one.  Intentionally empty.
+ */
+static void 
+user_deliver_default_callback(int rchan_id, char *from, u32 len)
+{
+}
+
+/*
+ * needs_resize() default callback, installed by relay_open() when the
+ * client does not supply one.  Intentionally empty, so resize
+ * suggestions are dropped.
+ */
+static void
+needs_resize_default_callback(int rchan_id,
+                             int resize_type,
+                             u32 suggested_buf_size,
+                             u32 suggested_n_bufs)
+{
+}
+
+/*
+ * fileop_notify() default callback, installed by relay_open() when the
+ * client does not supply one.  Always returns 0; __relay_mmap_buffer()
+ * treats a 0 result as permission to count the mapping.
+ */
+static int
+fileop_notify_default_callback(int rchan_id,
+                              struct file *filp,
+                              enum relay_fileop fileop)
+{
+       return 0;
+}
+
+/*
+ * ioctl() default callback, installed by relay_open() when the client
+ * does not supply one.  Ignores its arguments and returns 0.
+ */
+static int
+ioctl_default_callback(int rchan_id,
+                      unsigned int cmd,
+                      unsigned long arg)
+{
+       return 0;
+}
+
+/*
+ * Relay channel default callbacks.  Used as the whole callback set when
+ * relay_open() is given a NULL callback struct, and restored on the
+ * channel by relay_close() via restore_callbacks().
+ */
+static struct rchan_callbacks default_channel_callbacks = {
+       .buffer_start = buffer_start_default_callback,
+       .buffer_end = buffer_end_default_callback,
+       .deliver = deliver_default_callback,
+       .user_deliver = user_deliver_default_callback,
+       .needs_resize = needs_resize_default_callback,
+       .fileop_notify = fileop_notify_default_callback,
+       .ioctl = ioctl_default_callback,
+};
+
+/**
+ *     check_attribute_flags - check sanity of channel attributes
+ *     @attribute_flags: channel attributes; adjusted in place (mode
+ *             defaulting, scheme and timestamp selection)
+ *     @resizeable: 1 if the channel is to be resizeable
+ *
+ *     Returns 0 if successful, -EINVAL if a required attribute is missing
+ *     or the requested combination cannot be satisfied.
+ *
+ *     Validation is done against the flags as requested by the caller;
+ *     the mode default is chosen from the flags as adjusted so far, so
+ *     that a resizeable channel never ends up with both
+ *     RELAY_MODE_CONTINUOUS and RELAY_MODE_NO_OVERWRITE set.
+ */
+static int
+check_attribute_flags(u32 *attribute_flags, int resizeable)
+{
+       u32 flags = *attribute_flags;   /* snapshot of the caller's request */
+       
+       if (!(flags & RELAY_DELIVERY_BULK) && !(flags & RELAY_DELIVERY_PACKET))
+               return -EINVAL; /* Delivery mode must be specified */
+       
+       if (!(flags & RELAY_USAGE_SMP) && !(flags & RELAY_USAGE_GLOBAL))
+               return -EINVAL; /* Usage must be specified */
+       
+       if (resizeable) {  /* Resizeable can never be continuous */
+               *attribute_flags &= ~RELAY_MODE_CONTINUOUS;
+               *attribute_flags |= RELAY_MODE_NO_OVERWRITE;
+       }
+       
+       if ((flags & RELAY_MODE_CONTINUOUS) &&
+           (flags & RELAY_MODE_NO_OVERWRITE))
+               return -EINVAL; /* Can't have it both ways */
+       
+       /*
+        * Default to continuous only if no mode is set at this point.
+        * This must look at the adjusted flags, not the snapshot: for a
+        * resizeable channel NO_OVERWRITE has just been forced on, and
+        * testing the stale snapshot would add CONTINUOUS on top of it.
+        */
+       if (!(*attribute_flags &
+             (RELAY_MODE_CONTINUOUS | RELAY_MODE_NO_OVERWRITE)))
+               *attribute_flags |= RELAY_MODE_CONTINUOUS;
+       
+       if (!(flags & RELAY_SCHEME_ANY))
+               return -EINVAL; /* One or both must be specified */
+       else if (flags & RELAY_SCHEME_LOCKLESS) {
+               if (have_cmpxchg())
+                       *attribute_flags &= ~RELAY_SCHEME_LOCKING;
+               else if (flags & RELAY_SCHEME_LOCKING)
+                       *attribute_flags &= ~RELAY_SCHEME_LOCKLESS;
+               else
+                       return -EINVAL; /* Locking scheme not an alternative */
+       }
+       
+       if (!(flags & RELAY_TIMESTAMP_ANY))
+               return -EINVAL; /* One or both must be specified */
+       else if (flags & RELAY_TIMESTAMP_TSC) {
+               if (have_tsc())
+                       *attribute_flags &= ~RELAY_TIMESTAMP_GETTIMEOFDAY;
+               else if (flags & RELAY_TIMESTAMP_GETTIMEOFDAY)
+                       *attribute_flags &= ~RELAY_TIMESTAMP_TSC;
+               else
+                       return -EINVAL; /* gettimeofday not an alternative */
+       }
+
+       return 0;
+}
+
+/*
+ * High-level API functions.
+ */
+
+/**
+ *     __relay_reset - internal reset function
+ *     @rchan: the channel
+ *     @init: 1 if this is a first-time channel initialization
+ *
+ *     See relay_reset for description of effect.
+ *
+ *     With @init set, the one-time members (version, semaphore, wait
+ *     queues, reader list and its lock) are initialized as well and the
+ *     refcount is set to 0.  In both cases all per-run state, including
+ *     resize_min/resize_max, is cleared and the scheme-specific reset
+ *     operation is invoked last.
+ */
+void
+__relay_reset(struct rchan *rchan, int init)
+{
+       int i;
+       
+       if (init) {
+               rchan->version = RELAYFS_CHANNEL_VERSION;
+               init_MUTEX(&rchan->resize_sem);
+               init_waitqueue_head(&rchan->read_wait);
+               init_waitqueue_head(&rchan->write_wait);
+               /*
+                * NOTE(review): this discards the increment made earlier by
+                * rchan_alloc_id(); relay_open() takes its reference only
+                * after this call.
+                */
+               atomic_set(&rchan->refcount, 0);
+               INIT_LIST_HEAD(&rchan->open_readers);
+               rchan->open_readers_lock = RW_LOCK_UNLOCKED;
+       }
+       
+       /* Buffer position and production/consumption accounting */
+       rchan->buf_id = rchan->buf_idx = 0;
+       atomic_set(&rchan->suspended, 0);
+       atomic_set(&rchan->mapped, 0);
+       rchan->half_switch = 0;
+       rchan->bufs_produced = 0;
+       rchan->bufs_consumed = 0;
+       rchan->bytes_consumed = 0;
+       rchan->initialized = 0;
+       rchan->finalized = 0;
+       /* Resize state; callers wanting resizing must set min/max afterwards */
+       rchan->resize_min = rchan->resize_max = 0;
+       rchan->resizing = 0;
+       rchan->replace_buffer = 0;
+       rchan->resize_buf = NULL;
+       rchan->resize_buf_size = 0;
+       rchan->resize_alloc_size = 0;
+       rchan->resize_n_bufs = 0;
+       rchan->resize_err = 0;
+       rchan->resize_failures = 0;
+       rchan->resize_order = 0;
+
+       rchan->expand_page_array = NULL;
+       rchan->expand_page_count = 0;
+       rchan->shrink_page_array = NULL;
+       rchan->shrink_page_count = 0;
+       rchan->resize_page_array = NULL;
+       rchan->resize_page_count = 0;
+       rchan->old_buf_page_array = NULL;
+       rchan->expand_buf_id = 0;
+
+       /* Work items get their function via PREPARE_WORK at queue time */
+       INIT_WORK(&rchan->wake_readers, NULL, NULL);
+       INIT_WORK(&rchan->wake_writers, NULL, NULL);
+
+       for (i = 0; i < RELAY_MAX_BUFS; i++)
+               rchan->unused_bytes[i] = 0;
+       
+       rchan->relay_ops->reset(rchan, init);
+}
+
+/**
+ *     relay_reset - reset the channel
+ *     @rchan_id: the channel id
+ *
+ *     Returns 0 if successful, -EBADF if no channel has that id.
+ *
+ *     This has the effect of erasing all data from the buffer and
+ *     restarting the channel in its initial state.  The buffer itself
+ *     is not freed, so any mappings are still in effect.
+ *
+ *     NOTE: Care should be taken that the channel isn't actually
+ *     being used by anything when this call is made.
+ */
+int
+relay_reset(int rchan_id)
+{
+       struct rchan *rchan;
+
+       /* Hold a reference for the duration of the reset */
+       rchan = rchan_get(rchan_id);
+       if (rchan == NULL)
+               return -EBADF;
+
+       __relay_reset(rchan, 0);
+       update_readers_consumed(rchan, 0, 0);
+
+       rchan_put(rchan);
+
+       return 0;
+}
+
+/**
+ *     check_init_buf - validate a caller-supplied initial buffer
+ *     @init_buf: the initbuf, or NULL if none was supplied
+ *     @init_buf_size: the total initbuf size
+ *     @bufsize: the channel's sub-buffer size
+ *     @nbufs: the number of sub-buffers in the channel
+ *
+ *     With no init_buf there is nothing to check.  Otherwise the channel
+ *     must have more than one sub-buffer and @bufsize * @nbufs must equal
+ *     @init_buf_size exactly.
+ *
+ *     Returns 0 if ok, -EINVAL otherwise.
+ */
+static int
+check_init_buf(char *init_buf, u32 init_buf_size, u32 bufsize, u32 nbufs)
+{
+       if (init_buf == NULL)
+               return 0;
+
+       /* 1 sub-buffer makes no sense; sizes must match exactly */
+       if (nbufs == 1 || bufsize * nbufs != init_buf_size)
+               return -EINVAL;
+
+       return 0;
+}
+
+/**
+ *     rchan_create_buf - allocate a channel buffer and attach it
+ *     @rchan: the channel
+ *     @size_alloc: the total size of the channel buffer
+ *
+ *     Stores the result of alloc_rchan_buf() in rchan->buf together with
+ *     the page array and page count.  On allocation failure rchan->buf is
+ *     left NULL and the page array/count are cleared.
+ *
+ *     Returns 0 if successful, -ENOMEM otherwise.
+ */
+static inline int
+rchan_create_buf(struct rchan *rchan, int size_alloc)
+{
+       struct page **pages;
+       int n_pages;
+       char *new_buf;
+
+       new_buf = (char *)alloc_rchan_buf(size_alloc, &pages, &n_pages);
+       rchan->buf = new_buf;
+
+       if (new_buf != NULL) {
+               rchan->buf_page_array = pages;
+               rchan->buf_page_count = n_pages;
+               return 0;
+       }
+
+       rchan->buf_page_array = NULL;
+       rchan->buf_page_count = 0;
+       return -ENOMEM;
+}
+
+/**
+ *     rchan_create - allocate and initialize a channel, including buffer
+ *     @chanpath: path specifying the relayfs channel file to create
+ *             (not used by this function)
+ *     @bufsize: the size of the sub-buffers within the channel buffer
+ *     @nbufs: the number of sub-buffers within the channel buffer
+ *     @rchan_flags: flags specifying buffer attributes
+ *     @init_buf: caller-supplied initial buffer, NULL to allocate one
+ *     @init_buf_size: size of @init_buf; must equal @bufsize * @nbufs
+ *     @err: err code
+ *
+ *     Returns channel if successful, NULL otherwise, err receives errcode.
+ *
+ *     Allocates a struct rchan representing a relay channel, according
+ *     to the attributes passed in via rchan_flags.  Does some basic sanity
+ *     checking but doesn't try to do anything smart.  In particular, the
+ *     number of buffers must be a power of 2, and if the lockless scheme
+ *     is being used, the sub-buffer size must also be a power of 2.  The
+ *     locking scheme can use buffers of any size.
+ *
+ *     A channel with exactly one sub-buffer of positive size bypasses the
+ *     power-of-2 and range checks and is allocated at exactly @bufsize.
+ */
+static struct rchan *
+rchan_create(const char *chanpath, 
+            int bufsize, 
+            int nbufs, 
+            u32 rchan_flags,
+            char *init_buf,
+            u32 init_buf_size,
+            int *err)
+{
+       int size_alloc;
+       struct rchan *rchan = NULL;
+
+       *err = 0;
+
+       rchan = (struct rchan *)kmalloc(sizeof(struct rchan), GFP_KERNEL);
+       if (rchan == NULL) {
+               *err = -ENOMEM;
+               return NULL;
+       }
+       /* The exit path and rchan_destroy_buf() rely on these being NULL */
+       rchan->buf = rchan->init_buf = NULL;
+
+       *err = check_init_buf(init_buf, init_buf_size, bufsize, nbufs);
+       if (*err)
+               goto exit;
+       
+       /*
+        * Single sub-buffer shortcut.  The size must be strictly positive:
+        * a negative bufsize would otherwise skip the checks below and be
+        * handed to the allocator as the allocation size.  Non-positive
+        * sizes fall through and are rejected with -EINVAL.
+        */
+       if (nbufs == 1 && bufsize > 0) {
+               rchan->n_bufs = nbufs;
+               rchan->buf_size = bufsize;
+               size_alloc = bufsize;
+               goto alloc;
+       }
+       
+       if (bufsize <= 0 ||
+           (rchan_flags & RELAY_SCHEME_LOCKLESS && hweight32(bufsize) != 1) ||
+           hweight32(nbufs) != 1 ||
+           nbufs < RELAY_MIN_BUFS ||
+           nbufs > RELAY_MAX_BUFS) {
+               *err = -EINVAL;
+               goto exit;
+       }
+
+       /* NOTE(review): bufsize * nbufs is not checked for int overflow */
+       size_alloc = FIX_SIZE(bufsize * nbufs);
+       if (size_alloc > RELAY_MAX_BUF_SIZE) {
+               *err = -EINVAL;
+               goto exit;
+       }
+       rchan->n_bufs = nbufs;
+       rchan->buf_size = bufsize;
+
+       if (rchan_flags & RELAY_SCHEME_LOCKLESS) {
+               /* Both are powers of 2 here, so ffs() - 1 is log2 */
+               offset_bits(rchan) = ffs(bufsize) - 1;
+               offset_mask(rchan) =  RELAY_BUF_OFFSET_MASK(offset_bits(rchan));
+               bufno_bits(rchan) = ffs(nbufs) - 1;
+       }
+alloc:
+       if (rchan_alloc_id(rchan) == -1) {
+               *err = -ENOMEM;
+               goto exit;
+       }
+
+       if (init_buf == NULL) {
+               *err = rchan_create_buf(rchan, size_alloc);
+               if (*err) {
+                       /* Give the id back before the channel is freed */
+                       rchan_free_id(rchan->id);
+                       goto exit;
+               }
+       } else
+               rchan->buf = rchan->init_buf = init_buf;
+       
+       rchan->alloc_size = size_alloc;
+
+       if (rchan_flags & RELAY_SCHEME_LOCKLESS)
+               rchan->relay_ops = &lockless_ops;
+       else
+               rchan->relay_ops = &locking_ops;
+
+exit:
+       if (*err) {
+               kfree(rchan);
+               rchan = NULL;
+       }
+
+       return rchan;
+}
+
+
+/*
+ * Scratch buffer holding one path component at a time while
+ * rchan_create_dir() walks a channel path.
+ * NOTE(review): file-scope and shared; no lock is taken around its use
+ * in rchan_create_dir() -- confirm callers are serialized.
+ */
+static char tmpname[NAME_MAX];
+
+/**
+ *     rchan_create_dir - create directory for file
+ *     @chanpath: path to file, including filename
+ *     @residual: filename remaining after parse
+ *     @topdir: the directory filename should be created in
+ *
+ *     Returns 0 if successful, negative otherwise.  -EEXIST may be
+ *     returned when the last directory component already existed; the
+ *     outputs are valid in that case.  -ENAMETOOLONG is returned, with
+ *     the outputs untouched, if a directory component does not fit in
+ *     the component scratch buffer.
+ *
+ *     Inspired by xlate_proc_name() in procfs.  Given a file path which
+ *     includes the filename, creates any and all directories necessary 
+ *     to create the file.
+ */
+static int 
+rchan_create_dir(const char * chanpath, 
+                const char **residual, 
+                struct dentry **topdir)
+{
+       const char *cp = chanpath, *next;
+       struct dentry *parent = NULL;
+       int len, err = 0;
+       
+       while (1) {
+               next = strchr(cp, '/');
+               if (!next)
+                       break;
+
+               len = next - cp;
+
+               /*
+                * The component plus its terminating NUL must fit in
+                * tmpname; without this check a long component would
+                * write past the end of the static buffer.
+                */
+               if (len >= (int)sizeof(tmpname))
+                       return -ENAMETOOLONG;
+
+               memcpy(tmpname, cp, len);
+               tmpname[len] = '\0';
+               /* Each created directory becomes the parent of the next */
+               err = relayfs_create_dir(tmpname, parent, &parent);
+               if (err && (err != -EEXIST))
+                       return err;
+               cp += len + 1;
+       }
+
+       *residual = cp;
+       *topdir = parent;
+
+       return err;
+}
+
+/**
+ *     rchan_create_file - create the channel file and any parent directories
+ *     @chanpath: path to file, including filename
+ *     @dentry: receives the dentry of the created file
+ *     @data: channel to associate with the file
+ *     @mode: permissions passed on to relayfs_create_file()
+ *
+ *     The directory part of @chanpath is created first; an already
+ *     existing directory (-EEXIST) is not treated as an error.
+ *
+ *     Returns 0 if successful, negative otherwise.
+ */
+static int 
+rchan_create_file(const char * chanpath, 
+                 struct dentry **dentry, 
+                 struct rchan * data,
+                 int mode)
+{
+       const char *leaf;
+       struct dentry *parent_dir;
+       int rc;
+
+       rc = rchan_create_dir(chanpath, &leaf, &parent_dir);
+       if (rc != 0 && rc != -EEXIST)
+               return rc;
+
+       return relayfs_create_file(leaf, parent_dir, dentry, (void *)data, mode);
+}
+
+/**
+ *     relay_open - create a new file/channel buffer in relayfs
+ *     @chanpath: name of file to create, including path
+ *     @bufsize: size of sub-buffers
+ *     @nbufs: number of sub-buffers
+ *     @flags: channel attributes
+ *     @callbacks: client callback functions
+ *     @start_reserve: number of bytes to reserve at start of each sub-buffer
+ *     @end_reserve: number of bytes to reserve at end of each sub-buffer
+ *     @rchan_start_reserve: additional reserve at start of first sub-buffer
+ *     @resize_min: minimum total buffer size, if set
+ *     @resize_max: maximum total buffer size, if set
+ *     @mode: the perms to be given to the relayfs file, 0 to accept defaults
+ *     @init_buf: initial memory buffer to start out with, NULL if N/A
+ *     @init_buf_size: initial memory buffer size to start out with, 0 if N/A
+ *
+ *     Returns channel id if successful, negative otherwise.
+ *
+ *     Creates a relay channel using the sizes and attributes specified.
+ *     The default permissions, used if mode == 0 are S_IRUSR | S_IWUSR.  See
+ *     Documentation/filesystems/relayfs.txt for details.
+ */
+int
+relay_open(const char *chanpath,
+          int bufsize,
+          int nbufs,
+          u32 flags,
+          struct rchan_callbacks *channel_callbacks,
+          u32 start_reserve,
+          u32 end_reserve,
+          u32 rchan_start_reserve,
+          u32 resize_min,
+          u32 resize_max,
+          int mode,
+          char *init_buf,
+          u32 init_buf_size)
+{
+       int err;
+       struct rchan *rchan;
+       struct dentry *dentry;
+       struct rchan_callbacks *callbacks = NULL;
+
+       if (chanpath == NULL)
+               return -EINVAL;
+
+       /* Attribute flags are not validated for single-buffer channels */
+       if (nbufs != 1) {
+               err = check_attribute_flags(&flags, resize_min ? 1 : 0);
+               if (err)
+                       return err;
+       }
+
+       rchan = rchan_create(chanpath, bufsize, nbufs, flags, init_buf, init_buf_size, &err);
+
+       if (err < 0)
+               return err;
+
+       /* Create file in fs */
+       if ((err = rchan_create_file(chanpath, &dentry, rchan, mode)) < 0) {
+               /* Undo rchan_create(): buffer, id slot, then the channel */
+               rchan_destroy_buf(rchan);
+               rchan_free_id(rchan->id);
+               kfree(rchan);
+               return err;
+       }
+
+       rchan->dentry = dentry;
+
+       if (channel_callbacks == NULL)
+               callbacks = &default_channel_callbacks;
+       else
+               callbacks = channel_callbacks;
+
+       /*
+        * Fill any callback the client left NULL with the default.  Note
+        * that this writes into the client's own callback struct.
+        */
+       if (callbacks->buffer_end == NULL)
+               callbacks->buffer_end = buffer_end_default_callback;
+       if (callbacks->buffer_start == NULL)
+               callbacks->buffer_start = buffer_start_default_callback;
+       if (callbacks->deliver == NULL)
+               callbacks->deliver = deliver_default_callback;
+       if (callbacks->user_deliver == NULL)
+               callbacks->user_deliver = user_deliver_default_callback;
+       if (callbacks->needs_resize == NULL)
+               callbacks->needs_resize = needs_resize_default_callback;
+       if (callbacks->fileop_notify == NULL)
+               callbacks->fileop_notify = fileop_notify_default_callback;
+       if (callbacks->ioctl == NULL)
+               callbacks->ioctl = ioctl_default_callback;
+       rchan->callbacks = callbacks;
+
+       /* Just to let the client know the sizes used */
+       rchan->callbacks->needs_resize(rchan->id,
+                                      RELAY_RESIZE_REPLACED,
+                                      rchan->buf_size,
+                                      rchan->n_bufs);
+
+       rchan->flags = flags;
+       rchan->start_reserve = start_reserve;
+       rchan->end_reserve = end_reserve;
+       rchan->rchan_start_reserve = rchan_start_reserve;
+
+       /* First-time init; this also zeroes refcount and resize_min/max */
+       __relay_reset(rchan, 1);
+
+       /* Resizing is enabled only when both limits are set and sane */
+       if (resize_min > 0 && resize_max > 0 && 
+          resize_max < RELAY_MAX_TOTAL_BUF_SIZE) {
+               rchan->resize_min = resize_min;
+               rchan->resize_max = resize_max;
+               init_shrink_timer(rchan);
+       }
+
+       /*
+        * Take the reference that keeps the open channel alive
+        * (presumably the one relay_close() later drops -- confirm).
+        */
+       rchan_get(rchan->id);
+
+       return rchan->id;
+}
+
+/**
+ *     relay_discard_init_buf - alloc channel buffer and copy init_buf into it
+ *     @rchan_id: the channel id
+ *
+ *     Returns 0 if successful, negative otherwise: -EBADF if no channel
+ *     has that id, -EINVAL if the channel has no init_buf, -ENOMEM if the
+ *     replacement buffer cannot be allocated.  On failure the channel
+ *     keeps using its init_buf.
+ *
+ *     NOTE: May sleep.  Should also be called only when the channel isn't
+ *     actively being written into.
+ */
+int
+relay_discard_init_buf(int rchan_id)
+{
+       struct rchan *rchan;
+       int err = 0;
+       
+       rchan = rchan_get(rchan_id);
+       if (rchan == NULL)
+               return -EBADF;
+
+       if (rchan->init_buf == NULL) {
+               err = -EINVAL;
+               goto out;
+       }
+       
+       err = rchan_create_buf(rchan, rchan->alloc_size);
+       if (err) {
+               /*
+                * rchan_create_buf() stores its result in rchan->buf even
+                * when the allocation fails, which leaves buf NULL.  Point
+                * it back at init_buf so the channel stays usable.
+                */
+               rchan->buf = rchan->init_buf;
+               goto out;
+       }
+       
+       memcpy(rchan->buf, rchan->init_buf, rchan->n_bufs * rchan->buf_size);
+       /* From here on the buffer is relayfs-owned and will be freed by it */
+       rchan->init_buf = NULL;
+out:
+       rchan_put(rchan);
+       
+       return err;
+}
+
+/**
+ *     relay_finalize - perform end-of-buffer processing for last buffer
+ *     @rchan_id: the channel id
+ *
+ *     Runs the scheme's finalize operation once per channel (guarded by
+ *     rchan->finalized) and schedules a wakeup of any waiting readers.
+ *
+ *     Returns 0 if successful, -EBADF if no channel has that id.
+ */
+static int 
+relay_finalize(int rchan_id)
+{
+       struct rchan *rchan = rchan_get(rchan_id);
+       if (rchan == NULL)
+               return -EBADF;
+
+       if (rchan->finalized == 0) {
+               rchan->relay_ops->finalize(rchan);
+               rchan->finalized = 1;
+       }
+
+       /* Deferred wakeup, same mechanism as in relay_commit() */
+       if (waitqueue_active(&rchan->read_wait)) {
+               PREPARE_WORK(&rchan->wake_readers, wakeup_readers, rchan);
+               schedule_delayed_work(&rchan->wake_readers, 1);
+       }
+
+       rchan_put(rchan);
+
+       return 0;
+}
+
+/**
+ *     restore_callbacks - restore default channel callbacks
+ *     @rchan: the channel
+ *
+ *     Restore callbacks to the default versions.
+ */
+static inline void
+restore_callbacks(struct rchan *rchan)
+{
+       /* Already using the default callbacks: leave the pointer untouched */
+       if (rchan->callbacks == &default_channel_callbacks)
+               return;
+
+       rchan->callbacks = &default_channel_callbacks;
+}
+
+/**
+ *     relay_close - close the channel
+ *     @rchan_id: relay channel id
+ *     
+ *     Finalizes the last sub-buffer and marks the channel as finalized.
+ *     The channel buffer and channel data structure are then freed
+ *     automatically when the last reference to the channel is given up.
+ */
+int 
+relay_close(int rchan_id)
+{
+       struct rchan *chan;
+       int ret;
+
+       /* Reject ids that cannot index rchan_table */
+       if (rchan_id < 0 || rchan_id >= RELAY_MAX_CHANNELS)
+               return -EBADF;
+
+       ret = relay_finalize(rchan_id);
+       if (ret)
+               return ret;
+
+       read_lock(&rchan_table_lock);
+       chan = rchan_table[rchan_id];
+       read_unlock(&rchan_table_lock);
+
+       if (chan == NULL)
+               return 0;
+
+       /* Stop calling into the client and stop the shrink timer, if armed */
+       restore_callbacks(chan);
+       if (chan->resize_min)
+               del_timer(&chan->shrink_timer);
+
+       /*
+        * Drop a channel reference; presumably the one taken when the
+        * channel was opened - TODO confirm against relay_open().
+        */
+       rchan_put(chan);
+
+       return 0;
+}
+
+/**
+ *     relay_write - reserve a slot in the channel and write data into it
+ *     @rchan_id: relay channel id
+ *     @data_ptr: data to be written into reserved slot
+ *     @count: number of bytes to write
+ *     @td_offset: optional offset where time delta should be written
+ *     @wrote_pos: optional ptr returning buf pos written to, ignored if NULL 
+ *
+ *     Returns the number of bytes written, 0 or negative on failure.
+ *
+ *     Reserves space in the channel and writes count bytes of data_ptr
+ *     to it.  Automatically performs any necessary locking, depending
+ *     on the scheme and SMP usage in effect (no locking is done for the
+ *     lockless scheme regardless of usage). 
+ *
+ *     If td_offset is >= 0, the internal time delta calculated when
+ *     slot was reserved will be written at that offset.
+ *
+ *     If wrote_pos is non-NULL, it will receive the location the data
+ *     was written to, which may be needed for some applications but is not
+ *     normally interesting.
+ */
+int
+relay_write(int rchan_id, 
+           const void *data_ptr, 
+           size_t count,
+           int td_offset,
+           void **wrote_pos)
+{
+       unsigned long flags;
+       char *reserved, *write_pos;
+       int bytes_written = 0;
+       int reserve_code, interrupting;
+       struct timeval ts;
+       u32 td;
+       struct rchan *rchan;
+       
+       rchan = rchan_get(rchan_id);
+       if (rchan == NULL)
+               return -EBADF;
+
+       relay_lock_channel(rchan, flags); /* nop for lockless */
+
+       /*
+        * write_pos is a scratch copy handed to relay_write_direct();
+        * reserved keeps the start of the slot for the time-delta store,
+        * the commit and the value reported through wrote_pos.
+        */
+       write_pos = reserved = relay_reserve(rchan, count, &ts, &td, 
+                                            &reserve_code, &interrupting);
+
+       if (reserved != NULL) {
+               relay_write_direct(write_pos, data_ptr, count);
+               /*
+                * count is unsigned, so "count - sizeof(td)" wraps to a huge
+                * value when count < sizeof(td) and the bounds check would
+                * pass, storing td beyond the reserved slot.  Require the
+                * slot to be large enough before doing the subtraction.
+                */
+               if ((td_offset >= 0) && (count >= sizeof(td)) &&
+                   ((size_t)td_offset < count - sizeof(td)))
+                       *((u32 *)(reserved + td_offset)) = td;
+               bytes_written = count;
+       } else if (reserve_code == RELAY_WRITE_TOO_LONG)
+               bytes_written = -EINVAL;
+
+       /* Only a successful write is committed to the channel */
+       if (bytes_written > 0)
+               relay_commit(rchan, reserved, bytes_written, reserve_code, interrupting);
+
+       relay_unlock_channel(rchan, flags); /* nop for lockless */
+
+       rchan_put(rchan);
+
+       /* reserved is NULL here when nothing could be reserved */
+       if (wrote_pos)
+               *wrote_pos = reserved;
+       
+       return bytes_written;
+}
+
+/**
+ *     wakeup_writers - wake up VFS writers waiting on a channel
+ *     @private: the channel
+ *
+ *     This is the work function used to defer writer waking.  The
+ *     reason waking is deferred is that calling directly from 
+ *     buffers_consumed causes problems if you're writing from say 
+ *     the scheduler.
+ */
+static void 
+wakeup_writers(void *private)
+{
+       /* The work item's private data is the channel itself */
+       struct rchan *chan = private;
+
+       wake_up_interruptible(&chan->write_wait);
+}
+
+
+/**
+ *     __relay_buffers_consumed - internal version of relay_buffers_consumed
+ *     @rchan: the relay channel
+ *     @bufs_consumed: number of buffers to add to current count for channel
+ *     
+ *     Internal - updates the channel's consumed buffer count.
+ */
+static void
+__relay_buffers_consumed(struct rchan *rchan, u32 bufs_consumed)
+{
+       rchan->bufs_consumed += bufs_consumed;
+       
+       /* The consumed count is never allowed to exceed the produced count */
+       if (rchan->bufs_consumed > rchan->bufs_produced)
+               rchan->bufs_consumed = rchan->bufs_produced;
+       
+       /* Clear the channel's suspended flag now that buffers were consumed */
+       atomic_set(&rchan->suspended, 0);
+
+       /*
+        * Waking writers is deferred to the wake_writers work item (delay
+        * argument of 1) instead of being done directly from here.
+        */
+       PREPARE_WORK(&rchan->wake_writers, wakeup_writers, rchan);
+       schedule_delayed_work(&rchan->wake_writers, 1);
+}
+
+/**
+ *     __reader_buffers_consumed - update reader/channel consumed buffer count
+ *     @reader: channel reader
+ *     @bufs_consumed: number of buffers to add to current count for channel
+ *     
+ *     Internal - updates the reader's consumed buffer count.  If the reader's
+ *     resulting total is greater than the channel's, update the channel's.
+*/
+static void
+__reader_buffers_consumed(struct rchan_reader *reader, u32 bufs_consumed)
+{
+       struct rchan *chan = reader->rchan;
+
+       reader->bufs_consumed += bufs_consumed;
+
+       /* The channel total only moves once this reader has pulled ahead of it */
+       if (reader->bufs_consumed <= chan->bufs_consumed)
+               return;
+
+       __relay_buffers_consumed(chan, bufs_consumed);
+}
+
+/**
+ *     relay_buffers_consumed - add to the # buffers consumed for the channel
+ *     @reader: channel reader
+ *     @bufs_consumed: number of buffers to add to current count for channel
+ *     
+ *     Adds to the channel's consumed buffer count.  buffers_consumed should
+ *     be the number of buffers newly consumed, not the total number consumed.
+ *
+ *     NOTE: kernel clients don't need to call this function if the reader
+ *     is auto-consuming or the channel is MODE_CONTINUOUS.
+ */
+void 
+relay_buffers_consumed(struct rchan_reader *reader, u32 bufs_consumed)
+{
+       /* Silently ignore a missing reader or a reader without a channel */
+       if (!reader || !reader->rchan)
+               return;
+
+       __reader_buffers_consumed(reader, bufs_consumed);
+}
+
+/**
+ *     __relay_bytes_consumed - internal version of relay_bytes_consumed 
+ *     @rchan: the relay channel
+ *     @bytes_consumed: number of bytes to add to current count for channel
+ *     @read_offset: where the bytes were consumed from
+ *     
+ *     Internal - updates the channel's consumed count.
+*/
+static void
+__relay_bytes_consumed(struct rchan *rchan, u32 bytes_consumed, u32 read_offset)
+{
+       u32 buf_idx;
+       u32 slack;
+
+       /* Sub-buffer the bytes came from, clamped to the last valid index */
+       buf_idx = read_offset / rchan->buf_size;
+       if (buf_idx >= rchan->n_bufs)
+               buf_idx = rchan->n_bufs - 1;
+
+       rchan->bytes_consumed += bytes_consumed;
+
+       /*
+        * Once consumed bytes plus that sub-buffer's unused bytes cover a
+        * whole sub-buffer, count it as one consumed buffer and start over.
+        */
+       slack = rchan->unused_bytes[buf_idx];
+       if (rchan->bytes_consumed + slack < rchan->buf_size)
+               return;
+
+       __relay_buffers_consumed(rchan, 1);
+       rchan->bytes_consumed = 0;
+}
+
+/**
+ *     __reader_bytes_consumed - update reader/channel consumed count
+ *     @reader: channel reader
+ *     @bytes_consumed: number of bytes to add to current count for channel
+ *     @read_offset: where the bytes were consumed from
+ *     
+ *     Internal - updates the reader's consumed count.  If the reader's
+ *     resulting total is greater than the channel's, update the channel's.
+*/
+static void
+__reader_bytes_consumed(struct rchan_reader *reader, u32 bytes_consumed, u32 read_offset)
+{
+       struct rchan *chan = reader->rchan;
+       u32 buf_idx;
+       u32 slack;
+       int reader_ahead;
+
+       /* Sub-buffer the bytes came from, clamped to the last valid index */
+       buf_idx = read_offset / chan->buf_size;
+       if (buf_idx >= chan->n_bufs)
+               buf_idx = chan->n_bufs - 1;
+
+       reader->bytes_consumed += bytes_consumed;
+
+       /* A fully covered sub-buffer rolls over into the reader's buffer count */
+       slack = chan->unused_bytes[buf_idx];
+       if (reader->bytes_consumed + slack >= chan->buf_size) {
+               reader->bufs_consumed++;
+               reader->bytes_consumed = 0;
+       }
+
+       /*
+        * The reader is ahead of the channel if it has consumed more buffers,
+        * or the same number of buffers but more bytes of the current one.
+        */
+       if (reader->bufs_consumed > chan->bufs_consumed)
+               reader_ahead = 1;
+       else if (reader->bufs_consumed == chan->bufs_consumed)
+               reader_ahead = (reader->bytes_consumed > chan->bytes_consumed);
+       else
+               reader_ahead = 0;
+
+       if (reader_ahead)
+               __relay_bytes_consumed(chan, bytes_consumed, read_offset);
+}
+
+/**
+ *     relay_bytes_consumed - add to the # bytes consumed for the channel
+ *     @reader: channel reader
+ *     @bytes_consumed: number of bytes to add to current count for channel
+ *     @read_offset: where the bytes were consumed from
+ *     
+ *     Adds to the channel's consumed count.  bytes_consumed should be the
+ *     number of bytes actually read e.g. return value of relay_read() and
+ *     the read_offset should be the actual offset the bytes were read from
+ *     e.g. the actual_read_offset set by relay_read(). See
+ *     Documentation/filesystems/relayfs.txt for more details.
+ *
+ *     NOTE: kernel clients don't need to call this function if the reader
+ *     is auto-consuming or the channel is MODE_CONTINUOUS.
+ */
+void
+relay_bytes_consumed(struct rchan_reader *reader, u32 bytes_consumed, u32 read_offset)
+{
+       /* Silently ignore a missing reader or a reader without a channel */
+       if (!reader || !reader->rchan)
+               return;
+
+       __reader_bytes_consumed(reader, bytes_consumed, read_offset);
+}
+
+/**
+ *     update_readers_consumed - apply consumed counts to all open readers
+ *     @rchan: the channel
+ *     @bufs_consumed: value each reader's consumed-buffer count is set to
+ *     @bytes_consumed: value each reader's consumed-byte count is set to
+ *
+ *     Overwrites the consumed counts of every reader open on the channel,
+ *     rewinds each reader's position to 0 and flags its offset as changed.
+ */
+void
+update_readers_consumed(struct rchan *rchan, u32 bufs_consumed, u32 bytes_consumed)
+{
+       struct list_head *pos;
+       struct rchan_reader *rdr;
+
+       read_lock(&rchan->open_readers_lock);
+       list_for_each(pos, &rchan->open_readers) {
+               rdr = list_entry(pos, struct rchan_reader, list);
+
+               rdr->bufs_consumed = bufs_consumed;
+               rdr->bytes_consumed = bytes_consumed;
+
+               /* Non-VFS readers keep their own position; VFS readers use the file's */
+               if (!rdr->vfs_reader)
+                       rdr->pos.f_pos = 0;
+               else
+                       rdr->pos.file->f_pos = 0;
+
+               /* Tested (and cleared) by __relay_read() after it wakes up */
+               rdr->offset_changed = 1;
+       }
+       read_unlock(&rchan->open_readers_lock);
+}
+
+/**
+ *     do_read - utility function to do the actual read to user
+ *     @rchan: the channel
+ *     @buf: user buf to read into, NULL if just getting info
+ *     @count: bytes requested
+ *     @read_offset: offset into channel
+ *     @new_offset: new offset into channel after read
+ *     @actual_read_offset: read offset actually used
+ *
+ *     Returns the number of bytes read, 0 if none.
+ */
+static ssize_t
+do_read(struct rchan *rchan, char *buf, size_t count, u32 read_offset, u32 *new_offset, u32 *actual_read_offset)
+{
+       u32 read_bufno, cur_bufno;
+       u32 avail_offset, cur_idx, max_offset, buf_end_offset;
+       u32 avail_count, buf_size;
+       int unused_bytes = 0;
+       size_t read_count = 0;
+       u32 last_buf_byte_offset;
+
+       /* Default to the caller's offset; overwritten below if we step ahead */
+       *actual_read_offset = read_offset;
+       
+       buf_size = rchan->buf_size;
+       if (unlikely(!buf_size)) BUG();
+
+       /* Sub-buffer holding read_offset and its recorded unused byte count */
+       read_bufno = read_offset / buf_size;
+       if (unlikely(read_bufno >= RELAY_MAX_BUFS)) BUG();
+       unused_bytes = rchan->unused_bytes[read_bufno];
+
+       /* Current channel offset; max_offset is filled in by the same call */
+       avail_offset = cur_idx = relay_get_offset(rchan, &max_offset);
+
+       if (cur_idx == read_offset) {
+               /*
+                * The read position equals the current channel offset.  If
+                * the channel is flagged suspended, step one byte forward
+                * (wrapping to 0 at max_offset) and carry on; otherwise
+                * there is nothing to read.
+                */
+               if (atomic_read(&rchan->suspended) == 1) {
+                       read_offset += 1;
+                       if (read_offset >= max_offset)
+                               read_offset = 0;
+                       *actual_read_offset = read_offset;
+               } else {
+                       *new_offset = read_offset;
+                       return 0;
+               }
+       } else {
+               /*
+                * The read position is the last byte of its sub-buffer:
+                * unless exactly one byte is recorded as unused there, step
+                * one byte forward (wrapping to 0 at max_offset).
+                */
+               last_buf_byte_offset = (read_bufno + 1) * buf_size - 1;
+               if (read_offset == last_buf_byte_offset) {
+                       if (unused_bytes != 1) {
+                               read_offset += 1;
+                               if (read_offset >= max_offset)
+                                       read_offset = 0;
+                               *actual_read_offset = read_offset;
+                       }
+               }
+       }
+
+       /* read_offset may have moved above, so look the sub-buffer up again */
+       read_bufno = read_offset / buf_size;
+       if (unlikely(read_bufno >= RELAY_MAX_BUFS)) BUG();
+       unused_bytes = rchan->unused_bytes[read_bufno];
+
+       cur_bufno = cur_idx / buf_size;
+
+       /*
+        * End of the data in the sub-buffer being read (its end minus its
+        * unused bytes).  The readable range stops there whenever the
+        * current channel offset lies beyond it or behind the read position.
+        */
+       buf_end_offset = (read_bufno + 1) * buf_size - unused_bytes;
+       if (avail_offset > buf_end_offset)
+               avail_offset = buf_end_offset;
+       else if (avail_offset < read_offset)
+               avail_offset = buf_end_offset;
+       avail_count = avail_offset - read_offset;
+       read_count = avail_count >= count ? count : avail_count;
+
+       /*
+        * A NULL buf means "just report how much could be read".
+        * NOTE: a failed copy returns -EFAULT and leaves *new_offset unset.
+        */
+       if (read_count && buf != NULL)
+               if (copy_to_user(buf, rchan->buf + read_offset, read_count))
+                       return -EFAULT;
+
+       /*
+        * Reading from the same sub-buffer the current channel offset is in:
+        * if the read reached the end of that sub-buffer's data without
+        * passing cur_idx, the next read position is cur_idx.
+        */
+       if (read_bufno == cur_bufno)
+               if (read_count && (read_offset + read_count >= buf_end_offset) && (read_offset + read_count <= cur_idx)) {
+                       *new_offset = cur_idx;
+                       return read_count;
+               }
+
+       /*
+        * Next read position: wrap to 0 when it would pass max_offset, skip
+        * the unused bytes when the end of the sub-buffer's data was reached,
+        * otherwise simply advance by the amount read.
+        */
+       if (read_offset + read_count + unused_bytes > max_offset)
+               *new_offset = 0;
+       else if (read_offset + read_count >= buf_end_offset)
+               *new_offset = read_offset + read_count + unused_bytes;
+       else
+               *new_offset = read_offset + read_count;
+
+       return read_count;
+}
+
+/**
+ *     __relay_read - read bytes from channel, relative to current reader pos
+ *     @reader: channel reader
+ *     @buf: user buf to read into, NULL if just getting info
+ *     @count: bytes requested
+ *     @read_offset: offset into channel
+ *     @new_offset: new offset into channel after read
+ *     @actual_read_offset: read offset actually used
+ *     @wait: if non-zero, wait for something to read
+ *
+ *     Internal - see relay_read() for details.
+ *
+ *     Returns the number of bytes read, 0 if none, negative on failure.
+ */
+static ssize_t
+__relay_read(struct rchan_reader *reader, char *buf, size_t count, u32 read_offset, u32 *new_offset, u32 *actual_read_offset, int wait)
+{
+       int err = 0;
+       struct rchan *rchan = reader->rchan;
+
+       /* Non-blocking callers are told to retry until the channel is initialized */
+       if (!wait && !rchan->initialized)
+               return -EAGAIN;
+
+       if (using_lockless(rchan))
+               read_offset &= idx_mask(rchan);
+
+       /* Offset beyond the channel buffer: restart from 0 on the next call */
+       if (read_offset >= rchan->n_bufs * rchan->buf_size) {
+               *new_offset = 0;
+               if (!wait)
+                       return -EAGAIN;
+               else
+                       return -EINTR;
+       }
+       
+       /* Only a real (buf != NULL) blocking read sleeps for data */
+       if (buf != NULL && wait) {
+               err = wait_event_interruptible(rchan->read_wait,
+                      ((rchan->finalized == 1) ||
+                       (atomic_read(&rchan->suspended) == 1) ||
+                       (relay_get_offset(rchan, NULL) != read_offset)));
+
+               /* NOTE: returns without storing anything in *new_offset */
+               if (rchan->finalized)
+                       return 0;
+
+               /* The reader's position was changed while it slept */
+               if (reader->offset_changed) {
+                       reader->offset_changed = 0;
+                       return -EINTR;
+               }
+               
+               if (err)
+                       return err;
+       }
+
+       /*
+        * do_read() returns the byte count or a negative errno as ssize_t.
+        * Pass it straight through: storing it in a size_t first (as was
+        * done before) made the "< 0" error test always false and left error
+        * propagation to an implicit unsigned-to-signed conversion.
+        */
+       return do_read(rchan, buf, count, read_offset, new_offset, actual_read_offset);
+}
+
+/**
+ *     relay_read - read bytes from channel, relative to current reader pos
+ *     @reader: channel reader
+ *     @buf: user buf to read into, NULL if just getting info
+ *     @count: bytes requested
+ *     @wait: if non-zero, wait for something to read
+ *     @actual_read_offset: set read offset actually used, must not be NULL
+ *
+ *     Reads count bytes from the channel, or as much as is available within
+ *     the sub-buffer currently being read.  The read offset that will be
+ *     read from is the position contained within the reader object.  If the
+ *     wait flag is set, buf is non-NULL, and there is nothing available,
+ *     it will wait until there is.  If the wait flag is 0 and there is
+ *     nothing available, -EAGAIN is returned.  If buf is NULL, the value
+ *     returned is the number of bytes that would have been read.
+ *     actual_read_offset is the value that should be passed as the read
+ *     offset to relay_bytes_consumed, needed only if the reader is not
+ *     auto-consuming and the channel is MODE_NO_OVERWRITE, but in any case,
+ *     it must not be NULL.  See Documentation/filesystems/relayfs.txt for
+ *     more details.
+ */
+ssize_t
+relay_read(struct rchan_reader *reader, char *buf, size_t count, int wait, u32 *actual_read_offset)
+{
+       u32 new_offset;
+       u32 read_offset;
+       ssize_t read_count;
+       
+       if (reader == NULL || reader->rchan == NULL)
+               return -EBADF;
+
+       if (actual_read_offset == NULL)
+               return -EINVAL;
+
+       /* VFS readers track their position in the file, others in the reader */
+       if (reader->vfs_reader)
+               read_offset = (u32)(reader->pos.file->f_pos);
+       else
+               read_offset = reader->pos.f_pos;
+       *actual_read_offset = read_offset;
+
+       /*
+        * __relay_read() can return 0 without storing a new offset (a
+        * blocking read on a finalized channel).  Start from the current
+        * position so that the reader's position is left unchanged in that
+        * case instead of being overwritten with an uninitialized value.
+        */
+       new_offset = read_offset;
+       
+       read_count = __relay_read(reader, buf, count, read_offset,
+                                 &new_offset, actual_read_offset, wait);
+
+       /* On error the reader's position is left untouched */
+       if (read_count < 0)
+               return read_count;
+
+       if (reader->vfs_reader)
+               reader->pos.file->f_pos = new_offset;
+       else
+               reader->pos.f_pos = new_offset;
+
+       /* Auto-consuming readers account for whatever was read or skipped */
+       if (reader->auto_consume && ((read_count) || (new_offset != read_offset)))
+               __reader_bytes_consumed(reader, read_count, *actual_read_offset);
+
+       if (read_count == 0 && !wait)
+               return -EAGAIN;
+       
+       return read_count;
+}
+
+/**
+ *     relay_bytes_avail - number of bytes available in current sub-buffer
+ *     @reader: channel reader
+ *     
+ *     Returns the number of bytes available relative to the reader's
+ *     current read position within the corresponding sub-buffer, 0 if
+ *     there is nothing available.  See Documentation/filesystems/relayfs.txt
+ *     for more details.
+ */
+ssize_t
+relay_bytes_avail(struct rchan_reader *reader)
+{
+       u32 cur_pos;
+       u32 next_pos;
+       u32 used_offset;
+       ssize_t avail;
+
+       if (!reader || !reader->rchan)
+               return -EBADF;
+
+       if (reader->vfs_reader)
+               cur_pos = (u32)reader->pos.file->f_pos;
+       else
+               cur_pos = reader->pos.f_pos;
+       next_pos = cur_pos;
+
+       /* NULL buffer, no waiting: only asks how much could be read */
+       avail = __relay_read(reader, NULL, reader->rchan->buf_size,
+                            cur_pos, &next_pos, &used_offset, 0);
+
+       /* The position would move although nothing is readable: retry */
+       if ((next_pos != cur_pos) && ((avail == -EINTR) || (avail == 0)))
+               return -EAGAIN;
+
+       /* Every other error except -EAGAIN is reported as "nothing available" */
+       if ((avail < 0) && (avail != -EAGAIN))
+               return 0;
+
+       return avail;
+}
+
+/**
+ *     rchan_empty - boolean, is the channel empty wrt reader?
+ *     @reader: channel reader
+ *     
+ *     Returns 1 if the channel is empty, 0 otherwise.
+ */
+int
+rchan_empty(struct rchan_reader *reader)
+{
+       struct rchan *rchan = reader->rchan;
+       u32 pending_bufs;
+       u32 write_idx, bytes_in_cur;
+       int is_mapped;
+
+       /* A suspended channel is never reported as empty */
+       if (atomic_read(&rchan->suspended) == 1)
+               return 0;
+
+       is_mapped = atomic_read(&rchan->mapped);
+
+       /* Mapped + bulk delivery: empty means no unconsumed whole buffers */
+       if (is_mapped && bulk_delivery(rchan)) {
+               pending_bufs = rchan->bufs_produced - rchan->bufs_consumed;
+               return pending_bufs == 0;
+       }
+
+       /*
+        * Mapped + packet delivery: additionally compare the bytes in the
+        * current sub-buffer against the bytes already consumed.
+        */
+       if (is_mapped && packet_delivery(rchan)) {
+               pending_bufs = rchan->bufs_produced - rchan->bufs_consumed;
+               if (pending_bufs)
+                       return 0;
+
+               write_idx = relay_get_offset(rchan, NULL);
+               bytes_in_cur = write_idx % rchan->buf_size;
+               return bytes_in_cur == rchan->bytes_consumed;
+       }
+
+       /* Otherwise fall back to what a read at the reader's position would see */
+       return relay_bytes_avail(reader) == 0;
+}
+
+/**
+ *     rchan_full - boolean, is the channel full wrt consuming reader?
+ *     @reader: channel reader
+ *     
+ *     Returns 1 if the channel is full, 0 otherwise.
+ */
+int
+rchan_full(struct rchan_reader *reader)
+{
+       struct rchan *rchan = reader->rchan;
+       u32 pending_bufs;
+
+       /* Continuous-mode channels are never reported as full */
+       if (mode_continuous(rchan))
+               return 0;
+
+       /* Full once more than n_bufs - 1 produced buffers are unconsumed */
+       pending_bufs = rchan->bufs_produced - rchan->bufs_consumed;
+       if (pending_bufs > rchan->n_bufs - 1)
+               return 1;
+
+       return 0;
+}
+
+/**
+ *     relay_info - get status and other information about a relay channel
+ *     @rchan_id: relay channel id
+ *     @rchan_info: pointer to the rchan_info struct to be filled in
+ *     
+ *     Fills in an rchan_info struct with channel status and attribute 
+ *     information.  See Documentation/filesystems/relayfs.txt for details.
+ *
+ *     Returns 0 if successful, negative otherwise.
+ */
+int 
+relay_info(int rchan_id, struct rchan_info *rchan_info)
+{
+       int bufno;
+       int complete;
+       struct rchan *rchan = rchan_get(rchan_id);
+
+       if (!rchan)
+               return -EBADF;
+
+       /* Copy out the channel's attributes and progress counters */
+       rchan_info->flags = rchan->flags;
+       rchan_info->buf_size = rchan->buf_size;
+       rchan_info->buf_addr = rchan->buf;
+       rchan_info->alloc_size = rchan->alloc_size;
+       rchan_info->n_bufs = rchan->n_bufs;
+       rchan_info->cur_idx = relay_get_offset(rchan, NULL);
+       rchan_info->bufs_produced = rchan->bufs_produced;
+       rchan_info->bufs_consumed = rchan->bufs_consumed;
+       rchan_info->buf_id = rchan->buf_id;
+
+       /* Per-sub-buffer state */
+       for (bufno = 0; bufno < rchan->n_bufs; bufno++) {
+               rchan_info->unused_bytes[bufno] = rchan->unused_bytes[bufno];
+
+               /*
+                * Only the lockless scheme reports completeness: a sub-buffer
+                * is complete when its fill count equals the buffer size.
+                */
+               complete = 0;
+               if (using_lockless(rchan))
+                       complete = (atomic_read(&fill_count(rchan, bufno)) == rchan->buf_size);
+               rchan_info->buffer_complete[bufno] = complete;
+       }
+
+       rchan_put(rchan);
+
+       return 0;
+}
+
+/**
+ *     __add_rchan_reader - creates and adds a reader to a channel
+ *     @rchan: relay channel
+ *     @filp: the file associated with rchan, if applicable
+ *     @auto_consume: boolean, whether reader's reads automatically consume
+ *     @map_reader: boolean, whether reader's reading via a channel mapping
+ *
+ *     Returns a pointer to the reader object created, NULL if the reader
+ *     could not be allocated.
+ *
+ *     Creates and initializes an rchan_reader object for reading the channel.
+ *     If filp is non-NULL, the reader is a VFS reader, otherwise not.
+ *
+ *     If the reader is a map reader, it isn't considered a VFS reader for
+ *     our purposes.  Also, map_readers can't be auto-consuming.
+ *
+ *     Adding a non-map reader also resets the channel's consumed counts so
+ *     that the new reader starts at the oldest buffer it will read.  A map
+ *     reader leaves the channel's counts alone and starts from them.
+ */
+struct rchan_reader *
+__add_rchan_reader(struct rchan *rchan, struct file *filp, int auto_consume, int map_reader)
+{
+       struct rchan_reader *reader;
+       u32 will_read;
+       
+       reader = kmalloc(sizeof(struct rchan_reader), GFP_KERNEL);
+       if (reader == NULL)
+               return NULL;
+
+       write_lock(&rchan->open_readers_lock);
+       reader->rchan = rchan;
+       if (filp) {
+               reader->vfs_reader = 1;
+               reader->pos.file = filp;
+       } else {
+               reader->vfs_reader = 0;
+               reader->pos.f_pos = 0;
+       }
+       reader->map_reader = map_reader;
+       reader->auto_consume = auto_consume;
+       reader->offset_changed = 0;
+
+       if (map_reader) {
+               /*
+                * kmalloc() does not zero memory, so these fields must be
+                * set for map readers too; they were previously left
+                * uninitialized and later used by relay_buffers_consumed().
+                * Start from the channel's current consumed counts.
+                */
+               reader->bufs_consumed = rchan->bufs_consumed;
+               reader->bytes_consumed = rchan->bytes_consumed;
+       } else {
+               /*
+                * Number of already-produced buffers this reader will read:
+                * those in the current pass over the sub-buffers, or all
+                * n_bufs when that is 0 and the channel is suspended.
+                */
+               will_read = rchan->bufs_produced % rchan->n_bufs;
+               if (!will_read && atomic_read(&rchan->suspended))
+                       will_read = rchan->n_bufs;
+               reader->bufs_consumed = rchan->bufs_produced - will_read;
+               rchan->bufs_consumed = reader->bufs_consumed;
+               rchan->bytes_consumed = reader->bytes_consumed = 0;
+       }
+
+       list_add(&reader->list, &rchan->open_readers);
+       write_unlock(&rchan->open_readers_lock);
+
+       return reader;
+}
+
+/**
+ *     add_rchan_reader - create a reader for a channel
+ *     @rchan_id: relay channel handle
+ *     @auto_consume: boolean, whether reader's reads automatically consume
+ *
+ *     Returns a pointer to the reader object created, NULL if unsuccessful
+ *
+ *     Creates and initializes an rchan_reader object for reading the channel.
+ *     This function is useful only for non-VFS readers.
+ */
+struct rchan_reader *
+add_rchan_reader(int rchan_id, int auto_consume)
+{
+       struct rchan_reader *reader;
+       struct rchan *rchan = rchan_get(rchan_id);
+
+       if (rchan == NULL)
+               return NULL;
+
+       reader = __add_rchan_reader(rchan, NULL, auto_consume, 0);
+
+       /*
+        * The channel reference taken above is held on behalf of the reader
+        * and released by remove_rchan_reader().  If no reader could be
+        * created, nobody will ever release it, so drop it here.
+        */
+       if (reader == NULL)
+               rchan_put(rchan);
+
+       return reader;
+}
+
+/**
+ *     add_map_reader - create a map reader for a channel
+ *     @rchan_id: relay channel handle
+ *
+ *     Returns a pointer to the reader object created, NULL if unsuccessful
+ *
+ *     Creates and initializes an rchan_reader object for reading the channel.
+ *     This function is useful only for map readers.
+ */
+struct rchan_reader *
+add_map_reader(int rchan_id)
+{
+       struct rchan_reader *reader;
+       struct rchan *rchan = rchan_get(rchan_id);
+
+       if (rchan == NULL)
+               return NULL;
+
+       /* Map readers are never VFS readers and never auto-consuming */
+       reader = __add_rchan_reader(rchan, NULL, 0, 1);
+
+       /*
+        * The channel reference taken above is held on behalf of the reader
+        * and released by remove_map_reader().  If no reader could be
+        * created, nobody will ever release it, so drop it here.
+        */
+       if (reader == NULL)
+               rchan_put(rchan);
+
+       return reader;
+}
+
+/**
+ *     __remove_rchan_reader - destroy a channel reader
+ *     @reader: channel reader
+ *
+ *     Internal - removes reader from the open readers list, and frees it.
+ */
+void
+__remove_rchan_reader(struct rchan_reader *reader)
+{
+       struct list_head *p;
+       struct rchan_reader *entry;
+       int found = 0;
+       
+       write_lock(&reader->rchan->open_readers_lock);
+       list_for_each(p, &reader->rchan->open_readers) {
+               entry = list_entry(p, struct rchan_reader, list);
+               if (entry == reader) {
+                       list_del(&entry->list);
+                       found = 1;
+                       break;
+               }
+       }
+       write_unlock(&reader->rchan->open_readers_lock);
+
+       /*
+        * Free the reader only if it really was on the channel's list.  The
+        * loop cursor must not be used for this decision: when no entry
+        * matches it is left pointing at the last reader visited, and
+        * freeing that would release an unrelated reader still on the list.
+        */
+       if (found)
+               kfree(reader);
+}
+
+/**
+ *     remove_rchan_reader - destroy a channel reader
+ *     @reader: channel reader
+ *
+ *     Finds and removes the given reader from the channel.  This function
+ *     is useful only for non-VFS readers.
+ *
+ *     Returns 0 if successful, negative otherwise.
+ */
+int 
+remove_rchan_reader(struct rchan_reader *reader)
+{
+       struct rchan *rchan;
+
+       if (reader == NULL)
+               return -EINVAL;
+
+       /*
+        * Remember the channel before the reader is freed, and release the
+        * channel reference only after the reader has been unlinked:
+        * __remove_rchan_reader() still takes the channel's
+        * open_readers_lock, so the channel must stay alive until it
+        * returns.
+        */
+       rchan = reader->rchan;
+       __remove_rchan_reader(reader);
+       rchan_put(rchan);
+
+       return 0;
+}
+
+/**
+ *     remove_map_reader - destroy a map reader
+ *     @reader: channel reader
+ *
+ *     Finds and removes the given map reader from the channel.  This function
+ *     is useful only for map readers.
+ *
+ *     Returns 0 if successful, negative otherwise.
+ */
+int 
+remove_map_reader(struct rchan_reader *reader)
+{
+       /* Map readers are torn down exactly like any other non-VFS reader */
+       return remove_rchan_reader(reader);
+}
+
+EXPORT_SYMBOL(relay_open);
+EXPORT_SYMBOL(relay_close);
+EXPORT_SYMBOL(relay_reset);
+EXPORT_SYMBOL(relay_reserve);
+EXPORT_SYMBOL(relay_commit);
+EXPORT_SYMBOL(relay_read);
+EXPORT_SYMBOL(relay_write);
+EXPORT_SYMBOL(relay_bytes_avail);
+EXPORT_SYMBOL(relay_buffers_consumed);
+EXPORT_SYMBOL(relay_bytes_consumed);
+EXPORT_SYMBOL(relay_info);
+EXPORT_SYMBOL(relay_discard_init_buf);
+
+
diff --git a/include/asm-um/cpufeature.h b/include/asm-um/cpufeature.h
new file mode 100644 (file)
index 0000000..fb7bd42
--- /dev/null
@@ -0,0 +1,6 @@
+#ifndef __UM_CPUFEATURE_H
+#define __UM_CPUFEATURE_H
+
+#include "asm/arch/cpufeature.h"
+
+#endif
diff --git a/include/asm-um/local.h b/include/asm-um/local.h
new file mode 100644 (file)
index 0000000..9a280c5
--- /dev/null
@@ -0,0 +1,6 @@
+#ifndef __UM_LOCAL_H
+#define __UM_LOCAL_H
+
+#include "asm/arch/local.h"
+
+#endif
diff --git a/include/asm-um/module-generic.h b/include/asm-um/module-generic.h
new file mode 100644 (file)
index 0000000..5a265f5
--- /dev/null
@@ -0,0 +1,6 @@
+#ifndef __UM_MODULE_GENERIC_H
+#define __UM_MODULE_GENERIC_H
+
+#include "asm/arch/module.h"
+
+#endif
diff --git a/include/asm-um/sections.h b/include/asm-um/sections.h
new file mode 100644 (file)
index 0000000..6b0231e
--- /dev/null
@@ -0,0 +1,7 @@
+#ifndef _UM_SECTIONS_H
+#define _UM_SECTIONS_H
+
+/* nothing to see, move along */
+#include <asm-generic/sections.h>
+
+#endif
diff --git a/include/linux/relayfs_fs.h b/include/linux/relayfs_fs.h
new file mode 100644 (file)
index 0000000..2c52874
--- /dev/null
@@ -0,0 +1,686 @@
+/*
+ * linux/include/linux/relayfs_fs.h
+ *
+ * Copyright (C) 2002, 2003 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
+ * Copyright (C) 1999, 2000, 2001, 2002 - Karim Yaghmour (karim@opersys.com)
+ *
+ * RelayFS definitions and declarations
+ *
+ * Please see Documentation/filesystems/relayfs.txt for more info.
+ */
+
+#ifndef _LINUX_RELAYFS_FS_H
+#define _LINUX_RELAYFS_FS_H
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/list.h>
+#include <linux/fs.h>
+
+/*
+ * Tracks changes to rchan struct
+ */
+#define RELAYFS_CHANNEL_VERSION                1
+
+/*
+ * Maximum number of simultaneously open channels
+ */
+#define RELAY_MAX_CHANNELS             256
+
+/*
+ * Relay properties
+ */
+#define RELAY_MIN_BUFS                 2
+#define RELAY_MIN_BUFSIZE              4096
+#define RELAY_MAX_BUFS                 256
+#define RELAY_MAX_BUF_SIZE             0x1000000
+#define RELAY_MAX_TOTAL_BUF_SIZE       0x8000000
+
+/*
+ * Lockless scheme utility macros
+ */
+#define RELAY_MAX_BUFNO(bufno_bits) (1UL << (bufno_bits))
+#define RELAY_BUF_SIZE(offset_bits) (1UL << (offset_bits))
+#define RELAY_BUF_OFFSET_MASK(offset_bits) (RELAY_BUF_SIZE(offset_bits) - 1)
+#define RELAY_BUFNO_GET(index, offset_bits) ((index) >> (offset_bits))
+#define RELAY_BUF_OFFSET_GET(index, mask) ((index) & (mask))
+#define RELAY_BUF_OFFSET_CLEAR(index, mask) ((index) & ~(mask))
+
+/*
+ * Flags returned by relay_reserve()
+ */
+#define RELAY_BUFFER_SWITCH_NONE       0x0
+#define RELAY_WRITE_DISCARD_NONE       0x0
+#define RELAY_BUFFER_SWITCH            0x1
+#define RELAY_WRITE_DISCARD            0x2
+#define RELAY_WRITE_TOO_LONG           0x4
+
+/*
+ * Relay attribute flags
+ */
+#define RELAY_DELIVERY_BULK            0x1
+#define RELAY_DELIVERY_PACKET          0x2
+#define RELAY_SCHEME_LOCKLESS          0x4
+#define RELAY_SCHEME_LOCKING           0x8
+#define RELAY_SCHEME_ANY               0xC
+#define RELAY_TIMESTAMP_TSC            0x10
+#define RELAY_TIMESTAMP_GETTIMEOFDAY   0x20
+#define RELAY_TIMESTAMP_ANY            0x30
+#define RELAY_USAGE_SMP                        0x40
+#define RELAY_USAGE_GLOBAL             0x80
+#define RELAY_MODE_CONTINUOUS          0x100
+#define RELAY_MODE_NO_OVERWRITE                0x200
+
+/*
+ * Flags for needs_resize() callback
+ */
+#define RELAY_RESIZE_NONE      0x0
+#define RELAY_RESIZE_EXPAND    0x1
+#define RELAY_RESIZE_SHRINK    0x2
+#define RELAY_RESIZE_REPLACE   0x4
+#define RELAY_RESIZE_REPLACED  0x8
+
+/*
+ * Values for fileop_notify() callback
+ */
+enum relay_fileop
+{
+       RELAY_FILE_OPEN,
+       RELAY_FILE_CLOSE,
+       RELAY_FILE_MAP,
+       RELAY_FILE_UNMAP
+};
+
+/*
+ * Data structure returned by relay_info()
+ */
+struct rchan_info
+{
+       u32 flags;              /* relay attribute flags for channel */
+       u32 buf_size;           /* channel's sub-buffer size */
+       char *buf_addr;         /* address of channel start */
+       u32 alloc_size;         /* total buffer size actually allocated */
+       u32 n_bufs;             /* number of sub-buffers in channel */
+       u32 cur_idx;            /* current write index into channel */
+       u32 bufs_produced;      /* current count of sub-buffers produced */
+       u32 bufs_consumed;      /* current count of sub-buffers consumed */
+       u32 buf_id;             /* buf_id of current sub-buffer */
+       int buffer_complete[RELAY_MAX_BUFS];    /* boolean per sub-buffer */
+       int unused_bytes[RELAY_MAX_BUFS];       /* count per sub-buffer */
+};
+
+/*
+ * Relay channel client callbacks
+ */
+struct rchan_callbacks
+{
+       /*
+        * buffer_start - called at the beginning of a new sub-buffer
+        * @rchan_id: the channel id
+        * @current_write_pos: position in sub-buffer client should write to
+        * @buffer_id: the id of the new sub-buffer
+        * @start_time: the timestamp associated with the start of sub-buffer
+        * @start_tsc: the TSC associated with the timestamp, if using_tsc
+        * @using_tsc: boolean, indicates whether start_tsc is valid
+        *
+        * Return value should be the number of bytes written by the client.
+        *
+        * See Documentation/filesystems/relayfs.txt for details.
+        */
+       int (*buffer_start) (int rchan_id,
+                            char *current_write_pos,
+                            u32 buffer_id,
+                            struct timeval start_time,
+                            u32 start_tsc,
+                            int using_tsc);
+
+       /*
+        * buffer_end - called at the end of a sub-buffer
+        * @rchan_id: the channel id
+        * @current_write_pos: position in sub-buffer of end of data
+        * @end_of_buffer: the position of the end of the sub-buffer
+        * @end_time: the timestamp associated with the end of the sub-buffer
+        * @end_tsc: the TSC associated with the end_time, if using_tsc
+        * @using_tsc: boolean, indicates whether end_tsc is valid
+        *
+        * Return value should be the number of bytes written by the client.
+        *
+        * See Documentation/filesystems/relayfs.txt for details.
+        */
+       int (*buffer_end) (int rchan_id,
+                          char *current_write_pos,
+                          char *end_of_buffer,
+                          struct timeval end_time,
+                          u32 end_tsc,
+                          int using_tsc);
+
+       /*
+        * deliver - called when data is ready for the client
+        * @rchan_id: the channel id
+        * @from: the start of the delivered data
+        * @len: the length of the delivered data
+        *
+        * See Documentation/filesystems/relayfs.txt for details.
+        */
+       void (*deliver) (int rchan_id, char *from, u32 len);
+
+       /*
+        * user_deliver - called when data has been written from userspace
+        * @rchan_id: the channel id
+        * @from: the start of the delivered data
+        * @len: the length of the delivered data
+        *
+        * See Documentation/filesystems/relayfs.txt for details.
+        */
+       void (*user_deliver) (int rchan_id, char *from, u32 len);
+
+       /*
+        * needs_resize - called when a resizing event occurs
+        * @rchan_id: the channel id
+        * @resize_type: the type of resizing event
+        * @suggested_buf_size: the suggested new sub-buffer size
+        * @suggested_n_bufs: the suggested new number of sub-buffers
+        *
+        * See Documentation/filesystems/relayfs.txt for details.
+        */
+       void (*needs_resize)(int rchan_id,
+                            int resize_type,
+                            u32 suggested_buf_size,
+                            u32 suggested_n_bufs);
+
+       /*
+        * fileop_notify - called on open/close/mmap/munmap of a relayfs file
+        * @rchan_id: the channel id
+        * @filp: relayfs file pointer
+        * @fileop: which file operation is in progress
+        *
+        * The return value can direct the outcome of the operation.
+        *
+        * See Documentation/filesystems/relayfs.txt for details.
+        */
+        int (*fileop_notify)(int rchan_id,
+                            struct file *filp,
+                            enum relay_fileop fileop);
+
+       /*
+        * ioctl - called in ioctl context from userspace
+        * @rchan_id: the channel id
+        * @cmd: ioctl cmd
+        * @arg: ioctl cmd arg
+        *
+        * The return value is returned as the value from the ioctl call.
+        *
+        * See Documentation/filesystems/relayfs.txt for details.
+        */
+       int (*ioctl) (int rchan_id, unsigned int cmd, unsigned long arg);
+};
+
+/*
+ * Lockless scheme-specific data
+ */
+struct lockless_rchan
+{
+       u8 bufno_bits;          /* # bits used for sub-buffer id */
+       u8 offset_bits;         /* # bits used for offset within sub-buffer */
+       u32 index;              /* current index = sub-buffer id and offset */
+       u32 offset_mask;        /* used to obtain offset portion of index */
+       u32 index_mask;         /* used to mask off unused bits index */
+       atomic_t fill_count[RELAY_MAX_BUFS];    /* fill count per sub-buffer */
+};
+
+/*
+ * Locking scheme-specific data
+ */
+struct locking_rchan
+{
+       char *write_buf;                /* start of write sub-buffer */
+       char *write_buf_end;            /* end of write sub-buffer */
+       char *current_write_pos;        /* current write pointer */
+       char *write_limit;              /* takes reserves into account */
+       char *in_progress_event_pos;    /* used for interrupted writes */
+       u16 in_progress_event_size;     /* used for interrupted writes */
+       char *interrupted_pos;          /* used for interrupted writes */
+       u16 interrupting_size;          /* used for interrupted writes */
+       spinlock_t lock;                /* channel lock for locking scheme */
+};
+
+struct relay_ops;
+
+/*
+ * Offset resizing data structure
+ */
+struct resize_offset
+{
+       u32 ge;
+       u32 le;
+       int delta;
+};
+
+/*
+ * Relay channel data structure
+ */
+struct rchan
+{
+       u32 version;                    /* the version of this struct */
+       char *buf;                      /* the channel buffer */
+       union
+       {
+               struct lockless_rchan lockless;
+               struct locking_rchan locking;
+       } scheme;                       /* scheme-specific channel data */
+
+       int id;                         /* the channel id */
+       struct rchan_callbacks *callbacks;      /* client callbacks */
+       u32 flags;                      /* relay channel attributes */
+       u32 buf_id;                     /* current sub-buffer id */
+       u32 buf_idx;                    /* current sub-buffer index */
+
+       atomic_t mapped;                /* map count */
+
+       atomic_t suspended;             /* channel suspended i.e full? */
+       int half_switch;                /* used internally for suspend */
+
+       struct timeval  buf_start_time; /* current sub-buffer start time */
+       u32 buf_start_tsc;              /* current sub-buffer start TSC */
+       
+       u32 buf_size;                   /* sub-buffer size */
+       u32 alloc_size;                 /* total buffer size allocated */
+       u32 n_bufs;                     /* number of sub-buffers */
+
+       u32 bufs_produced;              /* count of sub-buffers produced */
+       u32 bufs_consumed;              /* count of sub-buffers consumed */
+       u32 bytes_consumed;             /* bytes consumed in cur sub-buffer */
+
+       int initialized;                /* first buffer initialized? */
+       int finalized;                  /* channel finalized? */
+
+       u32 start_reserve;              /* reserve at start of sub-buffers */
+       u32 end_reserve;                /* reserve at end of sub-buffers */
+       u32 rchan_start_reserve;        /* additional reserve sub-buffer 0 */
+       
+       struct dentry *dentry;          /* channel file dentry */
+
+       wait_queue_head_t read_wait;    /* VFS read wait queue */
+       wait_queue_head_t write_wait;   /* VFS write wait queue */
+       struct work_struct wake_readers; /* reader wake-up work struct */
+       struct work_struct wake_writers; /* writer wake-up work struct */
+       atomic_t refcount;              /* channel refcount */
+
+       struct relay_ops *relay_ops;    /* scheme-specific channel ops */
+
+       int unused_bytes[RELAY_MAX_BUFS]; /* unused count per sub-buffer */
+
+       struct semaphore resize_sem;    /* serializes alloc/replace */
+       struct work_struct work;        /* resize allocation work struct */
+
+       struct list_head open_readers;  /* open readers for this channel */
+       rwlock_t open_readers_lock;     /* protection for open_readers list */
+
+       char *init_buf;                 /* init channel buffer, if non-NULL */
+       
+       u32 resize_min;                 /* minimum resized total buffer size */
+       u32 resize_max;                 /* maximum resized total buffer size */
+       char *resize_buf;               /* for autosize alloc/free */
+       u32 resize_buf_size;            /* resized sub-buffer size */
+       u32 resize_n_bufs;              /* resized number of sub-buffers */
+       u32 resize_alloc_size;          /* resized actual total size */
+       int resizing;                   /* is resizing in progress? */
+       int resize_err;                 /* resizing err code */
+       int resize_failures;            /* number of resize failures */
+       int replace_buffer;             /* is the alloced buffer ready?  */
+       struct resize_offset resize_offset; /* offset change */
+       struct timer_list shrink_timer; /* timer used for shrinking */
+       int resize_order;               /* size of last resize */
+       u32 expand_buf_id;              /* subbuf id expand will occur at */
+
+       struct page **buf_page_array;   /* array of current buffer pages */
+       int buf_page_count;             /* number of current buffer pages */
+       struct page **expand_page_array;/* new pages to be inserted */
+       int expand_page_count;          /* number of new pages */
+       struct page **shrink_page_array;/* old pages to be freed */
+       int shrink_page_count;          /* number of old pages */
+       struct page **resize_page_array;/* will become current pages */
+       int resize_page_count;          /* number of resize pages */
+       struct page **old_buf_page_array; /* hold for freeing */
+} ____cacheline_aligned;
+
+/*
+ * Relay channel reader struct
+ */
+struct rchan_reader
+{
+       struct list_head list;          /* for list inclusion */
+       struct rchan *rchan;            /* the channel we're reading from */
+       int auto_consume;               /* does this reader auto-consume? */
+       u32 bufs_consumed;              /* buffers this reader has consumed */
+       u32 bytes_consumed;             /* bytes consumed in cur sub-buffer */
+       int offset_changed;             /* have channel offsets changed? */
+       int vfs_reader;                 /* are we a VFS reader? */
+       int map_reader;                 /* are we an mmap reader? */
+
+       union
+       {
+               struct file *file;
+               u32 f_pos;
+       } pos;                          /* current read offset */
+};
+
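+/* These help make union member access less tedious.  NOTE(review): read_buf,
+ * read_buf_end and read_limit name members that struct locking_rchan (above)
+ * does not declare, so any expansion of them will fail to compile. */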
+#define channel_buffer(rchan) ((rchan)->buf)
+#define idx(rchan) ((rchan)->scheme.lockless.index)
+#define bufno_bits(rchan) ((rchan)->scheme.lockless.bufno_bits)
+#define offset_bits(rchan) ((rchan)->scheme.lockless.offset_bits)
+#define offset_mask(rchan) ((rchan)->scheme.lockless.offset_mask)
+#define idx_mask(rchan) ((rchan)->scheme.lockless.index_mask)
+#define bulk_delivery(rchan) (((rchan)->flags & RELAY_DELIVERY_BULK) ? 1 : 0)
+#define packet_delivery(rchan) (((rchan)->flags & RELAY_DELIVERY_PACKET) ? 1 : 0)
+#define using_lockless(rchan) (((rchan)->flags & RELAY_SCHEME_LOCKLESS) ? 1 : 0)
+#define using_locking(rchan) (((rchan)->flags & RELAY_SCHEME_LOCKING) ? 1 : 0)
+#define using_tsc(rchan) (((rchan)->flags & RELAY_TIMESTAMP_TSC) ? 1 : 0)
+#define using_gettimeofday(rchan) (((rchan)->flags & RELAY_TIMESTAMP_GETTIMEOFDAY) ? 1 : 0)
+#define usage_smp(rchan) (((rchan)->flags & RELAY_USAGE_SMP) ? 1 : 0)
+#define usage_global(rchan) (((rchan)->flags & RELAY_USAGE_GLOBAL) ? 1 : 0)
+#define mode_continuous(rchan) (((rchan)->flags & RELAY_MODE_CONTINUOUS) ? 1 : 0)
+#define fill_count(rchan, i) ((rchan)->scheme.lockless.fill_count[(i)])
+#define write_buf(rchan) ((rchan)->scheme.locking.write_buf)
+#define read_buf(rchan) ((rchan)->scheme.locking.read_buf)
+#define write_buf_end(rchan) ((rchan)->scheme.locking.write_buf_end)
+#define read_buf_end(rchan) ((rchan)->scheme.locking.read_buf_end)
+#define cur_write_pos(rchan) ((rchan)->scheme.locking.current_write_pos)
+#define read_limit(rchan) ((rchan)->scheme.locking.read_limit)
+#define write_limit(rchan) ((rchan)->scheme.locking.write_limit)
+#define in_progress_event_pos(rchan) ((rchan)->scheme.locking.in_progress_event_pos)
+#define in_progress_event_size(rchan) ((rchan)->scheme.locking.in_progress_event_size)
+#define interrupted_pos(rchan) ((rchan)->scheme.locking.interrupted_pos)
+#define interrupting_size(rchan) ((rchan)->scheme.locking.interrupting_size)
+#define channel_lock(rchan) ((rchan)->scheme.locking.lock)
+
+
+/**
+ *     calc_time_delta - utility function for time delta calculation
+ *     @now: current time
+ *     @start: start time
+ *
+ *     Returns the time delta produced by subtracting start time from now.
+ */
+static inline u32
+calc_time_delta(struct timeval *now, struct timeval *start)
+{
+       /* sub-second part, plus whole seconds scaled to microseconds */
+       return (now->tv_usec - start->tv_usec)
+               + (now->tv_sec - start->tv_sec) * 1000000;
+}
+
+/**
+ *     recalc_time_delta - utility function for time delta recalculation
+ *     @now: current time
+ *     @new_delta: the new time delta calculated (left untouched when using TSC)
+ *     @rchan: the channel whose sub-buffer start time the delta is relative to
+ */
+static inline void 
+recalc_time_delta(struct timeval *now,
+                 u32 *new_delta,
+                 struct rchan *rchan)
+{
+       if (using_tsc(rchan) == 0)
+               *new_delta = calc_time_delta(now, &rchan->buf_start_time);
+}
+
+/**
+ *     have_cmpxchg - does this architecture have a cmpxchg?
+ *
+ *     Returns 1 if this architecture has a cmpxchg useable by 
+ *     the lockless scheme, 0 otherwise.
+ */
+static inline int have_cmpxchg(void)
+{
+#ifdef __HAVE_ARCH_CMPXCHG
+       const int available = 1;
+#else
+       const int available = 0;
+#endif
+       return available;
+}
+
+/**
+ *     relay_write_direct - copy SIZE bytes from SRC to DEST, then advance DEST
+ *     DEST must be a modifiable pointer lvalue; DEST and SIZE are evaluated twice.
+ */
+#define relay_write_direct(DEST, SRC, SIZE) \
+do {\
+   memcpy((DEST), (SRC), (SIZE));\
+   (DEST) += (SIZE);\
+} while (0)
+
+/**
+ *     relay_lock_channel - lock the relay channel if applicable
+ *
+ *     This macro only affects the locking scheme.  If the locking scheme
+ *     is in use and the channel usage is SMP, does a local_irq_save;
+ *     otherwise (e.g. GLOBAL usage) takes the channel spinlock with
+ *     spin_lock_irqsave.  FLAGS must be an lvalue; it is zeroed first so the
+ *     compiler does not warn when neither branch assigns it.  Expands to one
+ *     statement with no trailing semicolon; the caller supplies it.
+ */
+#define relay_lock_channel(RCHAN, FLAGS) \
+do {\
+   FLAGS = 0;\
+   if (using_locking(RCHAN)) {\
+      if (usage_smp(RCHAN)) {\
+         local_irq_save(FLAGS); \
+      } else {\
+         spin_lock_irqsave(&(RCHAN)->scheme.locking.lock, FLAGS); \
+      }\
+   }\
+} while (0)
+
+/**
+ *     relay_unlock_channel - unlock the relay channel if applicable
+ *
+ *     This macro only affects the locking scheme.  See relay_lock_channel.
+ *     Expands to one statement; the caller supplies the trailing semicolon.
+ */
+#define relay_unlock_channel(RCHAN, FLAGS) \
+do {\
+   if (using_locking(RCHAN)) {\
+      if (usage_smp(RCHAN)) {\
+         local_irq_restore(FLAGS); \
+      } else {\
+         spin_unlock_irqrestore(&(RCHAN)->scheme.locking.lock, FLAGS); \
+      }\
+   }\
+} while (0)
+
+/*
+ * Define a stub cmpxchg (always 0, arguments unused) if we don't have one
+ */
+#ifndef __HAVE_ARCH_CMPXCHG
+#define cmpxchg(p,o,n) 0
+#endif
+
+/*
+ * High-level relayfs kernel API, fs/relayfs/relay.c
+ */
+extern int
+relay_open(const char *chanpath,
+          int bufsize,
+          int nbufs,
+          u32 flags,
+          struct rchan_callbacks *channel_callbacks,
+          u32 start_reserve,
+          u32 end_reserve,
+          u32 rchan_start_reserve,
+          u32 resize_min,
+          u32 resize_max,
+          int mode,
+          char *init_buf,
+          u32 init_buf_size);
+
+extern int
+relay_close(int rchan_id);
+
+extern int
+relay_write(int rchan_id,
+           const void *data_ptr, 
+           size_t count,
+           int td_offset,
+           void **wrote_pos);
+
+extern ssize_t
+relay_read(struct rchan_reader *reader,
+          char *buf,
+          size_t count,
+          int wait,
+          u32 *actual_read_offset);
+
+extern int
+relay_discard_init_buf(int rchan_id);
+
+extern struct rchan_reader *
+add_rchan_reader(int rchan_id, int autoconsume);
+
+extern int
+remove_rchan_reader(struct rchan_reader *reader);
+
+extern struct rchan_reader *
+add_map_reader(int rchan_id);
+
+extern int
+remove_map_reader(struct rchan_reader *reader);
+
+extern int 
+relay_info(int rchan_id, struct rchan_info *rchan_info);
+
+extern void 
+relay_buffers_consumed(struct rchan_reader *reader, u32 buffers_consumed);
+
+extern void
+relay_bytes_consumed(struct rchan_reader *reader, u32 bytes_consumed, u32 read_offset);
+
+extern ssize_t
+relay_bytes_avail(struct rchan_reader *reader);
+
+extern int
+relay_realloc_buffer(int rchan_id, u32 new_nbufs, int in_background);
+
+extern int
+relay_replace_buffer(int rchan_id);
+
+extern int
+rchan_empty(struct rchan_reader *reader);
+
+extern int
+rchan_full(struct rchan_reader *reader);
+
+extern void
+update_readers_consumed(struct rchan *rchan, u32 bufs_consumed, u32 bytes_consumed);
+
+extern int 
+__relay_mmap_buffer(struct rchan *rchan, struct vm_area_struct *vma);
+
+extern struct rchan_reader *
+__add_rchan_reader(struct rchan *rchan, struct file *filp, int auto_consume, int map_reader);
+
+extern void
+__remove_rchan_reader(struct rchan_reader *reader);
+
+/*
+ * Low-level relayfs kernel API, fs/relayfs/relay.c
+ */
+extern struct rchan *
+rchan_get(int rchan_id);
+
+extern void
+rchan_put(struct rchan *rchan);
+
+extern char *
+relay_reserve(struct rchan *rchan,
+             u32 data_len,
+             struct timeval *time_stamp,
+             u32 *time_delta,
+             int *errcode,
+             int *interrupting);
+
+extern void 
+relay_commit(struct rchan *rchan,
+            char *from, 
+            u32 len, 
+            int reserve_code,
+            int interrupting);
+
+extern u32 
+relay_get_offset(struct rchan *rchan, u32 *max_offset);
+
+extern int
+relay_reset(int rchan_id);
+
+/*
+ * VFS functions, fs/relayfs/inode.c
+ */
+extern int 
+relayfs_create_dir(const char *name, 
+                  struct dentry *parent, 
+                  struct dentry **dentry);
+
+extern int
+relayfs_create_file(const char * name,
+                   struct dentry *parent, 
+                   struct dentry **dentry,
+                   void * data,
+                   int mode);
+
+extern int 
+relayfs_remove_file(struct dentry *dentry);
+
+extern int
+reset_index(struct rchan *rchan, u32 old_index);
+
+
+/*
+ * klog functions, fs/relayfs/klog.c
+ */
+extern int
+create_klog_channel(void);
+
+extern int
+remove_klog_channel(void);
+
+/*
+ * Scheme-specific channel ops
+ */
+struct relay_ops
+{
+       char * (*reserve) (struct rchan *rchan,
+                          u32 slot_len,
+                          struct timeval *time_stamp,
+                          u32 *tsc,
+                          int * errcode,
+                          int * interrupting);
+       
+       void (*commit) (struct rchan *rchan,
+                       char *from,
+                       u32 len, 
+                       int deliver, 
+                       int interrupting);
+
+       u32 (*get_offset) (struct rchan *rchan,
+                          u32 *max_offset);
+       
+       void (*resume) (struct rchan *rchan);
+       void (*finalize) (struct rchan *rchan);
+       void (*reset) (struct rchan *rchan,
+                      int init);
+       int (*reset_index) (struct rchan *rchan,
+                           u32 old_index);
+};
+
+#endif /* _LINUX_RELAYFS_FS_H */
+
+
+
+
+
diff --git a/include/linux/vs_base.h b/include/linux/vs_base.h
new file mode 100644 (file)
index 0000000..4f04513
--- /dev/null
@@ -0,0 +1,78 @@
+#ifndef _VX_VS_BASE_H
+#define _VX_VS_BASE_H
+
+#include "vserver/context.h"
+
+// #define VX_DEBUG
+
+
+#if defined(VX_DEBUG)
+#define vxdprintk(x...) printk("vxd: " x)
+#else
+#define vxdprintk(x...)
+#endif
+
+
+#define vx_task_xid(t) ((t)->xid)
+
+#define vx_current_xid() vx_task_xid(current)
+
+#define vx_check(c,m)  __vx_check(vx_current_xid(),c,m)
+
+#define vx_weak_check(c,m)     ((m) ? vx_check(c,m) : 1)
+
+
+/*
+ * check current context for ADMIN/WATCH and
+ * optionally against supplied argument
+ */
+static __inline__ int __vx_check(xid_t cid, xid_t id, unsigned int mode)
+{
+       /* identity match: the supplied id is the caller's own context */
+       if ((mode & VX_ARG_MASK) && (mode & VX_IDENT) && (id == cid))
+               return 1;
+
+       if (mode & VX_ATR_MASK) {
+               int in_dynamic = (id >= MIN_D_CONTEXT) && (id <= MAX_S_CONTEXT);
+               int in_static = (id > 1) && (id < MIN_D_CONTEXT);
+
+               if (((mode & VX_DYNAMIC) && in_dynamic) ||
+                       ((mode & VX_STATIC) && in_static))
+                       return 1;
+       }
+       /* context 0 passes with VX_ADMIN, context 1 with VX_WATCH */
+       if ((mode & VX_ADMIN) && (cid == 0))
+               return 1;
+       return ((mode & VX_WATCH) && (cid == 1));
+}
+
+
+#define __vx_flags(v,m,f)      (((v) & (m)) ^ (f))
+
+#define        __vx_task_flags(t,m,f) \
+       (((t) && ((t)->vx_info)) ? \
+               __vx_flags((t)->vx_info->vx_flags,(m),(f)) : 0)
+
+#define vx_current_flags() \
+       ((current->vx_info) ? current->vx_info->vx_flags : 0)
+
+#define vx_flags(m,f)  __vx_flags(vx_current_flags(),(m),(f))
+
+
+#define vx_current_ccaps() \
+       ((current->vx_info) ? current->vx_info->vx_ccaps : 0)
+
+#define vx_ccaps(c)    (vx_current_ccaps() & (c))
+
+#define vx_current_bcaps() \
+       (((current->vx_info) && !vx_flags(VXF_STATE_SETUP, 0)) ? \
+       current->vx_info->vx_bcaps : cap_bset)
+
+
+/* generic flag merging */
+
+#define        vx_mask_flags(v,f,m)    (((v) & ~(m)) | ((f) & (m)))
+
+#define        vx_mask_mask(v,f,m)     (((v) & ~(m)) | ((v) & (f) & (m)))
+
+#endif
diff --git a/include/linux/vs_context.h b/include/linux/vs_context.h
new file mode 100644 (file)
index 0000000..727a16c
--- /dev/null
@@ -0,0 +1,128 @@
+#ifndef _VX_VS_CONTEXT_H
+#define _VX_VS_CONTEXT_H
+
+
+// #define VX_DEBUG
+
+#include <linux/kernel.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+
+#include "vserver/context.h"
+
+#undef vxdprintk
+#if defined(VX_DEBUG)
+#define vxdprintk(x...) printk("vxd: " x)
+#else
+#define vxdprintk(x...)
+#endif
+
+
+
+extern int proc_pid_vx_info(struct task_struct *, char *);
+
+
+#define get_vx_info(i) __get_vx_info(i,__FILE__,__LINE__)
+
+static inline struct vx_info *__get_vx_info(struct vx_info *vxi,
+       const char *_file, int _line)
+{
+       if (vxi) {
+               vxdprintk("get_vx_info(%p[#%d.%d])\t%s:%d\n",
+                       vxi, vxi->vx_id, atomic_read(&vxi->vx_usecnt),
+                       _file, _line);
+               atomic_inc(&vxi->vx_usecnt);
+       }
+       return vxi;     /* a NULL argument is handed straight back */
+}
+
+/* Hand @i to call_rcu() with rcu_free_vx_info as the callback. */
+#define        free_vx_info(i) \
+       call_rcu(&(i)->vx_rcu, rcu_free_vx_info, (i))
+
+#define put_vx_info(i) __put_vx_info(i,__FILE__,__LINE__)
+
+static inline void __put_vx_info(struct vx_info *vxi, const char *_file, int _line)
+{
+       if (vxi) {
+               vxdprintk("put_vx_info(%p[#%d.%d])\t%s:%d\n",
+                       vxi, vxi->vx_id, atomic_read(&vxi->vx_usecnt),
+                       _file, _line);
+               if (atomic_dec_and_test(&vxi->vx_usecnt)) /* count hit zero */
+                       free_vx_info(vxi);
+       }
+}
+
+#define set_vx_info(p,i) __set_vx_info(p,i,__FILE__,__LINE__)
+
+static inline void __set_vx_info(struct vx_info **vxp, struct vx_info *vxi,
+       const char *_file, int _line)
+{
+       BUG_ON(*vxp);           /* the slot must be empty on entry */
+       if (vxi) {
+               vxdprintk("set_vx_info(%p[#%d.%d.%d])\t%s:%d\n",
+                       vxi, vxi->vx_id,
+                       atomic_read(&vxi->vx_usecnt),
+                       atomic_read(&vxi->vx_refcnt),
+                       _file, _line);
+               atomic_inc(&vxi->vx_refcnt);
+               *vxp = __get_vx_info(vxi, _file, _line);
+       }
+}
+
+#define        clr_vx_info(p)  __clr_vx_info(p,__FILE__,__LINE__)
+
+static inline void __clr_vx_info(struct vx_info **vxp,
+       const char *_file, int _line)
+{
+       struct vx_info *old = *vxp;
+
+       if (old == NULL)
+               return;
+       vxdprintk("clr_vx_info(%p[#%d.%d.%d])\t%s:%d\n",
+               old, old->vx_id,
+               atomic_read(&old->vx_usecnt),
+               atomic_read(&old->vx_refcnt),
+               _file, _line);
+       *vxp = NULL;
+       wmb();          /* slot is cleared before the counts are dropped */
+       if (atomic_dec_and_test(&old->vx_refcnt))
+               unhash_vx_info(old);
+       __put_vx_info(old, _file, _line);
+}
+
+
+#define task_get_vx_info(i)    __task_get_vx_info(i,__FILE__,__LINE__)
+
+static __inline__ struct vx_info *__task_get_vx_info(struct task_struct *p,
+       const char *_file, int _line)
+{
+       struct vx_info *held;
+
+       task_lock(p);   /* p->vx_info is read and referenced under the lock */
+       held = __get_vx_info(p->vx_info, _file, _line);
+       task_unlock(p);
+       return held;
+}
+
+
+#define vx_verify_info(p,i)    \
+       __vx_verify_info((p)->vx_info,i,__FILE__,__LINE__)
+
+static __inline__ void __vx_verify_info(
+       struct vx_info *vxa, struct vx_info *vxb,
+       const char *_file, int _line)
+{
+       /* only a mismatch is reported; execution continues either way */
+       if (vxa != vxb)
+               printk(KERN_ERR "vx bad assumption (%p==%p) at %s:%d\n",
+                       vxa, vxb, _file, _line);
+}
+
+
+#undef vxdprintk
+#define vxdprintk(x...)
+
+#else
+#warning duplicate inclusion
+#endif
diff --git a/include/linux/vs_cvirt.h b/include/linux/vs_cvirt.h
new file mode 100644 (file)
index 0000000..65f4303
--- /dev/null
@@ -0,0 +1,71 @@
+#ifndef _VX_VS_CVIRT_H
+#define _VX_VS_CVIRT_H
+
+
+// #define VX_DEBUG
+
+#include "vserver/cvirt.h"
+#include "vs_base.h"
+
+#if defined(VX_DEBUG)
+#define vxdprintk(x...) printk("vxd: " x)
+#else
+#define vxdprintk(x...)
+#endif
+
+
+/* utsname virtualization */
+
+static inline struct new_utsname *vx_new_utsname(void)
+{
+       /* the context's own utsname when current has one, else the system's */
+       return current->vx_info ?
+               &current->vx_info->cvirt.utsname : &system_utsname;
+}
+
+#define vx_new_uts(x)          ((vx_new_utsname())->x)
+
+
+/* pid faking stuff */
+
+
+#define vx_map_tgid(v,p) \
+       __vx_map_tgid((v), (p), __FILE__, __LINE__)
+
+static inline int __vx_map_tgid(struct vx_info *vxi, int pid,
+       const char *file, int line)
+{
+       if (vxi && __vx_flags(vxi->vx_flags, VXF_INFO_INIT, 0)) {
+               vxdprintk("vx_map_tgid: %p/%llx: %d -> %d in %s:%d\n",
+                       vxi, vxi->vx_flags, pid,
+                       (pid == vxi->vx_initpid)?1:pid,
+                       file, line);
+               if (pid == vxi->vx_initpid)
+                       return 1;       /* the context's init pid is shown as 1 */
+       }
+       return pid;
+}
+
+#define vx_rmap_tgid(v,p) \
+       __vx_rmap_tgid((v), (p), __FILE__, __LINE__)
+
+static inline int __vx_rmap_tgid(struct vx_info *vxi, int pid,
+       const char *file, int line)
+{
+       if (vxi && __vx_flags(vxi->vx_flags, VXF_INFO_INIT, 0)) {
+               vxdprintk("vx_rmap_tgid: %p/%llx: %d -> %d in %s:%d\n",
+                       vxi, vxi->vx_flags, pid,
+                       ((pid == 1) && vxi->vx_initpid)?vxi->vx_initpid:pid,
+                       file, line);
+               if ((pid == 1) && vxi->vx_initpid)
+                       return vxi->vx_initpid; /* 1 maps back to the init pid */
+       }
+       return pid;
+}
+
+#undef vxdprintk
+#define vxdprintk(x...)
+
+#else
+#warning duplicate inclusion
+#endif
diff --git a/include/linux/vs_dlimit.h b/include/linux/vs_dlimit.h
new file mode 100644 (file)
index 0000000..d80c563
--- /dev/null
@@ -0,0 +1,169 @@
+#ifndef _VX_VS_DLIMIT_H
+#define _VX_VS_DLIMIT_H
+
+
+// #define VX_DEBUG
+
+#include <linux/kernel.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+
+#include "vserver/context.h"
+#include "vserver/dlimit.h"
+
+#if defined(VX_DEBUG)
+#define vxdprintk(x...) printk("vxd: " x)
+#else
+#define vxdprintk(x...)
+#endif
+
+
+#define get_dl_info(i) __get_dl_info(i,__FILE__,__LINE__)
+
+/*
+ * Take a usage reference on a dl_info: bumps dl_usecnt and hands the
+ * pointer back.  A NULL argument is passed through untouched, so the
+ * result of a lookup can be fed in directly.  _file/_line are only used
+ * by the debug message.
+ */
+static inline struct dl_info *__get_dl_info(struct dl_info *dli,
+       const char *_file, int _line)
+{
+       if (dli) {
+               vxdprintk("get_dl_info(%p[#%d.%d])\t%s:%d\n",
+                       dli, dli->dl_xid, atomic_read(&dli->dl_usecnt),
+                       _file, _line);
+               atomic_inc(&dli->dl_usecnt);
+       }
+       return dli;
+}
+
+
+/*
+ * Hand a dl_info to call_rcu() with rcu_free_dl_info as the callback.
+ * The argument is parenthesized and the macro carries no trailing
+ * semicolon, so it behaves like a single expression (safe inside
+ * if/else and with non-trivial argument expressions).
+ */
+#define        free_dl_info(i) \
+       call_rcu(&(i)->dl_rcu, rcu_free_dl_info, (i))
+
+#define put_dl_info(i) __put_dl_info(i,__FILE__,__LINE__)
+
+/*
+ * Drop a usage reference on a dl_info.  When dl_usecnt reaches zero the
+ * structure is passed to free_dl_info().  NULL is accepted and ignored.
+ * _file/_line are only used by the debug message.
+ */
+static inline void __put_dl_info(struct dl_info *dli, const char *_file, int _line)
+{
+       if (dli) {
+               vxdprintk("put_dl_info(%p[#%d.%d])\t%s:%d\n",
+                       dli, dli->dl_xid, atomic_read(&dli->dl_usecnt),
+                       _file, _line);
+               if (atomic_dec_and_test(&dli->dl_usecnt))
+                       free_dl_info(dli);
+       }
+}
+
+
+extern int vx_debug_dlimit;
+
+#define        __dlimit_char(d)        ((d)?'*':' ')
+
+/*
+ * Charge nr bytes against the disk limit registered for (sb, xid).
+ *
+ * Returns 0 when the charge was accepted, or when nothing had to be
+ * checked (nr is zero or no limit entry exists).  Returns 1 when
+ * dl_space_used + nr would exceed dl_space_total; the counter is then
+ * left unchanged.  file/line only appear in the optional debug output.
+ */
+static inline int __dl_alloc_space(struct super_block *sb,
+       xid_t xid, dlsize_t nr, const char *file, int line)
+{
+       struct dl_info *dli = NULL;
+       int ret = 0;
+
+       if (nr != 0)
+               dli = locate_dl_info(sb, xid);
+
+       if (dli) {
+               spin_lock(&dli->dl_lock);
+               if (dli->dl_space_used + nr > dli->dl_space_total)
+                       ret = 1;
+               else
+                       dli->dl_space_used += nr;
+               spin_unlock(&dli->dl_lock);
+               put_dl_info(dli);
+       }
+
+       if (vx_debug_dlimit)
+               printk("ALLOC (%p,#%d)%c %lld bytes (%d)@ %s:%d\n",
+                       sb, xid, __dlimit_char(dli), nr, ret, file, line);
+       return ret;
+}
+
+/*
+ * Return nr bytes to the disk limit registered for (sb, xid).
+ *
+ * Nothing is done when nr is zero or no limit entry exists.  The
+ * counter is clamped at zero: dl_space_used is unsigned, so releasing
+ * more than is currently accounted would otherwise wrap it to a huge
+ * value and make every later __dl_alloc_space() check fail.
+ * file/line only appear in the optional debug output.
+ */
+static inline void __dl_free_space(struct super_block *sb,
+       xid_t xid, dlsize_t nr, const char *file, int line)
+{
+       struct dl_info *dli = NULL;
+
+       if (nr == 0)
+               goto out;
+       dli = locate_dl_info(sb, xid);
+       if (!dli)
+               goto out;
+
+       spin_lock(&dli->dl_lock);
+       if (dli->dl_space_used > nr)
+               dli->dl_space_used -= nr;
+       else
+               dli->dl_space_used = 0;
+       spin_unlock(&dli->dl_lock);
+       put_dl_info(dli);
+out:
+       if (vx_debug_dlimit)
+               printk("FREE  (%p,#%d)%c %lld bytes @ %s:%d\n",
+                       sb, xid, __dlimit_char(dli), nr, file, line);
+}
+
+/*
+ * Charge one inode against the disk limit registered for (sb, xid).
+ *
+ * Returns 0 when the inode was accounted or no limit entry exists.
+ * Returns 1 when dl_inodes_used has already reached dl_inodes_total;
+ * the counter is then left unchanged.  file/line only appear in the
+ * optional debug output.
+ */
+static inline int __dl_alloc_inode(struct super_block *sb,
+       xid_t xid, const char *file, int line)
+{
+       struct dl_info *dli = locate_dl_info(sb, xid);
+       int ret = 0;
+
+       if (dli) {
+               spin_lock(&dli->dl_lock);
+               if (dli->dl_inodes_used >= dli->dl_inodes_total)
+                       ret = 1;
+               else
+                       dli->dl_inodes_used++;
+               spin_unlock(&dli->dl_lock);
+               put_dl_info(dli);
+       }
+
+       if (vx_debug_dlimit)
+               printk("ALLOC (%p,#%d)%c inode (%d)@ %s:%d\n",
+                       sb, xid, __dlimit_char(dli), ret, file, line);
+       return ret;
+}
+
+/*
+ * Return one inode to the disk limit registered for (sb, xid).
+ *
+ * Nothing is done when no limit entry exists.  The counter is clamped
+ * at zero: dl_inodes_used is unsigned, so an unbalanced free would
+ * otherwise wrap it and make every later __dl_alloc_inode() fail.
+ * file/line only appear in the optional debug output.
+ */
+static inline void __dl_free_inode(struct super_block *sb,
+       xid_t xid, const char *file, int line)
+{
+       struct dl_info *dli;
+
+       dli = locate_dl_info(sb, xid);
+       if (!dli)
+               goto out;
+
+       spin_lock(&dli->dl_lock);
+       if (dli->dl_inodes_used > 0)
+               dli->dl_inodes_used--;
+       spin_unlock(&dli->dl_lock);
+       put_dl_info(dli);
+out:
+       if (vx_debug_dlimit)
+               printk("FREE  (%p,#%d)%c inode @ %s:%d\n",
+                       sb, xid, __dlimit_char(dli), file, line);
+}
+
+
+
+#define DLIMIT_ALLOC_BLOCK(sb, xid, nr) \
+       __dl_alloc_space(sb, xid, \
+               ((dlsize_t)(nr)) << (sb)->s_blocksize_bits, \
+               __FILE__, __LINE__ )
+
+#define DLIMIT_FREE_BLOCK(sb, xid, nr) \
+       __dl_free_space(sb, xid, \
+               ((dlsize_t)(nr)) << (sb)->s_blocksize_bits, \
+               __FILE__, __LINE__ )
+
+#define DLIMIT_ALLOC_INODE(sb, xid) \
+       __dl_alloc_inode(sb, xid, __FILE__, __LINE__ )
+
+#define DLIMIT_FREE_INODE(sb, xid) \
+       __dl_free_inode(sb, xid, __FILE__, __LINE__ )
+
+
+#define        DLIMIT_ADJUST_BLOCK(sb, xid, fb, rb)
+
+
+#else
+#warning duplicate inclusion
+#endif
diff --git a/include/linux/vs_limit.h b/include/linux/vs_limit.h
new file mode 100644 (file)
index 0000000..82e8de4
--- /dev/null
@@ -0,0 +1,119 @@
+#ifndef _VX_VS_LIMIT_H
+#define _VX_VS_LIMIT_H
+
+
+// #define VX_DEBUG
+
+#include <linux/kernel.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+
+#include "vserver/context.h"
+#include "vserver/limit.h"
+
+
+/* file limits */
+
+#define VX_DEBUG_ACC_FILE      0
+#define VX_DEBUG_ACC_OPENFD    0
+
+#if    (VX_DEBUG_ACC_FILE) || (VX_DEBUG_ACC_OPENFD)
+#define vxdprintk(x...) printk("vxd: " x)
+#else
+#define vxdprintk(x...)
+#endif
+
+
+#define vx_acc_cres(v,d,r) \
+       __vx_acc_cres((v), (r), (d), __FILE__, __LINE__)
+
+/*
+ * Account one unit of resource 'res' for a context: increments
+ * vxi->limit.rcur[res] when dir > 0 and decrements it otherwise.
+ * Does nothing when vxi is NULL.  file/line are only used by the
+ * trace message, which is emitted when the matching
+ * VX_DEBUG_ACC_* switch is non-zero.
+ */
+static inline void __vx_acc_cres(struct vx_info *vxi,
+       int res, int dir, char *file, int line)
+{
+       if (!vxi)
+               return;
+
+       if ((res == RLIMIT_NOFILE && VX_DEBUG_ACC_FILE) ||
+               (res == RLIMIT_OPENFD && VX_DEBUG_ACC_OPENFD))
+               printk("vx_acc_cres[%5d,%2d]: %5d%s in %s:%d\n",
+                       vxi->vx_id, res,
+                       atomic_read(&vxi->limit.rcur[res]),
+                       (dir>0)?"++":"--", file, line);
+
+       if (dir > 0)
+               atomic_inc(&vxi->limit.rcur[res]);
+       else
+               atomic_dec(&vxi->limit.rcur[res]);
+}
+
+#define vx_nproc_inc(p)        vx_acc_cres(current->vx_info, 1, RLIMIT_NPROC)
+#define vx_nproc_dec(p)        vx_acc_cres(current->vx_info,-1, RLIMIT_NPROC)
+
+#define vx_files_inc(f)        vx_acc_cres(current->vx_info, 1, RLIMIT_NOFILE)
+#define vx_files_dec(f)        vx_acc_cres(current->vx_info,-1, RLIMIT_NOFILE)
+
+#define vx_openfd_inc(f) vx_acc_cres(current->vx_info, 1, RLIMIT_OPENFD)
+#define vx_openfd_dec(f) vx_acc_cres(current->vx_info,-1, RLIMIT_OPENFD)
+
+/*
+#define vx_openfd_inc(f) do {                                  \
+       vx_acc_cres(current->vx_info, 1, RLIMIT_OPENFD);        \
+       printk("vx_openfd_inc: %d[#%d] in %s:%d\n",             \
+               f, current->xid, __FILE__, __LINE__);           \
+       } while (0)
+
+#define vx_openfd_dec(f) do {                                  \
+       vx_acc_cres(current->vx_info,-1, RLIMIT_OPENFD);        \
+       printk("vx_openfd_dec: %d[#%d] in %s:%d\n",             \
+               f, current->xid, __FILE__, __LINE__);           \
+       } while (0)
+*/
+
+#define vx_cres_avail(v,n,r) \
+        __vx_cres_avail((v), (r), (n), __FILE__, __LINE__)
+
+/*
+ * Check whether 'num' more units of resource 'res' fit the context limit.
+ *
+ * Returns 1 when there is no context, when rlim[res] is RLIM_INFINITY,
+ * or when rcur[res] + num stays within rlim[res].  Otherwise bumps
+ * lhit[res] and returns 0.  As a side effect rmax[res] is raised to the
+ * current usage if that is higher.  file/line only feed the trace
+ * message.
+ */
+static inline int __vx_cres_avail(struct vx_info *vxi,
+                int res, int num, char *file, int line)
+{
+       unsigned long used;
+
+       if ((res == RLIMIT_NOFILE && VX_DEBUG_ACC_FILE) ||
+               (res == RLIMIT_OPENFD && VX_DEBUG_ACC_OPENFD))
+               printk("vx_cres_avail[%5d,%2d]: %5ld > %5d + %5d in %s:%d\n",
+                       (vxi?vxi->vx_id:-1), res,
+                       (vxi?vxi->limit.rlim[res]:1),
+                       (vxi?atomic_read(&vxi->limit.rcur[res]):0),
+                       num, file, line);
+
+       if (!vxi)
+               return 1;
+
+       used = atomic_read(&vxi->limit.rcur[res]);
+       /* remember the highest usage seen so far */
+       if (used > vxi->limit.rmax[res])
+               vxi->limit.rmax[res] = used;
+
+       if (vxi->limit.rlim[res] == RLIM_INFINITY ||
+               used + num <= vxi->limit.rlim[res])
+               return 1;
+
+       atomic_inc(&vxi->limit.lhit[res]);
+       return 0;
+}
+
+#define vx_nproc_avail(n) \
+       vx_cres_avail(current->vx_info, (n), RLIMIT_NPROC)
+
+#define vx_files_avail(n) \
+       vx_cres_avail(current->vx_info, (n), RLIMIT_NOFILE)
+
+#define vx_openfd_avail(n) \
+       vx_cres_avail(current->vx_info, (n), RLIMIT_OPENFD)
+
+
+/* socket limits */
+
+#define vx_sock_inc(f) vx_acc_cres(current->vx_info, 1, VLIMIT_SOCK)
+#define vx_sock_dec(f) vx_acc_cres(current->vx_info,-1, VLIMIT_SOCK)
+
+#define vx_sock_avail(n) \
+       vx_cres_avail(current->vx_info, (n), VLIMIT_SOCK)
+
+
+#else
+#warning duplicate inclusion
+#endif
diff --git a/include/linux/vs_memory.h b/include/linux/vs_memory.h
new file mode 100644 (file)
index 0000000..2fe9c08
--- /dev/null
@@ -0,0 +1,132 @@
+#ifndef _VX_VS_MEMORY_H
+#define _VX_VS_MEMORY_H
+
+
+// #define VX_DEBUG
+
+#include <linux/kernel.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+
+#include "vserver/context.h"
+#include "vserver/limit.h"
+
+
+#define VX_DEBUG_ACC_RSS   0
+#define VX_DEBUG_ACC_VM    0
+#define VX_DEBUG_ACC_VML   0
+
+#if    (VX_DEBUG_ACC_RSS) || (VX_DEBUG_ACC_VM) || (VX_DEBUG_ACC_VML)
+#define vxdprintk(x...) printk("vxd: " x)
+#else
+#define vxdprintk(x...)
+#endif
+
+#define vx_acc_page(m, d, v, r) \
+       __vx_acc_page(&(m->v), m->mm_vx_info, r, d, __FILE__, __LINE__)
+
+/*
+ * Account a single page in two places: the plain counter *v (when v is
+ * non-NULL) and the context counter vxi->limit.rcur[res] (when vxi is
+ * non-NULL).  dir > 0 increments, anything else decrements.
+ * file/line are accepted for symmetry with the other helpers and unused.
+ */
+static inline void __vx_acc_page(unsigned long *v, struct vx_info *vxi,
+                int res, int dir, char *file, int line)
+{
+       int up = (dir > 0);
+
+       if (v) {
+               if (up)
+                       (*v)++;
+               else
+                       (*v)--;
+       }
+       if (!vxi)
+               return;
+       if (up)
+               atomic_inc(&vxi->limit.rcur[res]);
+       else
+               atomic_dec(&vxi->limit.rcur[res]);
+}
+
+
+#define vx_acc_pages(m, p, v, r) \
+       __vx_acc_pages(&(m->v), m->mm_vx_info, r, p, __FILE__, __LINE__)
+
+/*
+ * Account 'pages' pages (may be negative) in two places: the plain
+ * counter *v (when v is non-NULL) and the context counter
+ * vxi->limit.rcur[res] (when vxi is non-NULL).  A zero page count is a
+ * no-op apart from the optional debug message.
+ *
+ * The debug message reads limit.rcur[], the same array that is updated
+ * below.  It previously named limit.res[], which no other visible code
+ * uses and which only went unnoticed because vxdprintk() normally
+ * expands to nothing.
+ */
+static inline void __vx_acc_pages(unsigned long *v, struct vx_info *vxi,
+                int res, int pages, char *file, int line)
+{
+        if ((res == RLIMIT_RSS && VX_DEBUG_ACC_RSS) ||
+               (res == RLIMIT_AS && VX_DEBUG_ACC_VM) ||
+               (res == RLIMIT_MEMLOCK && VX_DEBUG_ACC_VML))
+               vxdprintk("vx_acc_pages  [%5d,%2d]: %5d += %5d in %s:%d\n",
+                       (vxi?vxi->vx_id:-1), res,
+                       (vxi?atomic_read(&vxi->limit.rcur[res]):0),
+                       pages, file, line);
+        if (pages == 0)
+                return;
+        if (v)
+                *v += pages;
+        if (vxi)
+                atomic_add(pages, &vxi->limit.rcur[res]);
+}
+
+
+
+#define vx_acc_vmpage(m,d)     vx_acc_page(m, d, total_vm,  RLIMIT_AS)
+#define vx_acc_vmlpage(m,d)    vx_acc_page(m, d, locked_vm, RLIMIT_MEMLOCK)
+#define vx_acc_rsspage(m,d)    vx_acc_page(m, d, rss,      RLIMIT_RSS)
+
+#define vx_acc_vmpages(m,p)    vx_acc_pages(m, p, total_vm,  RLIMIT_AS)
+#define vx_acc_vmlpages(m,p)   vx_acc_pages(m, p, locked_vm, RLIMIT_MEMLOCK)
+#define vx_acc_rsspages(m,p)   vx_acc_pages(m, p, rss,       RLIMIT_RSS)
+
+/*
+ * Adjust only the per-context counter of resource r in context s by p
+ * pages; no plain counter is touched (v is passed as 0).
+ * vx_pages_sub() is vx_pages_add() with the count negated.
+ */
+#define vx_pages_add(s,r,p)    __vx_acc_pages(0, s, r, p, __FILE__, __LINE__)
+#define vx_pages_sub(s,r,p)    vx_pages_add(s, r, -(p))
+
+#define vx_vmpages_inc(m)      vx_acc_vmpage(m, 1)
+#define vx_vmpages_dec(m)      vx_acc_vmpage(m,-1)
+#define vx_vmpages_add(m,p)    vx_acc_vmpages(m, p)
+#define vx_vmpages_sub(m,p)    vx_acc_vmpages(m,-(p))
+
+#define vx_vmlocked_inc(m)     vx_acc_vmlpage(m, 1)
+#define vx_vmlocked_dec(m)     vx_acc_vmlpage(m,-1)
+#define vx_vmlocked_add(m,p)   vx_acc_vmlpages(m, p)
+#define vx_vmlocked_sub(m,p)   vx_acc_vmlpages(m,-(p))
+
+#define vx_rsspages_inc(m)     vx_acc_rsspage(m, 1)
+#define vx_rsspages_dec(m)     vx_acc_rsspage(m,-1)
+#define vx_rsspages_add(m,p)   vx_acc_rsspages(m, p)
+#define vx_rsspages_sub(m,p)   vx_acc_rsspages(m,-(p))
+
+
+
+#define vx_pages_avail(m, p, r) \
+        __vx_pages_avail((m)->mm_vx_info, (r), (p), __FILE__, __LINE__)
+
+/*
+ * Check whether 'pages' more pages of resource 'res' fit the context
+ * limit.
+ *
+ * Returns 1 when there is no context, when rlim[res] is RLIM_INFINITY,
+ * or when rcur[res] + pages stays within rlim[res].  Otherwise bumps
+ * lhit[res] and returns 0.  As a side effect rmax[res] is raised to the
+ * current usage if that is higher.  file/line only feed the trace
+ * message.
+ */
+static inline int __vx_pages_avail(struct vx_info *vxi,
+                int res, int pages, char *file, int line)
+{
+       unsigned long used;
+
+       if ((res == RLIMIT_RSS && VX_DEBUG_ACC_RSS) ||
+               (res == RLIMIT_AS && VX_DEBUG_ACC_VM) ||
+               (res == RLIMIT_MEMLOCK && VX_DEBUG_ACC_VML))
+               printk("vx_pages_avail[%5d,%2d]: %5ld > %5d + %5d in %s:%d\n",
+                       (vxi?vxi->vx_id:-1), res,
+                       (vxi?vxi->limit.rlim[res]:1),
+                       (vxi?atomic_read(&vxi->limit.rcur[res]):0),
+                       pages, file, line);
+
+       if (!vxi)
+               return 1;
+
+       used = atomic_read(&vxi->limit.rcur[res]);
+       /* remember the highest usage seen so far */
+       if (used > vxi->limit.rmax[res])
+               vxi->limit.rmax[res] = used;
+
+       if (vxi->limit.rlim[res] == RLIM_INFINITY ||
+               used + pages <= vxi->limit.rlim[res])
+               return 1;
+
+       atomic_inc(&vxi->limit.lhit[res]);
+       return 0;
+}
+
+#define vx_vmpages_avail(m,p)  vx_pages_avail(m, p, RLIMIT_AS)
+#define vx_vmlocked_avail(m,p) vx_pages_avail(m, p, RLIMIT_MEMLOCK)
+#define vx_rsspages_avail(m,p) vx_pages_avail(m, p, RLIMIT_RSS)
+
+#else
+#warning duplicate inclusion
+#endif
diff --git a/include/linux/vs_network.h b/include/linux/vs_network.h
new file mode 100644 (file)
index 0000000..0a3349c
--- /dev/null
@@ -0,0 +1,154 @@
+#ifndef _NX_VS_NETWORK_H
+#define _NX_VS_NETWORK_H
+
+
+// #define NX_DEBUG
+
+#include <linux/kernel.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+
+#include "vserver/network.h"
+
+#if defined(NX_DEBUG)
+#define nxdprintk(x...) printk("nxd: " x)
+#else
+#define nxdprintk(x...)
+#endif
+
+
+extern int proc_pid_nx_info(struct task_struct *, char *);
+
+
+#define get_nx_info(i) __get_nx_info(i,__FILE__,__LINE__)
+
+/*
+ * Take a usage reference on an nx_info: bumps nx_usecnt and hands the
+ * pointer back.  NULL is passed through untouched.  _file/_line are
+ * only used by the debug message.
+ */
+static inline struct nx_info *__get_nx_info(struct nx_info *nxi,
+       const char *_file, int _line)
+{
+       if (nxi) {
+               nxdprintk("get_nx_info(%p[#%d.%d])\t%s:%d\n",
+                       nxi, nxi->nx_id, atomic_read(&nxi->nx_usecnt),
+                       _file, _line);
+               atomic_inc(&nxi->nx_usecnt);
+       }
+       return nxi;
+}
+
+
+/*
+ * Hand an nx_info to call_rcu() with rcu_free_nx_info as the callback.
+ * The argument is parenthesized and the macro carries no trailing
+ * semicolon, so it behaves like a single expression (safe inside
+ * if/else and with non-trivial argument expressions).
+ */
+#define        free_nx_info(nxi)       \
+       call_rcu(&(nxi)->nx_rcu, rcu_free_nx_info, (nxi))
+
+#define put_nx_info(i) __put_nx_info(i,__FILE__,__LINE__)
+
+/*
+ * Drop a usage reference on an nx_info.  When nx_usecnt reaches zero
+ * the structure is passed to free_nx_info().  NULL is accepted and
+ * ignored.  _file/_line are only used by the debug message.
+ */
+static inline void __put_nx_info(struct nx_info *nxi, const char *_file, int _line)
+{
+       if (nxi) {
+               nxdprintk("put_nx_info(%p[#%d.%d])\t%s:%d\n",
+                       nxi, nxi->nx_id, atomic_read(&nxi->nx_usecnt),
+                       _file, _line);
+               if (atomic_dec_and_test(&nxi->nx_usecnt))
+                       free_nx_info(nxi);
+       }
+}
+
+
+#define set_nx_info(p,i) __set_nx_info(p,i,__FILE__,__LINE__)
+
+/*
+ * Attach nxi to the slot *nxp.  The slot must be empty (BUG otherwise).
+ * A NULL nxi leaves the slot untouched.  Attaching takes both a
+ * reference (nx_refcnt) and a usage count (nx_usecnt, through
+ * __get_nx_info()).
+ */
+static inline void __set_nx_info(struct nx_info **nxp, struct nx_info *nxi,
+       const char *_file, int _line)
+{
+       BUG_ON(*nxp);
+
+       if (nxi) {
+               nxdprintk("set_nx_info(%p[#%d.%d.%d])\t%s:%d\n",
+                       nxi, nxi->nx_id,
+                       atomic_read(&nxi->nx_usecnt),
+                       atomic_read(&nxi->nx_refcnt),
+                       _file, _line);
+               atomic_inc(&nxi->nx_refcnt);
+               *nxp = __get_nx_info(nxi, _file, _line);
+       }
+}
+
+#define        clr_nx_info(p)  __clr_nx_info(p,__FILE__,__LINE__)
+
+/*
+ * Detach whatever nx_info the slot *nxp holds.  The slot is cleared
+ * first (followed by a write barrier), then the reference taken by
+ * __set_nx_info() is dropped: the last nx_refcnt holder unhashes the
+ * structure, and the usage count is released through __put_nx_info().
+ * An empty slot is a no-op.
+ */
+static inline void __clr_nx_info(struct nx_info **nxp,
+       const char *_file, int _line)
+{
+       struct nx_info *old = *nxp;
+
+       if (old == NULL)
+               return;
+
+       nxdprintk("clr_nx_info(%p[#%d.%d.%d])\t%s:%d\n",
+               old, old->nx_id,
+               atomic_read(&old->nx_usecnt),
+               atomic_read(&old->nx_refcnt),
+               _file, _line);
+
+       *nxp = NULL;
+       wmb();
+
+       if (atomic_dec_and_test(&old->nx_refcnt))
+               unhash_nx_info(old);
+       __put_nx_info(old, _file, _line);
+}
+
+
+#define task_get_nx_info(i)    __task_get_nx_info(i,__FILE__,__LINE__)
+
+/*
+ * Fetch task p's nx_info with a usage reference taken on it.  The
+ * pointer is read and referenced under task_lock(p).  Returns NULL when
+ * the task has no nx_info.
+ */
+static __inline__ struct nx_info *__task_get_nx_info(struct task_struct *p,
+       const char *_file, int _line)
+{
+       struct nx_info *held;
+
+       task_lock(p);
+       held = __get_nx_info(p->nx_info, _file, _line);
+       task_unlock(p);
+
+       return held;
+}
+
+#define nx_verify_info(p,i)    \
+       __nx_verify_info((p)->nx_info,i,__FILE__,__LINE__)
+
+/*
+ * Sanity check: log an error naming the call site when the two nx_info
+ * pointers differ.  Nothing else is done about the mismatch.
+ */
+static __inline__ void __nx_verify_info(
+       struct nx_info *ipa, struct nx_info *ipb,
+       const char *_file, int _line)
+{
+       if (ipa != ipb)
+               printk(KERN_ERR "ip bad assumption (%p==%p) at %s:%d\n",
+                       ipa, ipb, _file, _line);
+}
+
+
+#define nx_task_nid(t) ((t)->nid)
+
+#define nx_current_nid() nx_task_nid(current)
+
+#define nx_check(c,m)  __nx_check(nx_current_nid(),c,m)
+
+#define nx_weak_check(c,m)     ((m) ? nx_check(c,m) : 1)
+
+#undef nxdprintk
+#define nxdprintk(x...)
+
+
+#define __nx_flags(v,m,f)      (((v) & (m)) ^ (f))
+
+#define        __nx_task_flags(t,m,f) \
+       (((t) && ((t)->nx_info)) ? \
+               __nx_flags((t)->nx_info->nx_flags,(m),(f)) : 0)
+
+#define nx_current_flags() \
+       ((current->nx_info) ? current->nx_info->nx_flags : 0)
+
+#define nx_flags(m,f)  __nx_flags(nx_current_flags(),(m),(f))
+
+
+#define nx_current_ncaps() \
+       ((current->nx_info) ? current->nx_info->nx_ncaps : 0)
+
+#define nx_ncaps(c)    (nx_current_ncaps() & (c))
+
+
+
+#else
+#warning duplicate inclusion
+#endif
diff --git a/include/linux/vs_socket.h b/include/linux/vs_socket.h
new file mode 100644 (file)
index 0000000..4992458
--- /dev/null
@@ -0,0 +1,65 @@
+/* the guard must be unique to this header: _VX_VS_LIMIT_H is already
+   used by vs_limit.h, and sharing it makes whichever of the two headers
+   is included second expand to nothing but the duplicate warning */
+#ifndef _VX_VS_SOCKET_H
+#define _VX_VS_SOCKET_H
+
+
+// #define VX_DEBUG
+
+#include <linux/kernel.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+
+#include "vserver/context.h"
+#include "vserver/network.h"
+
+
+/* socket accounting */
+
+#include <linux/socket.h>
+
+/*
+ * Map a socket family to an accounting slot: families 1 and 2 map to
+ * themselves, PF_INET6 maps to 3, everything else shares slot 4.
+ */
+static inline int vx_sock_type(int family)
+{
+       if (family == 1 || family == 2)
+               return family;
+       return (family == PF_INET6) ? 3 : 4;
+}
+
+#define vx_acc_sock(v,f,p,s) \
+       __vx_acc_sock((v), (f), (p), (s), __FILE__, __LINE__)
+
+/*
+ * Record one socket event of 'size' bytes for a context: bumps the
+ * event count and adds size to the running total in
+ * vxi->cacct.sock[type][pos], where type comes from vx_sock_type(family).
+ * Does nothing when vxi is NULL.  file/line are unused.
+ */
+static inline void __vx_acc_sock(struct vx_info *vxi,
+       int family, int pos, int size, char *file, int line)
+{
+       int slot;
+
+       if (!vxi)
+               return;
+
+       slot = vx_sock_type(family);
+       atomic_inc(&vxi->cacct.sock[slot][pos].count);
+       atomic_add(size, &vxi->cacct.sock[slot][pos].total);
+}
+
+#define vx_sock_recv(sk,s) \
+       vx_acc_sock((sk)->sk_vx_info, (sk)->sk_family, 0, (s))
+#define vx_sock_send(sk,s) \
+       vx_acc_sock((sk)->sk_vx_info, (sk)->sk_family, 1, (s))
+#define vx_sock_fail(sk,s) \
+       vx_acc_sock((sk)->sk_vx_info, (sk)->sk_family, 2, (s))
+
+
+#define        sock_vx_init(s)  do {           \
+       (s)->sk_xid = 0;                \
+       (s)->sk_vx_info = NULL;         \
+       } while (0)
+
+#define        sock_nx_init(s)  do {           \
+       (s)->sk_nid = 0;                \
+       (s)->sk_nx_info = NULL;         \
+       } while (0)
+
+
+#else
+#warning duplicate inclusion
+#endif
diff --git a/include/linux/vserver/dlimit.h b/include/linux/vserver/dlimit.h
new file mode 100644 (file)
index 0000000..74872ed
--- /dev/null
@@ -0,0 +1,83 @@
+#ifndef _VX_DLIMIT_H
+#define _VX_DLIMIT_H
+
+#include "switch.h"
+#include <linux/spinlock.h>
+
+/*  disk limit (dlimit) vserver commands */
+
+#define VCMD_add_dlimit                VC_CMD(DLIMIT, 1, 0)
+#define VCMD_rem_dlimit                VC_CMD(DLIMIT, 2, 0)
+
+#define VCMD_set_dlimit                VC_CMD(DLIMIT, 5, 0)
+#define VCMD_get_dlimit                VC_CMD(DLIMIT, 6, 0)
+
+
+/* Argument block copied from user space by vc_add_dlimit() and
+   vc_rem_dlimit().  Only 'name' is read by those two functions; it is
+   a user-space path resolved with user_path_walk_link(). */
+struct  vcmd_ctx_dlimit_base_v0 {
+       const char __user *name;
+       uint32_t flags;
+};
+
+/* Argument block exchanged with user space by vc_set_dlimit() and
+   vc_get_dlimit().  Space values are in kbytes here and are shifted by
+   10 bits to and from the byte counters kept in struct dl_info.
+   vc_set_dlimit() compares each field against the 32-bit truncation of
+   CDLIM_KEEP, and space_total also against that of CDLIM_INFINITY. */
+struct  vcmd_ctx_dlimit_v0 {
+       const char __user *name;
+       uint32_t space_used;                    /* used space in kbytes */
+       uint32_t space_total;                   /* maximum space in kbytes */
+       uint32_t inodes_used;                   /* used inodes */
+       uint32_t inodes_total;                  /* maximum inodes */
+       uint32_t reserved;                      /* reserved for root in % */
+       uint32_t flags;
+};
+
+#define CDLIM_UNSET             (0ULL)
+#define CDLIM_INFINITY          (~0ULL)
+#define CDLIM_KEEP              (~1ULL)
+
+
+#ifdef __KERNEL__
+
+struct super_block;
+
+/* One disk limit entry, keyed by (dl_sb, dl_xid) in the dl_info hash.
+   dl_usecnt is taken by get_dl_info() and dropped by put_dl_info();
+   when it reaches zero the entry is handed to call_rcu() through
+   dl_rcu.  The four usage/limit values are read and written under
+   dl_lock by the accounting helpers. */
+struct dl_info {
+       struct hlist_node dl_hlist;             /* linked list of contexts */
+       struct rcu_head dl_rcu;                 /* the rcu head */
+       xid_t dl_xid;                           /* context id */
+       atomic_t dl_usecnt;                     /* usage count */
+       atomic_t dl_refcnt;                     /* reference count */
+
+       struct super_block *dl_sb;              /* associated superblock */
+
+//     struct rw_semaphore dl_sem;             /* protect the values */
+       spinlock_t dl_lock;                     /* protect the values */
+
+       uint64_t dl_space_used;                 /* used space in bytes */
+       uint64_t dl_space_total;                /* maximum space in bytes */
+       uint32_t dl_inodes_used;                /* used inodes */
+       uint32_t dl_inodes_total;               /* maximum inodes */
+
+       /* vc_set_dlimit() stores (1 << 10) * (100 - reserved) / 100 here,
+          i.e. the non-root share of the limit scaled by 1024 */
+       unsigned int dl_nrlmult;                /* non root limit mult */
+};
+
+extern void rcu_free_dl_info(void *);
+extern void unhash_dl_info(struct dl_info *);
+
+extern struct dl_info *locate_dl_info(struct super_block *, xid_t);
+
+
+struct kstatfs;
+
+extern void vx_vsi_statfs(struct super_block *, struct kstatfs *);
+
+
+extern int vc_add_dlimit(uint32_t, void __user *);
+extern int vc_rem_dlimit(uint32_t, void __user *);
+
+extern int vc_set_dlimit(uint32_t, void __user *);
+extern int vc_get_dlimit(uint32_t, void __user *);
+
+
+typedef        uint64_t dlsize_t;
+
+
+#endif /* __KERNEL__ */
+
+#endif /* _VX_DLIMIT_H */
diff --git a/kernel/vserver/dlimit.c b/kernel/vserver/dlimit.c
new file mode 100644 (file)
index 0000000..eb9282f
--- /dev/null
@@ -0,0 +1,439 @@
+/*
+ *  linux/kernel/vserver/dlimit.c
+ *
+ *  Virtual Server: Context Disk Limits
+ *
+ *  Copyright (C) 2004  Herbert Pötzl
+ *
+ *  V0.01  initial version
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/fs.h>
+#include <linux/namespace.h>
+#include <linux/namei.h>
+#include <linux/statfs.h>
+#include <linux/vserver/switch.h>
+#include <linux/vs_base.h>
+#include <linux/vs_context.h>
+#include <linux/vs_dlimit.h>
+
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+/*     __alloc_dl_info()
+
+       * allocates a zeroed dl_info bound to (sb, xid), with both
+         counters at zero and the lock, rcu head and hash node set up
+       * returns NULL when the allocation fails
+       * does not insert the entry into the hash                */
+
+static struct dl_info *__alloc_dl_info(struct super_block *sb, xid_t xid)
+{
+       struct dl_info *dli;
+
+       vxdprintk("alloc_dl_info(%p,%d)\n", sb, xid);
+
+       /* would this benefit from a slab cache? */
+       dli = kmalloc(sizeof(struct dl_info), GFP_KERNEL);
+       if (dli == NULL)
+               return NULL;
+       memset (dli, 0, sizeof(struct dl_info));
+
+       dli->dl_xid = xid;
+       dli->dl_sb = sb;
+       INIT_RCU_HEAD(&dli->dl_rcu);
+       INIT_HLIST_NODE(&dli->dl_hlist);
+       spin_lock_init(&dli->dl_lock);
+       atomic_set(&dli->dl_refcnt, 0);
+       atomic_set(&dli->dl_usecnt, 0);
+
+       /* rest of init goes here */
+
+       vxdprintk("alloc_dl_info(%p,%d) = %p\n", sb, xid, dli);
+       return dli;
+}
+
+/*     __dealloc_dl_info()
+
+       * final disposal of a dl_info: poisons the identifying fields,
+         insists that both counters are zero, then frees the memory   */
+
+static void __dealloc_dl_info(struct dl_info *dli)
+{
+       vxdprintk("dealloc_dl_info(%p)\n", dli);
+
+       /* make stale users of this entry easy to spot */
+       dli->dl_hlist.next = LIST_POISON1;
+       dli->dl_xid = -1;
+       dli->dl_sb = NULL;
+
+       BUG_ON(atomic_read(&dli->dl_usecnt) != 0);
+       BUG_ON(atomic_read(&dli->dl_refcnt) != 0);
+
+       kfree(dli);
+}
+
+
+/*     hash table for dl_info hash */
+
+#define        DL_HASH_SIZE    13
+
+struct hlist_head dl_info_hash[DL_HASH_SIZE];
+
+static spinlock_t dl_info_hash_lock = SPIN_LOCK_UNLOCKED;
+
+
+/*
+ * Bucket index for (sb, xid): xid XORed with the low bits of the
+ * superblock address, modulo the table size.  The pointer is converted
+ * through unsigned long first; casting a pointer directly to a narrower
+ * unsigned int is not portable to 64-bit targets.  The resulting value
+ * is the same as before.
+ */
+static inline unsigned int __hashval(struct super_block *sb, xid_t xid)
+{
+       return ((xid ^ (unsigned int)(unsigned long)sb) % DL_HASH_SIZE);
+}
+
+
+
+/*     __hash_dl_info()
+
+       * inserts the dli into the global hash table; the table
+         holds a usage count on the entry while it is hashed
+       * requires the hash_lock to be held                     */
+
+static inline void __hash_dl_info(struct dl_info *dli)
+{
+       unsigned int bucket = __hashval(dli->dl_sb, dli->dl_xid);
+
+       vxdprintk("__hash_dl_info: %p[#%d]\n", dli, dli->dl_xid);
+       get_dl_info(dli);
+       hlist_add_head_rcu(&dli->dl_hlist, &dl_info_hash[bucket]);
+}
+
+/*     __unhash_dl_info()
+
+       * takes the dli out of the global hash table and drops
+         the usage count the table held on it
+       * requires the hash_lock to be held                     */
+
+static inline void __unhash_dl_info(struct dl_info *dli)
+{
+       vxdprintk("__unhash_dl_info: %p[#%d]\n", dli, dli->dl_xid);
+
+       hlist_del_rcu(&dli->dl_hlist);
+       put_dl_info(dli);
+}
+
+
+/* Walk an hlist bucket: pos visits each node starting at (head)->first.
+   The next node is prefetched on every step, and
+   smp_read_barrier_depends() is issued after each advance of pos. */
+#define hlist_for_each_rcu(pos, head) \
+       for (pos = (head)->first; pos && ({ prefetch(pos->next); 1;}); \
+               pos = pos->next, ({ smp_read_barrier_depends(); 0;}))
+
+
+/*     __lookup_dl_info()
+
+       * returns the hashed entry matching (sb, xid), or NULL
+       * requires the rcu_read_lock()
+       * takes no reference on the entry it returns            */
+
+static inline struct dl_info *__lookup_dl_info(struct super_block *sb, xid_t xid)
+{
+       struct hlist_head *bucket = &dl_info_hash[__hashval(sb, xid)];
+       struct hlist_node *node;
+       struct dl_info *entry;
+
+       hlist_for_each_rcu(node, bucket) {
+               entry = hlist_entry(node, struct dl_info, dl_hlist);
+               if (entry->dl_xid != xid || entry->dl_sb != sb)
+                       continue;
+               return entry;
+       }
+       return NULL;
+}
+
+
+/*
+ * Find the disk limit entry for (sb, xid) and return it with a usage
+ * count taken, or NULL when none is hashed.  The caller releases the
+ * entry with put_dl_info().
+ */
+struct dl_info *locate_dl_info(struct super_block *sb, xid_t xid)
+{
+       struct dl_info *found;
+
+       rcu_read_lock();
+       found = __lookup_dl_info(sb, xid);
+       found = get_dl_info(found);
+       rcu_read_unlock();
+
+       return found;
+}
+
+/*
+ * RCU callback installed by free_dl_info().  Frees the entry if its
+ * usage count is still zero; if it has been taken again in the
+ * meantime the entry is left alone and a message is logged.
+ * A negative counter is treated as a bug.
+ */
+void rcu_free_dl_info(void *obj)
+{
+       struct dl_info *dli = obj;
+       int usecnt, refcnt;
+
+       BUG_ON(!dli);
+
+       usecnt = atomic_read(&dli->dl_usecnt);
+       BUG_ON(usecnt < 0);
+
+       refcnt = atomic_read(&dli->dl_refcnt);
+       BUG_ON(refcnt < 0);
+
+       if (usecnt) {
+               printk("!!! rcu didn't free\n");
+               return;
+       }
+       __dealloc_dl_info(dli);
+}
+
+
+
+
+/*
+ * vc_add_dlimit()
+ *
+ * Create a disk limit entry for context 'id' on the filesystem that
+ * holds the path named in the user-supplied vcmd_ctx_dlimit_base_v0.
+ *
+ * Returns 0 on success, -ENOSYS when vx_check(0, VX_ADMIN) fails,
+ * -EFAULT when the argument block cannot be copied, the path lookup
+ * error if the walk fails, -EINVAL when the path has no inode or
+ * superblock, -ENOMEM when the entry cannot be allocated and -EEXIST
+ * when an entry for (sb, id) is already hashed.
+ */
+int vc_add_dlimit(uint32_t id, void __user *data)
+{
+       struct nameidata nd;
+       struct vcmd_ctx_dlimit_base_v0 vc_data;
+       int ret;
+
+       if (!vx_check(0, VX_ADMIN))
+               return -ENOSYS;
+       if (copy_from_user (&vc_data, data, sizeof(vc_data)))
+               return -EFAULT;
+
+       ret = user_path_walk_link(vc_data.name, &nd);
+       if (!ret) {
+               struct super_block *sb;
+               struct dl_info *dli;
+
+               ret = -EINVAL;
+               if (!nd.dentry->d_inode)
+                       goto out_release;
+               if (!(sb = nd.dentry->d_inode->i_sb))
+                       goto out_release;
+
+               /* allocate outside the spinlock (GFP_KERNEL allocation)
+                  and bail out on failure: __hash_dl_info() would
+                  otherwise dereference a NULL entry */
+               ret = -ENOMEM;
+               dli = __alloc_dl_info(sb, id);
+               if (!dli)
+                       goto out_release;
+
+               spin_lock(&dl_info_hash_lock);
+
+               ret = -EEXIST;
+               if (__lookup_dl_info(sb, id))
+                       goto out_unlock;
+               __hash_dl_info(dli);
+               /* the hash owns the entry now, don't free it below */
+               dli = NULL;
+               ret = 0;
+
+       out_unlock:
+               spin_unlock(&dl_info_hash_lock);
+               if (dli)
+                       __dealloc_dl_info(dli);
+       out_release:
+               path_release(&nd);
+       }
+       return ret;
+}
+
+
+/*
+ * vc_rem_dlimit()
+ *
+ * Remove the disk limit entry of context 'id' on the filesystem that
+ * holds the path named in the user-supplied vcmd_ctx_dlimit_base_v0.
+ *
+ * Returns 0 on success, -ENOSYS when vx_check(0, VX_ADMIN) fails,
+ * -EFAULT when the argument block cannot be copied, the path lookup
+ * error if the walk fails, -EINVAL when the path has no inode or
+ * superblock and -ESRCH when no entry for (sb, id) is hashed.
+ */
+int vc_rem_dlimit(uint32_t id, void __user *data)
+{
+       struct vcmd_ctx_dlimit_base_v0 vc_data;
+       struct super_block *sb;
+       struct dl_info *dli;
+       struct nameidata nd;
+       int ret;
+
+       if (!vx_check(0, VX_ADMIN))
+               return -ENOSYS;
+       if (copy_from_user (&vc_data, data, sizeof(vc_data)))
+               return -EFAULT;
+
+       ret = user_path_walk_link(vc_data.name, &nd);
+       if (ret)
+               return ret;
+
+       ret = -EINVAL;
+       if (!nd.dentry->d_inode)
+               goto out_release;
+       sb = nd.dentry->d_inode->i_sb;
+       if (!sb)
+               goto out_release;
+
+       spin_lock(&dl_info_hash_lock);
+       dli = __lookup_dl_info(sb, id);
+       if (dli) {
+               __unhash_dl_info(dli);
+               ret = 0;
+       } else {
+               ret = -ESRCH;
+       }
+       spin_unlock(&dl_info_hash_lock);
+
+out_release:
+       path_release(&nd);
+       return ret;
+}
+
+
+/*
+ * vc_set_dlimit()
+ *
+ * Update the disk limit entry of context 'id' on the filesystem that
+ * holds the path named in the user-supplied vcmd_ctx_dlimit_v0.
+ * A field equal to (uint32_t)CDLIM_KEEP leaves the stored value alone;
+ * space_total equal to (uint32_t)CDLIM_INFINITY stores the 64-bit
+ * CDLIM_INFINITY.  Space values arrive in kbytes and are stored in
+ * bytes.
+ *
+ * Returns 0 on success, -ENOSYS when vx_check(0, VX_ADMIN) fails,
+ * -EFAULT when the argument block cannot be copied, the path lookup
+ * error if the walk fails, -EINVAL when the path has no inode or
+ * superblock or the values fail validation, and -ESRCH when no entry
+ * for (sb, id) exists.
+ *
+ * NOTE(review): the validation runs on the raw values, so a "used"
+ * field set to CDLIM_KEEP is rejected unless the matching "total" is at
+ * least as large, and reserved == CDLIM_KEEP always fails the > 100
+ * test.  Confirm whether that is intended.
+ */
+int vc_set_dlimit(uint32_t id, void __user *data)
+{
+       const uint32_t keep = (uint32_t)CDLIM_KEEP;
+       struct vcmd_ctx_dlimit_v0 vc_data;
+       struct super_block *sb;
+       struct dl_info *dli;
+       struct nameidata nd;
+       int ret;
+
+       if (!vx_check(0, VX_ADMIN))
+               return -ENOSYS;
+       if (copy_from_user (&vc_data, data, sizeof(vc_data)))
+               return -EFAULT;
+
+       ret = user_path_walk_link(vc_data.name, &nd);
+       if (ret)
+               return ret;
+
+       ret = -EINVAL;
+       if (!nd.dentry->d_inode)
+               goto out_release;
+       sb = nd.dentry->d_inode->i_sb;
+       if (!sb)
+               goto out_release;
+       if (vc_data.reserved > 100 ||
+               vc_data.inodes_used > vc_data.inodes_total ||
+               vc_data.space_used > vc_data.space_total)
+               goto out_release;
+
+       ret = -ESRCH;
+       dli = locate_dl_info(sb, id);
+       if (!dli)
+               goto out_release;
+
+       spin_lock(&dli->dl_lock);
+
+       if (vc_data.inodes_used != keep)
+               dli->dl_inodes_used = vc_data.inodes_used;
+       if (vc_data.inodes_total != keep)
+               dli->dl_inodes_total = vc_data.inodes_total;
+
+       /* kbytes from user space become bytes in the entry */
+       if (vc_data.space_used != keep)
+               dli->dl_space_used = (uint64_t)vc_data.space_used << 10;
+
+       if (vc_data.space_total == (uint32_t)CDLIM_INFINITY)
+               dli->dl_space_total = (uint64_t)CDLIM_INFINITY;
+       else if (vc_data.space_total != keep)
+               dli->dl_space_total = (uint64_t)vc_data.space_total << 10;
+
+       if (vc_data.reserved != keep)
+               dli->dl_nrlmult = (1 << 10) * (100 - vc_data.reserved) / 100;
+
+       spin_unlock(&dli->dl_lock);
+
+       put_dl_info(dli);
+       ret = 0;
+
+out_release:
+       path_release(&nd);
+       return ret;
+}
+
+/*
+ * vc_get_dlimit()
+ *
+ * Report the disk limit entry of context 'id' on the filesystem that
+ * holds the path named in the user-supplied vcmd_ctx_dlimit_v0.  The
+ * block is filled in and copied back to user space.  Space values are
+ * converted from bytes to kbytes, and an unlimited space_total is
+ * reported as (uint32_t)CDLIM_INFINITY.
+ *
+ * Returns 0 on success, -ENOSYS when vx_check(0, VX_ADMIN) fails,
+ * -EFAULT when the argument block cannot be copied in or out, the path
+ * lookup error if the walk fails, -EINVAL when the path has no inode or
+ * superblock or the incoming values fail validation, and -ESRCH when
+ * no entry for (sb, id) exists.
+ *
+ * NOTE(review): the incoming value fields are validated exactly as in
+ * vc_set_dlimit() although they are only overwritten here, so callers
+ * have to pass a consistent (e.g. zeroed) block.  Also, the kbyte
+ * values are stored into 32-bit fields and may be truncated for very
+ * large limits.  Confirm both are intended.
+ */
+int vc_get_dlimit(uint32_t id, void __user *data)
+{
+       struct vcmd_ctx_dlimit_v0 vc_data;
+       struct super_block *sb;
+       struct dl_info *dli;
+       struct nameidata nd;
+       int ret;
+
+       if (!vx_check(0, VX_ADMIN))
+               return -ENOSYS;
+       if (copy_from_user (&vc_data, data, sizeof(vc_data)))
+               return -EFAULT;
+
+       ret = user_path_walk_link(vc_data.name, &nd);
+       if (ret)
+               return ret;
+
+       ret = -EINVAL;
+       if (!nd.dentry->d_inode)
+               goto out_release;
+       sb = nd.dentry->d_inode->i_sb;
+       if (!sb)
+               goto out_release;
+       if (vc_data.reserved > 100 ||
+               vc_data.inodes_used > vc_data.inodes_total ||
+               vc_data.space_used > vc_data.space_total)
+               goto out_release;
+
+       ret = -ESRCH;
+       dli = locate_dl_info(sb, id);
+       if (!dli)
+               goto out_release;
+
+       /* snapshot the entry under its lock */
+       spin_lock(&dli->dl_lock);
+       vc_data.inodes_used = dli->dl_inodes_used;
+       vc_data.inodes_total = dli->dl_inodes_total;
+       vc_data.space_used = dli->dl_space_used >> 10;
+       if (dli->dl_space_total == (uint64_t)CDLIM_INFINITY)
+               vc_data.space_total = (uint32_t)CDLIM_INFINITY;
+       else
+               vc_data.space_total = dli->dl_space_total >> 10;
+       vc_data.reserved = 100 - ((dli->dl_nrlmult * 100 + 512) >> 10);
+       spin_unlock(&dli->dl_lock);
+
+       put_dl_info(dli);
+
+       if (copy_to_user(data, &vc_data, sizeof(vc_data)))
+               ret = -EFAULT;
+       else
+               ret = 0;
+
+out_release:
+       path_release(&nd);
+       return ret;
+}
+
+
+/*
+ * vx_vsi_statfs()
+ *
+ * Clamp the statfs figures in buf to the disk limit of the current
+ * context (current->xid) on sb.  buf is left untouched when no limit
+ * entry exists.  Inode figures are clamped unless dl_inodes_total is
+ * (uint32_t)CDLIM_INFINITY.  Block figures are clamped unless
+ * dl_space_total is CDLIM_INFINITY.  Values are only ever lowered.
+ */
+void vx_vsi_statfs(struct super_block *sb, struct kstatfs *buf)
+{
+       struct dl_info *dli = locate_dl_info(sb, current->xid);
+       __u64 blimit, bfree, bavail;
+       __u32 ifree;
+
+       if (!dli)
+               return;
+
+       spin_lock(&dli->dl_lock);
+
+       if (dli->dl_inodes_total != (uint32_t)CDLIM_INFINITY) {
+               /* reduce max inodes available to limit */
+               if (buf->f_files > dli->dl_inodes_total)
+                       buf->f_files = dli->dl_inodes_total;
+
+               /* reduce free inodes to min */
+               ifree = dli->dl_inodes_total - dli->dl_inodes_used;
+               if (ifree < buf->f_ffree)
+                       buf->f_ffree = ifree;
+       }
+
+       if (dli->dl_space_total != (uint64_t)CDLIM_INFINITY) {
+               blimit = dli->dl_space_total >> sb->s_blocksize_bits;
+
+               bfree = 0;
+               if (dli->dl_space_total >= dli->dl_space_used)
+                       bfree = (dli->dl_space_total - dli->dl_space_used)
+                               >> sb->s_blocksize_bits;
+
+               /* non-root share of the limit, dl_nrlmult is scaled by 1024 */
+               bavail = ((dli->dl_space_total >> 10) * dli->dl_nrlmult);
+               if (bavail < dli->dl_space_used)
+                       bavail = 0;
+               else
+                       bavail = (bavail - dli->dl_space_used)
+                               >> sb->s_blocksize_bits;
+
+               /* reduce max space available to limit */
+               if (buf->f_blocks > blimit)
+                       buf->f_blocks = blimit;
+
+               /* reduce free space to min */
+               if (bfree < buf->f_bfree)
+                       buf->f_bfree = bfree;
+
+               /* reduce avail space to min */
+               if (bavail < buf->f_bavail)
+                       buf->f_bavail = bavail;
+       }
+
+       spin_unlock(&dli->dl_lock);
+       put_dl_info(dli);
+}
+
diff --git a/kernel/vserver/helper.c b/kernel/vserver/helper.c
new file mode 100644 (file)
index 0000000..880b843
--- /dev/null
@@ -0,0 +1,92 @@
+/*
+ *  linux/kernel/vserver/helper.c
+ *
+ *  Virtual Context Support
+ *
+ *  Copyright (C) 2004  Herbert Pötzl
+ *
+ *  V0.01  basic helper
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/errno.h>
+#include <linux/reboot.h>
+#include <linux/kmod.h>
+#include <linux/vserver.h>
+#include <linux/vs_base.h>
+#include <linux/vs_context.h>
+
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+
+
+/* executable started by vs_reboot() as argv[0]; defaults to /sbin/vshelper */
+char vshelper_path[255] = "/sbin/vshelper";
+
+
+/*
+ *      vs_reboot - pass a reboot request on to the userspace helper
+ *
+ *      vshelper path is set via /proc/sys
+ *      invoked by vserver sys_reboot(), with
+ *      the following arguments
+ *
+ *      argv [0] = vshelper_path;
+ *      argv [1] = action: "restart", "halt", "poweroff", "swsusp",
+ *                 "restart2"
+ *      argv [2] = context identifier
+ *      argv [3] = additional argument (restart2 only, copied from @arg)
+ *
+ *      envp [*] = HOME, TERM and PATH plus VS_UID, VS_PID and VS_CMD
+ *
+ *      @cmd: LINUX_REBOOT_CMD_* value; any value not handled below is
+ *            reported to the helper as "restart2" without argv [3]
+ *      @arg: user space pointer to a string, only read for
+ *            LINUX_REBOOT_CMD_RESTART2
+ *
+ *      Returns 0 on success, -EFAULT if @arg cannot be read and -EPERM
+ *      if call_usermodehelper() reports a failure.
+ */
+
+long vs_reboot(unsigned int cmd, void * arg)
+{
+       char id_buf[8], cmd_buf[32];
+       char uid_buf[32], pid_buf[32];
+       char buffer[256];
+
+       char *argv[] = {vshelper_path, NULL, id_buf, NULL, NULL};
+       char *envp[] = {"HOME=/", "TERM=linux",
+                       "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+                       uid_buf, pid_buf, cmd_buf, NULL};
+
+       /* snprintf() always terminates within the size it is given */
+       snprintf(id_buf, sizeof(id_buf), "%d", vx_current_xid());
+
+       snprintf(cmd_buf, sizeof(cmd_buf), "VS_CMD=%08x", cmd);
+       snprintf(uid_buf, sizeof(uid_buf), "VS_UID=%d", current->uid);
+       snprintf(pid_buf, sizeof(pid_buf), "VS_PID=%d", current->pid);
+
+       switch (cmd) {
+       case LINUX_REBOOT_CMD_RESTART:
+               argv[1] = "restart";
+               break;
+
+       case LINUX_REBOOT_CMD_HALT:
+               argv[1] = "halt";
+               break;
+
+       case LINUX_REBOOT_CMD_POWER_OFF:
+               argv[1] = "poweroff";
+               break;
+
+       case LINUX_REBOOT_CMD_SW_SUSPEND:
+               argv[1] = "swsusp";
+               break;
+
+       case LINUX_REBOOT_CMD_RESTART2:
+               if (strncpy_from_user(buffer, (char *)arg, sizeof(buffer) - 1) < 0)
+                       return -EFAULT;
+               /*
+                * strncpy_from_user() does not terminate the destination
+                * when the source is at least as long as the count, and
+                * buffer starts out uninitialised
+                */
+               buffer[sizeof(buffer) - 1] = '\0';
+               argv[3] = buffer;
+               /* fall through */
+       default:
+               argv[1] = "restart2";
+               break;
+       }
+
+       /* maybe we should wait ? */
+       if (call_usermodehelper(*argv, argv, envp, 0)) {
+               printk( KERN_WARNING
+                       "vs_reboot(): failed to exec (%s %s %s %s)\n",
+                       vshelper_path, argv[1], argv[2], argv[3]);
+               return -EPERM;
+       }
+       return 0;
+}
+