Initial revision
author Marc Fiuczynski <mef@cs.princeton.edu>
Wed, 16 Jun 2004 18:15:57 +0000 (18:15 +0000)
committer Marc Fiuczynski <mef@cs.princeton.edu>
Wed, 16 Jun 2004 18:15:57 +0000 (18:15 +0000)
57 files changed:
Documentation/filesystems/relayfs.txt [new file with mode: 0644]
fs/rcfs/Makefile [new file with mode: 0644]
fs/rcfs/dir.c [new file with mode: 0644]
fs/rcfs/inode.c [new file with mode: 0644]
fs/rcfs/magic.c [new file with mode: 0644]
fs/rcfs/rootdir.c [new file with mode: 0644]
fs/rcfs/socket_fs.c [new file with mode: 0644]
fs/rcfs/super.c [new file with mode: 0644]
fs/rcfs/tc_magic.c [new file with mode: 0644]
fs/relayfs/Makefile [new file with mode: 0644]
fs/relayfs/inode.c [new file with mode: 0644]
fs/relayfs/klog.c [new file with mode: 0644]
fs/relayfs/relay.c [new file with mode: 0644]
fs/relayfs/relay_locking.c [new file with mode: 0644]
fs/relayfs/relay_locking.h [new file with mode: 0644]
fs/relayfs/relay_lockless.c [new file with mode: 0644]
fs/relayfs/relay_lockless.h [new file with mode: 0644]
fs/relayfs/resize.c [new file with mode: 0644]
fs/relayfs/resize.h [new file with mode: 0644]
include/asm-alpha/relay.h [new file with mode: 0644]
include/asm-arm/relay.h [new file with mode: 0644]
include/asm-arm26/relay.h [new file with mode: 0644]
include/asm-cris/relay.h [new file with mode: 0644]
include/asm-generic/relay.h [new file with mode: 0644]
include/asm-h8300/relay.h [new file with mode: 0644]
include/asm-i386/relay.h [new file with mode: 0644]
include/asm-ia64/relay.h [new file with mode: 0644]
include/asm-m68k/relay.h [new file with mode: 0644]
include/asm-m68knommu/relay.h [new file with mode: 0644]
include/asm-mips/relay.h [new file with mode: 0644]
include/asm-mips64/relay.h [new file with mode: 0644]
include/asm-parisc/relay.h [new file with mode: 0644]
include/asm-ppc/relay.h [new file with mode: 0644]
include/asm-ppc64/relay.h [new file with mode: 0644]
include/asm-s390/relay.h [new file with mode: 0644]
include/asm-sh/relay.h [new file with mode: 0644]
include/asm-sparc/relay.h [new file with mode: 0644]
include/asm-sparc64/relay.h [new file with mode: 0644]
include/asm-v850/relay.h [new file with mode: 0644]
include/asm-x86_64/relay.h [new file with mode: 0644]
include/linux/ckrm.h [new file with mode: 0644]
include/linux/ckrm_ce.h [new file with mode: 0644]
include/linux/ckrm_net.h [new file with mode: 0644]
include/linux/ckrm_rc.h [new file with mode: 0644]
include/linux/ckrm_tc.h [new file with mode: 0644]
include/linux/ckrm_tsk.h [new file with mode: 0644]
include/linux/klog.h [new file with mode: 0644]
include/linux/rcfs.h [new file with mode: 0644]
include/linux/relayfs_fs.h [new file with mode: 0644]
include/linux/taskdelays.h [new file with mode: 0644]
kernel/ckrm/Makefile [new file with mode: 0644]
kernel/ckrm/ckrm.c [new file with mode: 0644]
kernel/ckrm/ckrm_listenaq.c [new file with mode: 0644]
kernel/ckrm/ckrm_sockc.c [new file with mode: 0644]
kernel/ckrm/ckrm_tasks.c [new file with mode: 0644]
kernel/ckrm/ckrm_tc.c [new file with mode: 0644]
kernel/ckrm/ckrmutils.c [new file with mode: 0644]

diff --git a/Documentation/filesystems/relayfs.txt b/Documentation/filesystems/relayfs.txt
new file mode 100644 (file)
index 0000000..7397bdb
--- /dev/null
@@ -0,0 +1,812 @@
+
+relayfs - a high-speed data relay filesystem
+============================================
+
+relayfs is a filesystem designed to provide an efficient mechanism for
+tools and facilities to relay large amounts of data from kernel space
+to user space.
+
+The main idea behind relayfs is that every data flow is put into a
+separate "channel" and each channel is a file.  In practice, each
+channel is a separate memory buffer allocated from within kernel space
+upon channel instantiation. Software needing to relay data to user
+space would open a channel or a number of channels, depending on its
+needs, and would log data to that channel. All the buffering and
+locking mechanics are taken care of by relayfs.  The actual format and
+protocol used for each channel is up to relayfs' clients.
+
+relayfs makes no provisions for copying the same data to more than a
+single channel. This is for the clients of the relay to take care of,
+and so is any form of data filtering. The purpose is to keep relayfs
+as simple as possible.
+
+
+Usage
+=====
+
+In addition to the relayfs kernel API described below, relayfs
+implements basic file operations.  Here are the file operations that
+are available and some comments regarding their behavior:
+
+open()  enables user to open an _existing_ channel.  A channel can be
+        opened in blocking or non-blocking mode, and can be opened
+        for reading as well as for writing.  Readers will by default
+        be auto-consuming.
+
+mmap()  results in channel's memory buffer being mmapped into the
+        caller's memory space.
+
+read()  since we are dealing with circular buffers, the user is only
+        allowed to read forward.  Some apps may want to loop around
+        read() waiting for incoming data - if there is no data
+        available, read will put the reader on a wait queue until
+        data is available (blocking mode).  Non-blocking reads return
+        -EAGAIN if data is not available.
+
+
+write()         writing from user space operates exactly as relay_write() does
+        (described below).
+
+poll() POLLIN/POLLRDNORM/POLLOUT/POLLWRNORM/POLLERR supported.
+
+close()  decrements the channel's refcount.  When the refcount reaches
+        0 i.e. when no process or kernel client has the file open
+        (see relay_close() below), the channel buffer is freed.
+
+
+In order for a user application to make use of relayfs files, the
+relayfs filesystem must be mounted.  For example,
+
+       mount -t relayfs relayfs /mountpoint
+
+
+The relayfs kernel API
+======================
+
+relayfs channels are implemented as circular buffers subdivided into
+'sub-buffers'.  kernel clients write data into the channel using
+relay_write(), and are notified via a set of callbacks when
+significant events occur within the channel.  'Significant events'
+include:
+
+- a sub-buffer has been filled i.e. the current write won't fit into the
+  current sub-buffer, and a 'buffer-switch' is triggered, after which
+  the data is written into the next buffer (if the next buffer is
+  empty).  The client is notified of this condition via two callbacks,
+  one providing an opportunity to perform start-of-buffer tasks, the
+  other end-of-buffer tasks.
+
+- data is ready for the client to process.  The client can choose to
+  be notified either on a per-sub-buffer basis (bulk delivery) or
+  per-write basis (packet delivery).
+
+- data has been written to the channel from user space.  The client can
+  use this notification to accept and process 'commands' sent to the
+  channel via write(2).
+
+- the channel has been opened/closed/mapped/unmapped from user space.
+  The client can use this notification to trigger actions within the
+  kernel application, such as enabling/disabling logging to the
+  channel.  It can also return result codes from the callback,
+  indicating that the operation should fail e.g. in order to restrict
+  more than one user space open or mmap.
+
+- the channel needs resizing, or needs to update its
+  state based on the results of the resize.  Resizing the channel is
+  up to the kernel client to actually perform.  If the channel is
+  configured for resizing, the client is notified when the unread data
+  in the channel passes a preset threshold, giving it the opportunity
+  to allocate a new channel buffer and replace the old one.
+
+Reader objects
+--------------
+
+Channel readers use an opaque rchan_reader object to read from
+channels.  For VFS readers (those using read(2) to read from a
+channel), these objects are automatically created and used internally;
+only kernel clients that need to directly read from channels, or whose
+userspace applications use mmap to access channel data, need to know
+anything about rchan_readers - others may skip this section.
+
+A relay channel can have any number of readers, each represented by an
+rchan_reader instance, which is used to encapsulate reader settings
+and state.  rchan_reader objects should be treated as opaque by kernel
+clients.  To create a reader object for directly accessing a channel
+from kernel space, call the add_rchan_reader() kernel API function:
+
+rchan_reader *add_rchan_reader(rchan_id, auto_consume)
+
+This function returns an rchan_reader instance if successful, which
+should then be passed to relay_read() when the kernel client is
+interested in reading from the channel.
+
+The auto_consume parameter indicates whether a read done by this
+reader will automatically 'consume' that portion of the unread channel
+buffer when relay_read() is called (see below for more details).
+
+To close the reader, call
+
+remove_rchan_reader(reader)
+
+which will remove the reader from the list of current readers.
+
+
+To create a reader object representing a userspace mmap reader in the
+kernel application, call the add_map_reader() kernel API function:
+
+rchan_reader *add_map_reader(rchan_id)
+
+This function returns an rchan_reader instance if successful, whose
+main purpose is as an argument to be passed into
+relay_buffers_consumed() when the kernel client becomes aware that
+data has been read by a user application using mmap to read from the
+channel buffer.  There is no auto_consume option in this case, since
+only the kernel client/user application knows when data has been read.
+
+To close the map reader, call
+
+remove_map_reader(reader)
+
+which will remove the reader from the list of current readers.
+
+Consumed count
+--------------
+
+A relayfs channel is a circular buffer, which means that if there is
+no reader reading from it or a reader reading too slowly, at some
+point the channel writer will 'lap' the reader and data will be lost.
+In normal use, readers will always be able to keep up with writers and
+the buffer is thus never in danger of becoming full.  In many
+applications, it's sufficient to ensure that this is practically
+speaking always the case, by making the buffers large enough.  These
+types of applications can basically open the channel as
+RELAY_MODE_CONTINUOUS (the default anyway) and not worry about the
+meaning of 'consume' and skip the rest of this section.
+
+If it's important for the application that a kernel client never allow
+writers to overwrite unread data, the channel should be opened using
+RELAY_MODE_NO_OVERWRITE and must be kept apprised of the count of
+bytes actually read by the (typically) user-space channel readers.
+This count is referred to as the 'consumed count'.  read(2) channel
+readers automatically update the channel's 'consumed count' as they
+read.  If the usage mode is to have only read(2) readers, which is
+typically the case, the kernel client doesn't need to worry about any
+of the relayfs functions having to do with 'bytes consumed' and can
+skip the rest of this section.  (Note that it is possible to have
+multiple read(2) or auto-consuming readers, but like having multiple
+readers on a pipe, these readers will race with each other i.e. it's
+supported, but doesn't make much sense).
+
+If the kernel client cannot rely on an auto-consuming reader to keep
+the 'consumed count' up-to-date, then it must do so manually, by
+making the appropriate calls to relay_buffers_consumed() or
+relay_bytes_consumed().  In most cases, this should only be necessary
+for bulk mmap clients - almost all packet clients should be covered by
+having auto-consuming read(2) readers.  For mmapped bulk clients, for
+instance, there are no auto-consuming VFS readers, so the kernel
+client needs to make the call to relay_buffers_consumed() after
+sub-buffers are read.
+
+Kernel API
+----------
+
+Here's a summary of the API relayfs provides to in-kernel clients:
+
+int    relay_open(channel_path, bufsize, nbufs, channel_flags,
+                 channel_callbacks, start_reserve, end_reserve,
+                 rchan_start_reserve, resize_min, resize_max, mode,
+                 init_buf, init_buf_size)
+int    relay_write(channel_id, *data_ptr, count, time_delta_offset, **wrote)
+rchan_reader *add_rchan_reader(channel_id, auto_consume)
+int    remove_rchan_reader(rchan_reader *reader)
+rchan_reader *add_map_reader(channel_id)
+int    remove_map_reader(rchan_reader *reader)
+int    relay_read(reader, buf, count, wait, *actual_read_offset)
+void   relay_buffers_consumed(reader, buffers_consumed)
+void   relay_bytes_consumed(reader, bytes_consumed, read_offset)
+int    relay_bytes_avail(reader)
+int    rchan_full(reader)
+int    rchan_empty(reader)
+int    relay_info(channel_id, *channel_info)
+int    relay_close(channel_id)
+int    relay_realloc_buffer(channel_id, nbufs, async)
+int    relay_replace_buffer(channel_id)
+int    relay_reset(int rchan_id)
+
+----------
+int relay_open(channel_path, bufsize, nbufs, 
+        channel_flags, channel_callbacks, start_reserve,
+        end_reserve, rchan_start_reserve, resize_min, resize_max, mode)
+
+relay_open() is used to create a new entry in relayfs.  This new entry
+is created according to channel_path.  channel_path contains the
+absolute path to the channel file on relayfs.  If, for example, the
+caller sets channel_path to "/xlog/9", a "xlog/9" entry will appear
+within relayfs automatically and the "xlog" directory will be created
+in the filesystem's root.  relayfs does not implement any policy on
+its content, except to disallow the opening of two channels using the
+same file. There are, nevertheless a set of guidelines for using
+relayfs. Basically, each facility using relayfs should use a top-level
+directory identifying it. The entry created above, for example,
+presumably belongs to the "xlog" software.
+
+The remaining parameters for relay_open() are as follows:
+
+- channel_flags - an ORed combination of attribute values controlling
+  common channel characteristics:
+
+       - logging scheme - relayfs uses 2 mutually exclusive schemes
+         for logging data to a channel.  The 'lockless scheme'
+         reserves and writes data to a channel without the need of
+         any type of locking on the channel.  This is the preferred
+         scheme, but may not be available on a given architecture (it
+         relies on the presence of a cmpxchg instruction).  It's
+         specified by the RELAY_SCHEME_LOCKLESS flag.  The 'locking
+         scheme' either obtains a lock on the channel for writing or
+         disables interrupts, depending on whether the channel was
+         opened for SMP or global usage (see below).  It's specified
+         by the RELAY_SCHEME_LOCKING flag.  While a client may want
+         to explicitly specify a particular scheme to use, it's more
+         convenient to specify RELAY_SCHEME_ANY for this flag, which
+         will allow relayfs to choose the best available scheme i.e.
+         lockless if supported.
+
+       - overwrite mode (default is RELAY_MODE_CONTINUOUS) -
+        If RELAY_MODE_CONTINUOUS is specified, writes to the channel
+        will succeed regardless of whether there are up-to-date
+        consumers or not.  If RELAY_MODE_NO_OVERWRITE is specified,
+        the channel becomes 'full' when the total amount of buffer
+        space unconsumed by readers equals or exceeds the total
+        buffer size.  With the buffer in this state, writes to the
+        buffer will fail - clients need to check the return code from
+        relay_write() to determine if this is the case and act
+        accordingly - 0 or a negative value indicate the write failed.
+
+       - SMP usage - this applies only when the locking scheme is in
+        use.  If RELAY_USAGE_SMP is specified, it's assumed that the
+        channel will be used in a per-CPU fashion and consequently,
+        the only locking that will be done for writes is to disable
+        local irqs.  If RELAY_USAGE_GLOBAL is specified, it's assumed
+        that writes to the buffer can occur within any CPU context,
+        and spin_lock_irqsave will be used to lock the buffer.
+
+       - delivery mode - if RELAY_DELIVERY_BULK is specified, the
+        client will be notified via its deliver() callback whenever a
+        sub-buffer has been filled.  Alternatively,
+        RELAY_DELIVERY_PACKET will cause delivery to occur after the
+        completion of each write.  See the description of the channel
+        callbacks below for more details.
+
+       - timestamping - if RELAY_TIMESTAMP_TSC is specified and the
+        architecture supports it, efficient TSC 'timestamps' can be
+        associated with each write, otherwise more expensive
+        gettimeofday() timestamping is used.  At the beginning of
+        each sub-buffer, a gettimeofday() timestamp and the current
+        TSC, if supported, are read, and are passed on to the client
+        via the buffer_start() callback.  This allows correlation of
+        the current time with the current TSC for subsequent writes.
+        Each subsequent write is associated with a 'time delta',
+        which is either the current TSC, if the channel is using
+        TSCs, or the difference between the buffer_start gettimeofday
+        timestamp and the gettimeofday time read for the current
+        write.  Note that relayfs never writes either a timestamp or
+        time delta into the buffer unless explicitly asked to (see
+        the description of relay_write() for details).
+- bufsize - the size of the 'sub-buffers' making up the circular channel
+  buffer.  For the lockless scheme, this must be a power of 2.
+
+- nbufs - the number of 'sub-buffers' making up the circular
+  channel buffer.  This must be a power of 2.
+
+  The total size of the channel buffer is bufsize * nbufs rounded up 
+  to the next kernel page size.  If the lockless scheme is used, both
+  bufsize and nbufs must be a power of 2.  If the locking scheme is
+  used, the bufsize can be anything and nbufs must be a power of 2.  If
+  RELAY_SCHEME_ANY is used, the bufsize and nbufs should be a power of 2.
+
+  NOTE: if nbufs is 1, relayfs will bypass the normal size
+  checks and will allocate an rvmalloced buffer of size bufsize.
+  This buffer will be freed when relay_close() is called, if the channel
+  isn't still being referenced.
+
+- callbacks - a table of callback functions called when events occur
+  within the data relay that clients need to know about:
+          
+         - int buffer_start(channel_id, current_write_pos, buffer_id,
+           start_time, start_tsc, using_tsc) -
+
+           called at the beginning of a new sub-buffer, the
+           buffer_start() callback gives the client an opportunity to
+           write data into space reserved at the beginning of a
+           sub-buffer.  The client should only write into the buffer
+           if it specified a value for start_reserve and/or
+           channel_start_reserve (see below) when the channel was
+           opened.  In the latter case, the client can determine
+           whether to write its one-time rchan_start_reserve data by
+           examining the value of buffer_id, which will be 0 for the
+           first sub-buffer.  The address that the client can write
+           to is contained in current_write_pos (the client by
+           definition knows how much it can write i.e. the value it
+           passed to relay_open() for start_reserve/
+           channel_start_reserve).  start_time contains the
+           gettimeofday() value for the start of the buffer and start
+           TSC contains the TSC read at the same time.  The using_tsc
+           param indicates whether or not start_tsc is valid (it
+           wouldn't be if TSC timestamping isn't being used).
+
+           The client should return the number of bytes it wrote to
+           the channel, 0 if none.
+
+         - int buffer_end(channel_id, current_write_pos, end_of_buffer,
+           end_time, end_tsc, using_tsc)
+
+           called at the end of a sub-buffer, the buffer_end()
+           callback gives the client an opportunity to perform
+           end-of-buffer processing.  Note that the current_write_pos
+           is the position where the next write would occur, but
+           since the current write wouldn't fit (which is the trigger
+           for the buffer_end event), the buffer is considered full
+           even though there may be unused space at the end.  The
+           end_of_buffer param pointer value can be used to determine
+           exactly the size of the unused space.  The client should
+           only write into the buffer if it specified a value for
+           end_reserve when the channel was opened.  If the client
+           doesn't write anything i.e. returns 0, the unused space at
+           the end of the sub-buffer is available via relay_info() -
+           this data may be needed by the client later if it needs to
+           process raw sub-buffers (an alternative would be to save
+           the unused bytes count value in end_reserve space at the
+           end of each sub-buffer during buffer_end processing and
+           read it when needed at a later time.  The other
+           alternative would be to use read(2), which makes the
+           unused count invisible to the caller).  end_time contains
+           the gettimeofday() value for the end of the buffer and end
+           TSC contains the TSC read at the same time.  The using_tsc
+           param indicates whether or not end_tsc is valid (it
+           wouldn't be if TSC timestamping isn't being used).
+
+           The client should return the number of bytes it wrote to
+           the channel, 0 if none.
+
+         - void deliver(channel_id, from, len)
+
+           called when data is ready for the client.  This callback
+           is used to notify a client when a sub-buffer is complete
+           (in the case of bulk delivery) or a single write is
+           complete (packet delivery).  A bulk delivery client might
+           wish to then signal a daemon that a sub-buffer is ready.
+           A packet delivery client might wish to process the packet
+           or send it elsewhere.  The from param is a pointer to the
+           delivered data and len specifies how many bytes are ready.
+
+         - void user_deliver(channel_id, from, len)
+
+           called when data has been written to the channel from user
+           space.  This callback is used to notify a client when a
+           successful write from userspace has occurred, independent
+           of whether bulk or packet delivery is in use.  This can be
+           used to allow userspace programs to communicate with the
+           kernel client through the channel via out-of-band write(2)
+           'commands' instead of via ioctls, for instance.  The from
+           param is a pointer to the delivered data and len specifies
+           how many bytes are ready.  Note that this callback occurs
+           after the bytes have been successfully written into the
+           channel, which means that channel readers must be able to
+           deal with the 'command' data which will appear in the
+           channel data stream just as any other userspace or
+           non-userspace write would.
+
+         - int needs_resize(channel_id, resize_type,
+                            suggested_buf_size, suggested_n_bufs)
+
+           called when a channel's buffers are in danger of becoming
+           full i.e. the number of unread bytes in the channel passes
+           a preset threshold, or when the current capacity of a
+           channel's buffer is no longer needed.  Also called to
+           notify the client when a channel's buffer has been
+           replaced.  If resize_type is RELAY_RESIZE_EXPAND or
+           RELAY_RESIZE_SHRINK, the kernel client should arrange to
+           call relay_realloc_buffer() with the suggested buffer size
+           and buffer count, which will allocate (but will not
+           replace the old one) a new buffer of the recommended size
+           for the channel.  When the allocation has completed,
+           needs_resize() is again called, this time with a
+           resize_type of RELAY_RESIZE_REPLACE.  The kernel client
+           should then arrange to call relay_replace_buffer() to
+           actually replace the old channel buffer with the newly
+           allocated buffer.  Finally, once the buffer replacement
+           has completed, needs_resize() is again called, this time
+           with a resize_type of RELAY_RESIZE_REPLACED, to inform the
+           client that the replacement is complete and additionally
+           confirming the current sub-buffer size and number of
+           sub-buffers.  Note that a resize can be canceled if
+           relay_realloc_buffer() is called with the async param
+           non-zero and the resize conditions no longer hold.  In
+           this case, the RELAY_RESIZE_REPLACED suggested number of
+           sub-buffers will be the same as the number of sub-buffers
+           that existed before the RELAY_RESIZE_SHRINK or EXPAND i.e.
+           values indicating that the resize didn't actually occur.
+
+         - int fileop_notify(channel_id, struct file *filp, enum relay_fileop)
+
+           called when a userspace file operation has occurred or
+           will occur on a relayfs channel file.  These notifications
+           can be used by the kernel client to trigger actions within
+           the kernel client when the corresponding event occurs,
+           such as enabling logging only when a userspace application
+           opens or mmaps a relayfs file and disabling it again when
+           the file is closed or unmapped.  The kernel client can
+           also return its own return value, which can affect the
+           outcome of file operation - returning 0 indicates that the
+           operation should succeed, and returning a negative value
+           indicates that the operation should be failed, and that
+           the returned value should be returned to the ultimate
+           caller e.g. returning -EPERM from the open fileop will
+           cause the open to fail with -EPERM.  Among other things,
+           the return value can be used to restrict a relayfs file
+           from being opened or mmap'ed more than once.  The currently
+           implemented fileops are:
+
+           RELAY_FILE_OPEN - a relayfs file is being opened.  Return
+                             0 to allow it to succeed, negative to
+                             have it fail.  A negative return value will
+                             be passed on unmodified to the open fileop.
+           RELAY_FILE_CLOSE- a relayfs file is being closed.  The return
+                             value is ignored.
+           RELAY_FILE_MAP - a relayfs file is being mmap'ed.  Return 0
+                            to allow it to succeed, negative to have
+                            it fail.  A negative return value will be
+                            passed on unmodified to the mmap fileop.
+           RELAY_FILE_UNMAP- a relayfs file is being unmapped.  The return
+                             value is ignored.
+
+         - void ioctl(rchan_id, cmd, arg)
+
+           called when an ioctl call is made using a relayfs file
+           descriptor.  The cmd and arg are passed along to this
+           callback unmodified for it to do as it wishes with.  The
+           return value from this callback is used as the return value
+           of the ioctl call.
+
+  If the callbacks param passed to relay_open() is NULL, a set of
+  default do-nothing callbacks will be defined for the channel.
+  Likewise, any NULL rchan_callback function contained in a non-NULL
+  callbacks struct will be filled in with a default callback function
+  that does nothing.
+
+- start_reserve - the number of bytes to be reserved at the start of
+  each sub-buffer.  The client can do what it wants with this number
+  of bytes when the buffer_start() callback is invoked.  Typically
+  clients would use this to write per-sub-buffer header data.
+
+- end_reserve - the number of bytes to be reserved at the end of each
+  sub-buffer.  The client can do what it wants with this number of
+  bytes when the buffer_end() callback is invoked.  Typically clients
+  would use this to write per-sub-buffer footer data.
+
+- channel_start_reserve - the number of bytes to be reserved, in
+  addition to start_reserve, at the beginning of the first sub-buffer
+  in the channel.  The client can do what it wants with this number of
+  bytes when the buffer_start() callback is invoked.  Typically
+  clients would use this to write per-channel header data.
+
+- resize_min - if set, this signifies that the channel is
+  auto-resizeable.  The value specifies the size that the channel will
+  try to maintain as a normal working size, and that it won't go
+  below.  The client makes use of the resizing callbacks and
+  relay_realloc_buffer() and relay_replace_buffer() to actually effect
+  the resize.
+
+- resize_max - if set, this signifies that the channel is
+  auto-resizeable.  The value specifies the maximum size the channel
+  can have as a result of resizing.
+
+- mode - if non-zero, specifies the file permissions that will be given
+  to the channel file.  If 0, the default rw user perms will be used.
+
+- init_buf - if non-NULL, rather than allocating the channel buffer,
+  this buffer will be used as the initial channel buffer.  The kernel
+  API function relay_discard_init_buf() can later be used to have
+  relayfs allocate a normal mmappable channel buffer and switch over
+  to using it after copying the init_buf contents into it.  Currently,
+  the size of init_buf must be exactly buf_size * n_bufs.  The caller
+  is responsible for managing the init_buf memory.  This feature is
+  typically used for init-time channel use and should normally be
+  specified as NULL.
+
+- init_buf_size - the total size of init_buf, if init_buf is specified
+  as non-NULL.  Currently, the size of init_buf must be exactly
+  buf_size * n_bufs.
+
+Upon successful completion, relay_open() returns a channel id
+to be used for all other operations with the relay. All buffers
+managed by the relay are allocated using rvmalloc/rvfree to allow
+for easy mmapping to user-space.
+
+----------
+int relay_write(channel_id, *data_ptr, count, time_delta_offset, **wrote_pos)
+
+relay_write() reserves space in the channel and writes count bytes of
+data pointed to by data_ptr to it.  Automatically performs any
+necessary locking, depending on the scheme and SMP usage in effect (no
+locking is done for the lockless scheme regardless of usage).  It
+returns the number of bytes written, or 0/negative on failure.  If
+time_delta_offset is >= 0, the internal time delta calculated when
+the slot was reserved will be written at that
+offset.  This is the TSC or gettimeofday() delta between the current
+write and the beginning of the buffer, whichever method is being used
+by the channel.  Trying to write a count larger than the bufsize
+specified to relay_open() (taking into account the reserved
+start-of-buffer and end-of-buffer space as well) will fail.  If
+wrote_pos is non-NULL, it will receive the location the data was
+written to, which may be needed for some applications but is not
+normally interesting.  Most applications should pass in NULL for this
+param.
+
+----------
+struct rchan_reader *add_rchan_reader(int rchan_id, int auto_consume)
+
+add_rchan_reader creates and initializes a reader object for a
+channel.  An opaque rchan_reader object is returned on success, and is
+passed to relay_read() when reading the channel.  If the boolean
+auto_consume parameter is 1, the reader is defined to be
+auto-consuming.  auto-consuming reader objects are automatically
+created and used for VFS read(2) readers.
+
+----------
+void remove_rchan_reader(struct rchan_reader *reader)
+
+remove_rchan_reader finds and removes the given reader from the
+channel.  This function is used only by non-VFS read(2) readers.  VFS
+read(2) readers are automatically removed when the corresponding file
+object is closed.
+
+----------
+reader add_map_reader(int rchan_id)
+
+Creates and initializes an rchan_reader object for channel map
+readers, and is needed for updating relay_bytes/buffers_consumed()
+when kernel clients become aware of the need to do so by their mmap
+user clients.
+
+----------
+int remove_map_reader(reader)
+
+Finds and removes the given map reader from the channel.  This function
+is useful only for map readers.
+
+----------
+int relay_read(reader, buf, count, wait, *actual_read_offset)
+
+Reads count bytes from the channel, or as much as is available within
+the sub-buffer currently being read.  The read offset that will be
+read from is the position contained within the reader object.  If the
+wait flag is set, buf is non-NULL, and there is nothing available, it
+will wait until there is.  If the wait flag is 0 and there is nothing
+available, -EAGAIN is returned.  If buf is NULL, the value returned is
+the number of bytes that would have been read.  actual_read_offset is
+the value that should be passed as the read offset to
+relay_bytes_consumed, needed only if the reader is not auto-consuming
+and the channel is MODE_NO_OVERWRITE, but in any case, it must not be
+NULL.
+
+---------- 
+
+int relay_bytes_avail(reader)
+
+Returns the number of bytes available relative to the reader's current
+read position within the corresponding sub-buffer, 0 if there is
+nothing available.  Note that this doesn't return the total bytes
+available in the channel buffer - it is enough, however, to know
+whether anything is available, and how many bytes might be returned
+from the next read.
+
+----------
+void relay_buffers_consumed(reader, buffers_consumed)
+
+Adds to the channel's consumed buffer count.  buffers_consumed should
+be the number of buffers newly consumed, not the total number
+consumed.  NOTE: kernel clients don't need to call this function if
+the reader is auto-consuming or the channel is MODE_CONTINUOUS.
+
+In order for the relay to detect the 'buffers full' condition for a
+channel, it must be kept up-to-date with respect to the number of
+buffers consumed by the client.  If the addition of the value of the
+bufs_consumed param to the current bufs_consumed count for the channel
+would exceed the bufs_produced count for the channel, the channel's
+bufs_consumed count will be set to the bufs_produced count for the
+channel.  This allows clients to 'catch up' if necessary.
+
+----------
+void relay_bytes_consumed(reader, bytes_consumed, read_offset)
+
+Adds to the channel's consumed count.  bytes_consumed should be the
+number of bytes actually read e.g. return value of relay_read() and
+the read_offset should be the actual offset the bytes were read from
+e.g. the actual_read_offset set by relay_read().  NOTE: kernel clients
+don't need to call this function if the reader is auto-consuming or
+the channel is MODE_CONTINUOUS.
+
+In order for the relay to detect the 'buffers full' condition for a
+channel, it must be kept up-to-date with respect to the number of
+bytes consumed by the client.  For packet clients, it makes more sense
+to update after each read rather than after each complete sub-buffer
+read.  The bytes_consumed count updates bufs_consumed when a buffer
+has been consumed so this count remains consistent.
+
+----------
+int relay_info(channel_id, *channel_info)
+
+relay_info() fills in an rchan_info struct with channel status and
+attribute information such as usage modes, sub-buffer size and count,
+the allocated size of the entire buffer, buffers produced and
+consumed, current buffer id, count of writes lost due to buffers full
+condition.
+
+The virtual address of the channel buffer is also available here, for
+those clients that need it.
+
+Clients may need to know how many 'unused' bytes there are at the end
+of a given sub-buffer.  This would only be the case if the client 1)
+didn't either write this count to the end of the sub-buffer or
+otherwise note it (it's available as the difference between the buffer
+end and current write pos params in the buffer_end callback) (if the
+client returned 0 from the buffer_end callback, it's assumed that this
+is indeed the case) 2) isn't using the read() system call to read the
+buffer.  In other words, if the client isn't annotating the stream and
+is reading the buffer by mmaping it, this information would be needed
+in order for the client to 'skip over' the unused bytes at the ends of
+sub-buffers.
+
+Additionally, for the lockless scheme, clients may need to know
+whether a particular sub-buffer is actually complete.  An array of
+boolean values, one per sub-buffer, contains non-zero if the buffer is
+complete, zero otherwise.
+
+----------
+int relay_close(channel_id)
+
+relay_close() is used to close the channel.  It finalizes the last
+sub-buffer (the one currently being written to) and marks the channel
+as finalized.  The channel buffer and channel data structure are then
+freed automatically when the last reference to the channel is given
+up.
+
+----------
+int relay_realloc_buffer(channel_id, nbufs, async)
+
+Allocates a new channel buffer using the specified sub-buffer count
+(note that resizing can't change sub-buffer sizes).  If async is
+non-zero, the allocation is done in the background using a work queue.
+When the allocation has completed, the needs_resize() callback is
+called with a resize_type of RELAY_RESIZE_REPLACE.  This function
+doesn't replace the old buffer with the new - see
+relay_replace_buffer().
+
+This function is called by kernel clients in response to a
+needs_resize() callback call with a resize type of RELAY_RESIZE_EXPAND
+or RELAY_RESIZE_SHRINK.  That callback also includes a suggested
+new_bufsize and new_nbufs which should be used when calling this
+function.
+
+Returns 0 on success, or errcode if the channel is busy or if
+the allocation couldn't happen for some reason.
+
+NOTE: if async is not set, this function should not be called with a
+lock held, as it may sleep.
+
+----------
+int relay_replace_buffer(channel_id)
+
+Replaces the current channel buffer with the new buffer allocated by
+relay_realloc_buffer and contained in the channel struct.  When the
+replacement is complete, the needs_resize() callback is called with
+RELAY_RESIZE_REPLACED.  This function is called by kernel clients in
+response to a needs_resize() callback having a resize type of
+RELAY_RESIZE_REPLACE.
+
+Returns 0 on success, or errcode if the channel is busy or if the
+replacement or previous allocation didn't happen for some reason.
+
+NOTE: This function will not sleep, so can be called in any context and
+with locks held.  The client should, however, ensure that the channel
+isn't actively being read from or written to.
+
+----------
+int relay_reset(rchan_id)
+
+relay_reset() has the effect of erasing all data from the buffer and
+restarting the channel in its initial state.  The buffer itself is not
+freed, so any mappings are still in effect.  NOTE: Care should be
+taken that the channel isn't actually being used by anything when
+this call is made.
+
+----------
+int rchan_full(reader)
+
+returns 1 if the channel is full with respect to the reader, 0 if not.
+
+----------
+int rchan_empty(reader)
+
+returns 1 if the channel is empty with respect to the reader, 0 if not.
+
+----------
+int relay_discard_init_buf(rchan_id)
+
+allocates an mmappable channel buffer, copies the contents of init_buf
+into it, and sets the current channel buffer to the newly allocated
+buffer.  This function is used only in conjunction with the init_buf
+and init_buf_size params to relay_open(), and is typically used when
+the ability to write into the channel at init-time is needed.  The
+basic usage is to specify an init_buf and init_buf_size to relay_open,
+then call this function when it's safe to switch over to a normally
+allocated channel buffer.  'Safe' means that the caller is in a
+context that can sleep and that nothing is actively writing to the
+channel.  Returns 0 if successful, negative otherwise.
+
+
+Writing directly into the channel
+=================================
+
+Using the relay_write() API function as described above is the
+preferred means of writing into a channel.  In some cases, however,
+in-kernel clients might want to write directly into a relay channel
+rather than have relay_write() copy it into the buffer on the client's
+behalf.  Clients wishing to do this should follow the model used to
+implement relay_write itself.  The general sequence is:
+
+- get a pointer to the channel via rchan_get().  This increments the
+  channel's reference count.
+- call relay_lock_channel().  This will perform the proper locking for
+  the channel given the scheme in use and the SMP usage.
+- reserve a slot in the channel via relay_reserve()
+- write directly to the reserved address
+- call relay_commit() to commit the write
+- call relay_unlock_channel()
+- call rchan_put() to release the channel reference
+
+In particular, clients should make sure they call rchan_get() and
+rchan_put() and not hold on to references to the channel pointer.
+Also, forgetting to use relay_lock_channel()/relay_unlock_channel()
+has no effect if the lockless scheme is being used, but could result
+in corrupted buffer contents if the locking scheme is used.
+
+
+Limitations
+===========
+
+Writes made via the write() system call are currently limited to 2
+pages worth of data.  There is no such limit on the in-kernel API
+function relay_write().
+
+User applications can currently only mmap the complete buffer (it
+doesn't really make sense to mmap only part of it, given its purpose).
+
+
+Latest version
+==============
+
+The latest version can be found at:
+
+http://www.opersys.com/relayfs
+
+Example relayfs clients, such as dynamic printk and the Linux Trace
+Toolkit, can also be found there.
+
+
+Credits
+=======
+
+The ideas and specs for relayfs came about as a result of discussions
+on tracing involving the following:
+
+Michel Dagenais                <michel.dagenais@polymtl.ca>
+Richard Moore          <richardj_moore@uk.ibm.com>
+Bob Wisniewski         <bob@watson.ibm.com>
+Karim Yaghmour         <karim@opersys.com>
+Tom Zanussi            <zanussi@us.ibm.com>
+
+Also thanks to Hubertus Franke for a lot of useful suggestions and bug
+reports, and for contributing the klog code.
diff --git a/fs/rcfs/Makefile b/fs/rcfs/Makefile
new file mode 100644 (file)
index 0000000..2957522
--- /dev/null
@@ -0,0 +1,10 @@
+#
+# Makefile for rcfs routines.
+#
+
+obj-$(CONFIG_RCFS_FS) += rcfs.o
+
+# Core objects; the class-type magic handlers below are linked in only
+# when the corresponding CKRM class type is configured.
+rcfs-objs := super.o inode.o dir.o rootdir.o magic.o
+
+rcfs-objs-$(CONFIG_CKRM_TYPE_TASKCLASS) += tc_magic.o
+rcfs-objs-$(CONFIG_CKRM_TYPE_SOCKETCLASS) += socket_fs.o
+rcfs-objs += $(rcfs-objs-y)
diff --git a/fs/rcfs/dir.c b/fs/rcfs/dir.c
new file mode 100644 (file)
index 0000000..048fe09
--- /dev/null
@@ -0,0 +1,336 @@
+/* 
+ * fs/rcfs/dir.c 
+ *
+ * Copyright (C) Shailabh Nagar,  IBM Corp. 2004
+ *               Vivek Kashyap,   IBM Corp. 2004
+ *           
+ * 
+ * Directory operations for rcfs
+ *
+ * Latest version, more details at http://ckrm.sf.net
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+/* Changes
+ *
+ * 08 Mar 2004
+ *        Created.
+ */
+
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <asm/namei.h>
+#include <linux/namespace.h>
+#include <linux/dcache.h>
+#include <linux/seq_file.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include <linux/parser.h>
+
+#include <asm/uaccess.h>
+
+#include <linux/rcfs.h>
+
+
+
+#define rcfs_positive(dentry)  ((dentry)->d_inode && !d_unhashed((dentry)))
+
+// Return 1 if @dentry has no "real" children, 0 otherwise.  Magic
+// (auto-created) files and negative/unhashed dentries are ignored, so
+// a class directory containing only its magic files counts as empty
+// and may be removed.
+int rcfs_empty(struct dentry *dentry)
+{
+        struct dentry *child;
+        int ret = 0;
+                                                                                               
+        // walking d_subdirs requires dcache_lock
+        spin_lock(&dcache_lock);
+        list_for_each_entry(child, &dentry->d_subdirs, d_child)
+                if (!rcfs_is_magic(child) && rcfs_positive(child))
+                        goto out;
+        ret = 1;
+out:
+        spin_unlock(&dcache_lock);
+        return ret;
+}
+
+                                                                                               
+
+
+/* Directory inode operations */
+
+
+// create() entry point for class directories.  Forwards to
+// rcfs_mknod(), which only permits directory creation, so user
+// attempts to create regular files fail with -EINVAL there.
+int 
+rcfs_create(struct inode *dir, struct dentry *dentry, int mode, 
+           struct nameidata *nd)
+{
+       return rcfs_mknod(dir, dentry, mode | S_IFREG, 0);
+}
+EXPORT_SYMBOL(rcfs_create);
+
+
+/* Symlinks permitted ?? */
+// Create a symlink inode and store @symname through the page cache
+// (page_symlink).  On success the dentry is instantiated and pinned
+// with dget(); on page_symlink failure the fresh inode is released.
+// Returns 0 on success, -ENOSPC if the inode can't be allocated, or
+// the error returned by page_symlink().
+int  
+rcfs_symlink(struct inode * dir, struct dentry *dentry, const char * symname)
+{
+       struct inode *inode;
+       int error = -ENOSPC;
+
+       inode = rcfs_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0);
+       if (inode) {
+               int l = strlen(symname)+1;
+               error = page_symlink(inode, symname, l);
+               if (!error) {
+                       // inherit group from a setgid parent directory
+                       if (dir->i_mode & S_ISGID)
+                               inode->i_gid = dir->i_gid;
+                       d_instantiate(dentry, inode);
+                       dget(dentry);
+               } else
+                       iput(inode);
+       }
+       return error;
+}
+EXPORT_SYMBOL(rcfs_symlink);
+
+// Create the CKRM core class backing a newly made rcfs directory.
+// The class name is "<parent name>/<dentry name>" and the class is
+// instantiated through the parent classtype's alloc callback.
+// Returns 0 on success, -EINVAL if the parent has no valid core,
+// -ENOMEM if the name or the core class can't be allocated.
+int
+rcfs_create_coredir(struct inode *dir, struct dentry *dentry)
+{
+	struct rcfs_inode_info *ripar, *ridir;
+	int sz;
+
+	ripar = RCFS_I(dir);
+	ridir = RCFS_I(dentry->d_inode);
+
+	// Inform RC's - do Core operations
+	if (!ckrm_is_core_valid(ripar->core)) {
+		printk(KERN_ERR "rcfs_create_coredir: invalid parent core %p\n",
+		       ripar->core);
+		return -EINVAL;
+	}
+
+	// "<parent>/<name>" plus '/' and terminating NUL
+	sz = strlen(ripar->name) + strlen(dentry->d_name.name) + 2;
+	ridir->name = kmalloc(sz, GFP_KERNEL);
+	if (!ridir->name)
+		return -ENOMEM;
+	snprintf(ridir->name, sz, "%s/%s", ripar->name,
+		 dentry->d_name.name);
+
+	ridir->core = (*(ripar->core->classtype->alloc))
+		(ripar->core, ridir->name);
+	if (!ridir->core) {
+		// don't leave a dangling name behind on alloc failure
+		kfree(ridir->name);
+		ridir->name = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(rcfs_create_coredir);
+
+
+// Create a new class directory: allocate its inode via _rcfs_mknod,
+// create the backing CKRM core class, and populate the directory with
+// the classtype's default magic files (skipping mfdesc[0], the
+// directory entry itself).
+// Returns 0 on success or a negative errno; on core-class failure the
+// freshly made directory is rolled back with simple_rmdir().
+int
+rcfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	int retval;
+	ckrm_classtype_t *clstype;
+
+#if 0
+	struct dentry *pd = list_entry(dir->i_dentry.next, struct dentry,
+						       d_alias);
+	if ((!strcmp(pd->d_name.name, "/") &&
+	     !strcmp(dentry->d_name.name, "ce"))) {
+		// Call CE's mkdir if it has registered, else fail.
+		if (rcfs_eng_callbacks.mkdir) {
+			return (*rcfs_eng_callbacks.mkdir)(dir, dentry, mode);
+		} else {
+			return -EINVAL;
+		}
+	}
+#endif
+
+	// BUGFIX: propagate the real error; the original returned the
+	// still-zero retval here, reporting success on mknod failure.
+	retval = _rcfs_mknod(dir, dentry, mode | S_IFDIR, 0);
+	if (retval) {
+		printk(KERN_ERR "rcfs_mkdir: error in _rcfs_mknod\n");
+		return retval;
+	}
+
+	dir->i_nlink++;
+
+	// Inherit parent's ops since _rcfs_mknod assigns noperm ops
+	dentry->d_inode->i_op = dir->i_op;
+	dentry->d_inode->i_fop = dir->i_fop;
+
+	retval = rcfs_create_coredir(dir, dentry);
+	if (retval) {
+		// simple_rmdir undoes the link count bump above
+		simple_rmdir(dir, dentry);
+		return retval;
+	}
+
+	// create the default set of magic files
+	clstype = (RCFS_I(dentry->d_inode))->core->classtype;
+	rcfs_create_magic(dentry, &(((struct rcfs_magf *)clstype->mfdesc)[1]),
+			  clstype->mfcount - 1);
+
+	return 0;
+}
+EXPORT_SYMBOL(rcfs_mkdir);
+
+
+// Remove a class directory.  Order matters: first verify the
+// directory is empty of real children, then free the backing CKRM
+// core class, and only then clear the magic files and remove the
+// directory itself.  Returns 0 on success (or on the core==NULL
+// race), -EBUSY if the directory is non-empty or the core class
+// couldn't be freed.
+int 
+rcfs_rmdir(struct inode * dir, struct dentry * dentry)
+{
+       struct rcfs_inode_info *ri = RCFS_I(dentry->d_inode);
+
+#if 0
+       struct dentry *pd = list_entry(dir->i_dentry.next, 
+                                      struct dentry, d_alias);
+       if ((!strcmp(pd->d_name.name, "/") &&
+            !strcmp(dentry->d_name.name, "ce"))) {
+               // Call CE's mkdir if it has registered, else fail.
+               if (rcfs_eng_callbacks.rmdir) {
+                       return (*rcfs_eng_callbacks.rmdir)(dir, dentry);
+               } else {
+                       return simple_rmdir(dir, dentry);
+               }
+       }
+       else if ((!strcmp(pd->d_name.name, "/") &&
+                 !strcmp(dentry->d_name.name, "network"))) {
+               return -EPERM;
+       }
+#endif
+       
+       if (!rcfs_empty(dentry)) {
+               printk(KERN_ERR "rcfs_rmdir: directory not empty\n");
+               goto out;
+       }
+
+       // Core class removal 
+
+       if (ri->core == NULL) {
+               printk(KERN_ERR "rcfs_rmdir: core==NULL\n");
+               // likely a race condition
+               return 0;
+       }
+
+       if ((*(ri->core->classtype->free))(ri->core)) {
+               printk(KERN_ERR "rcfs_rmdir: ckrm_free_core_class failed\n");
+               goto out;
+       }
+       ri->core = NULL ; // just to be safe 
+
+       // Clear magic files only after core successfully removed 
+       rcfs_clear_magic(dentry);
+
+       return simple_rmdir(dir, dentry);
+
+out:
+       return -EBUSY;
+}
+EXPORT_SYMBOL(rcfs_rmdir);
+
+
+// unlink() is never permitted in rcfs; classes are removed as a whole
+// via rmdir, and magic files can't be deleted individually.
+int
+rcfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+       // -ENOENT and not -ENOPERM to allow rm -rf to work despite 
+       // magic files being present
+       return -ENOENT;
+}
+EXPORT_SYMBOL(rcfs_unlink);
+       
+// rename() for class directories: only directories themselves may be
+// renamed; anything else (magic files) is rejected.
+int
+rcfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+		struct inode *new_dir, struct dentry *new_dentry)
+{
+	if (!S_ISDIR(old_dentry->d_inode->i_mode))
+		return -EINVAL;
+
+	return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
+}
+EXPORT_SYMBOL(rcfs_rename);
+
+
+// Inode operations for class directories.  lookup/link use the
+// generic libfs helpers; the remaining ops enforce rcfs semantics
+// (directory-only creation, removal and rename).
+struct inode_operations rcfs_dir_inode_operations = {
+       .create         = rcfs_create,
+       .lookup         = simple_lookup,
+       .link           = simple_link,
+       .unlink         = rcfs_unlink,
+       .symlink        = rcfs_symlink,
+       .mkdir          = rcfs_mkdir,
+       .rmdir          = rcfs_rmdir,
+       .mknod          = rcfs_mknod,
+       .rename         = rcfs_rename,
+};
+
+
+
+
+
+/*
+ * Inode operations for the rcfs root directory.  The root's layout is
+ * fixed at mount time, so every user-initiated modification below is
+ * rejected with -EPERM.
+ */
+int 
+rcfs_root_create(struct inode *dir, struct dentry *dentry, int mode, 
+                struct nameidata *nd)
+{
+       return -EPERM;
+}
+
+
+int  
+rcfs_root_symlink(struct inode * dir, struct dentry *dentry, 
+                 const char * symname)
+{
+       return -EPERM;
+}
+
+int 
+rcfs_root_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+       return -EPERM;
+}
+
+int 
+rcfs_root_rmdir(struct inode * dir, struct dentry * dentry)
+{
+       return -EPERM;
+}
+
+int
+rcfs_root_unlink(struct inode *dir, struct dentry *dentry)
+{
+       return -EPERM;
+}
+
+int
+rcfs_root_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
+{
+       return -EPERM;
+}
+       
+int
+rcfs_root_rename(struct inode *old_dir, struct dentry *old_dentry,
+               struct inode *new_dir, struct dentry *new_dentry)
+{
+       return -EPERM;
+}
+
+// Inode operations for the immutable root directory: lookup/link use
+// libfs helpers, all modifying ops return -EPERM (see stubs above...
+// er, see the rcfs_root_* functions defined in this file).
+struct inode_operations rcfs_rootdir_inode_operations = {
+       .create         = rcfs_root_create,
+       .lookup         = simple_lookup,
+       .link           = simple_link,
+       .unlink         = rcfs_root_unlink,
+       .symlink        = rcfs_root_symlink,
+       .mkdir          = rcfs_root_mkdir,
+       .rmdir          = rcfs_root_rmdir,
+       .mknod          = rcfs_root_mknod,
+       .rename         = rcfs_root_rename,
+};
diff --git a/fs/rcfs/inode.c b/fs/rcfs/inode.c
new file mode 100644 (file)
index 0000000..d9be673
--- /dev/null
@@ -0,0 +1,204 @@
+/* 
+ * fs/rcfs/inode.c 
+ *
+ * Copyright (C) Shailabh Nagar,  IBM Corp. 2004
+ *               Vivek Kashyap,   IBM Corp. 2004
+ *           
+ * 
+ * Resource class filesystem (rcfs) forming the 
+ * user interface to Class-based Kernel Resource Management (CKRM).
+ *
+ * Latest version, more details at http://ckrm.sf.net
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+/* Changes
+ *
+ * 05 Mar 2004
+ *        Created.
+ * 06 Mar 2004
+ *        Parsing for shares added
+ */
+
+
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <asm/namei.h>
+#include <linux/namespace.h>
+#include <linux/dcache.h>
+#include <linux/seq_file.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include <linux/parser.h>
+#include <asm/uaccess.h>
+
+#include <linux/rcfs.h>
+
+
+
+// The address of this variable is used as a flag to mark a file as a
+// magic file; its value is unimportant.
+int RCFS_IS_MAGIC;
+
+
+// Allocate and initialize an rcfs inode of the given @mode.
+// Directories receive the restricted rootdir inode ops by default;
+// rcfs_mkdir() later overrides i_op/i_fop for ordinary class
+// directories by inheriting them from the parent.  Returns the new
+// inode, or NULL if new_inode() fails.
+struct inode *rcfs_get_inode(struct super_block *sb, int mode, dev_t dev)
+{
+       struct inode * inode = new_inode(sb);
+
+       if (inode) {
+               inode->i_mode = mode;
+               inode->i_uid = current->fsuid;
+               inode->i_gid = current->fsgid;
+               inode->i_blksize = PAGE_CACHE_SIZE;
+               inode->i_blocks = 0;
+               inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+               switch (mode & S_IFMT) {
+               default:
+                       // device/fifo/socket nodes
+                       init_special_inode(inode, mode, dev);
+                       break;
+               case S_IFREG:
+                       // Treat as default assignment
+                       inode->i_op = &rcfs_file_inode_operations;
+                       // inode->i_fop = &rcfs_file_operations;
+                       break;
+               case S_IFDIR:
+                       // inode->i_op = &rcfs_dir_inode_operations;
+                       inode->i_op = &rcfs_rootdir_inode_operations;
+                       inode->i_fop = &simple_dir_operations;
+
+                       // directory inodes start off with i_nlink == 2 
+                       //  (for "." entry)
+                       inode->i_nlink++;
+                       break;
+               case S_IFLNK:
+                       inode->i_op = &page_symlink_inode_operations;
+                       break;
+               }
+       }
+       return inode;
+}
+
+
+
+// Low-level node creation: allocate the inode, apply setgid
+// inheritance from @dir, instantiate and pin @dentry.  Does NOT bump
+// the parent's link count for directories - callers (rcfs_mkdir,
+// rcfs_create_internal) handle that themselves.
+// Returns 0 on success, -EEXIST if @dentry is already backed, or
+// -EPERM if the inode couldn't be allocated (the initial value of
+// 'error' doubles as the allocation-failure code here).
+int
+_rcfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
+{
+       struct inode *inode;
+       int error = -EPERM;
+
+       if (dentry->d_inode)
+               return -EEXIST;
+
+       inode = rcfs_get_inode(dir->i_sb, mode, dev);
+       if (inode) {
+               if (dir->i_mode & S_ISGID) {
+                       inode->i_gid = dir->i_gid;
+                       if (S_ISDIR(mode))
+                               inode->i_mode |= S_ISGID;
+               }
+               d_instantiate(dentry, inode);
+               dget(dentry);   
+               error = 0;
+       }
+
+       return error;
+}
+EXPORT_SYMBOL(_rcfs_mknod);
+
+
+// mknod() entry point exposed to users: only directory creation is
+// permitted, and it is delegated to the directory's own mkdir op so
+// class-specific setup (core class, magic files) happens there.
+int
+rcfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
+{
+       // User can only create directories, not files
+       if ((mode & S_IFMT) != S_IFDIR)
+               return -EINVAL;
+
+       return  dir->i_op->mkdir(dir, dentry, mode);
+}
+EXPORT_SYMBOL(rcfs_mknod);
+
+
+// Create a node named magf->name under @parent, holding the parent's
+// i_sem across the creation.  If @magic is set and magf->mode has
+// S_IFDIR, the parent's mkdir op is used (which bumps the parent's
+// link count itself); otherwise _rcfs_mknod is used and the link
+// count is bumped here.
+// NOTE(review): on creation failure the dentry is dput() and then
+// still returned - callers apparently must check mfdentry->d_inode
+// (or IS_ERR for the lookup_hash failure case) rather than rely on a
+// NULL/ERR_PTR return; confirm against the call sites.
+struct dentry * 
+rcfs_create_internal(struct dentry *parent, struct rcfs_magf *magf, int magic)
+{
+       struct qstr qstr;
+       struct dentry *mfdentry ;
+
+       // Get new dentry for name  
+       qstr.name = magf->name;
+       qstr.len = strlen(magf->name);
+       qstr.hash = full_name_hash(magf->name,qstr.len);
+       mfdentry = lookup_hash(&qstr,parent);
+
+       if (!IS_ERR(mfdentry)) {
+               int err; 
+
+               down(&parent->d_inode->i_sem);
+               if (magic && (magf->mode & S_IFDIR))
+                       err = parent->d_inode->i_op->mkdir(parent->d_inode,
+                                                  mfdentry, magf->mode);
+               else {
+                       err =_rcfs_mknod(parent->d_inode,mfdentry,
+                                        magf->mode,0);
+                       // _rcfs_mknod doesn't increment parent's link count, 
+                       // i_op->mkdir does.
+                       parent->d_inode->i_nlink++;
+               }
+               up(&parent->d_inode->i_sem);
+
+               if (err) {
+                       dput(mfdentry);
+                       return mfdentry;
+               }
+       }
+       return mfdentry ;
+}
+EXPORT_SYMBOL(rcfs_create_internal);
+
+// Remove a magic-file dentry from its parent directory, taking the
+// victim's i_sem around the rmdir/unlink.  A negative dentry (no
+// inode) is treated as already gone and returns 0.
+// Returns 0 on success, -EINVAL if @mfdentry or its parent is missing.
+int 
+rcfs_delete_internal(struct dentry *mfdentry)
+{
+       struct dentry *parent ;
+
+       if (!mfdentry || !mfdentry->d_parent)
+               return -EINVAL;
+       
+       parent = mfdentry->d_parent;
+
+       if (!mfdentry->d_inode) {
+               return 0;
+       }
+       down(&mfdentry->d_inode->i_sem);
+       if (S_ISDIR(mfdentry->d_inode->i_mode))
+               simple_rmdir(parent->d_inode, mfdentry);
+       else
+               simple_unlink(parent->d_inode, mfdentry);
+       up(&mfdentry->d_inode->i_sem);
+
+       // drop the dentry from the dcache
+       d_delete(mfdentry);
+
+       return 0;
+}
+EXPORT_SYMBOL(rcfs_delete_internal);
+
+// Inode operations for regular (magic) files; only getattr is needed,
+// file ops are supplied per magic file elsewhere.
+struct inode_operations rcfs_file_inode_operations = {
+       .getattr        = simple_getattr,
+};
+               
+
+
+
+
+
diff --git a/fs/rcfs/magic.c b/fs/rcfs/magic.c
new file mode 100644 (file)
index 0000000..ad92a07
--- /dev/null
@@ -0,0 +1,546 @@
+/* 
+ * fs/rcfs/magic.c 
+ *
+ * Copyright (C) Shailabh Nagar,      IBM Corp. 2004
+ *           (C) Vivek Kashyap,       IBM Corp. 2004
+ *           (C) Chandra Seetharaman, IBM Corp. 2004
+ *           (C) Hubertus Franke,     IBM Corp. 2004
+ * 
+ * File operations for common magic files in rcfs, 
+ * the user interface for CKRM. 
+ * 
+ * 
+ * Latest version, more details at http://ckrm.sf.net
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+/* Changes
+ *
+ * 23 Apr 2004
+ *        Created from code kept earlier in fs/rcfs/magic_*.c
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <asm/namei.h>
+#include <linux/namespace.h>
+#include <linux/dcache.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/parser.h>
+#include <asm/uaccess.h>
+
+#include <linux/rcfs.h>
+
+
+
+
+/******************************************************
+ * Macros
+ *
+ * generic macros to assist in writing magic fileops
+ *
+ *****************************************************/
+
+
+/* MAGIC_SHOW(FUNC) expands to a static FUNC_show() seq_file show
+ * routine: it fetches the core class stored as the seq_file's private
+ * data (the parent directory's rcfs_inode_info), validates it with
+ * ckrm_is_core_valid(), and delegates to the classtype's
+ * show_FUNC hook when one is registered.  Returns 0 when the
+ * classtype provides no hook. */
+#define MAGIC_SHOW(FUNC)                                               \
+static int                                                             \
+FUNC ## _show(struct seq_file *s, void *v)                            \
+{                                                                     \
+       int rc=0;                                                      \
+       ckrm_core_class_t *core ;                                      \
+                                                                      \
+       core = (ckrm_core_class_t *)                                   \
+               (((struct rcfs_inode_info *)s->private)->core);        \
+                                                                      \
+       if (!ckrm_is_core_valid(core)) {                               \
+               return -EINVAL;                                        \
+        }                                                              \
+                                                                       \
+       if (core->classtype->show_ ## FUNC)                            \
+               rc = (* core->classtype->show_ ## FUNC)(core, s);      \
+                                                                      \
+       return rc;                                                     \
+};                                                                      
+
+/* MAGIC_OPEN(FUNC) expands to a static FUNC_open() that wires the
+ * magic file to seq_file's single_open(), passing the PARENT
+ * directory's rcfs_inode_info as private data (the parent dir is the
+ * class; the magic file is just an attribute of it). */
+#define MAGIC_OPEN(FUNC)                                               \
+static int                                                             \
+FUNC ## _open(struct inode *inode, struct file *file)                  \
+{                                                                      \
+       struct rcfs_inode_info *ri;                                    \
+       int ret=-EINVAL;                                               \
+                                                                      \
+       if (file->f_dentry && file->f_dentry->d_parent) {              \
+                                                                      \
+               ri = RCFS_I(file->f_dentry->d_parent->d_inode);        \
+               ret = single_open(file,FUNC ## _show, (void *)ri);     \
+       }                                                              \
+       return ret;                                                    \
+}                                                                     
+                                                                      
+/* MAGIC_CLOSE(FUNC) expands to a static FUNC_close() that simply
+ * releases the seq_file state allocated by single_open(). */
+#define MAGIC_CLOSE(FUNC)                                              \
+static int                                                             \
+FUNC ## _close(struct inode *inode, struct file *file)                \
+{                                                                     \
+       return single_release(inode,file);                             \
+}
+                                                                      
+
+
+/* MAGIC_PARSE(FUNC) expands to a static FUNC_parse() that splits a
+ * comma-separated option string using the FUNC_tokens match table.
+ * The value of a "res=" token is returned through *resstr and the
+ * FUNC-specific value through *otherstr; both are match_strdup()'ed,
+ * so the CALLER must kfree() them.  Returns 1 on success (including
+ * a NULL options string), 0 on any unrecognized token. */
+#define MAGIC_PARSE(FUNC)                                              \
+static int                                                             \
+FUNC ## _parse(char *options, char **resstr, char **otherstr)         \
+{                                                                     \
+       char *p;                                                       \
+                                                                      \
+       if (!options)                                                  \
+               return 1;                                              \
+                                                                      \
+       while ((p = strsep(&options, ",")) != NULL) {                  \
+               substring_t args[MAX_OPT_ARGS];                        \
+               int token;                                             \
+                                                                      \
+               if (!*p)                                               \
+                       continue;                                      \
+                                                                      \
+               token = match_token(p, FUNC##_tokens, args);           \
+               switch (token) {                                       \
+               case FUNC ## _res_type:                                \
+                       *resstr = match_strdup(args);                  \
+                       break;                                         \
+               case FUNC ## _str:                                     \
+                       *otherstr = match_strdup(args);                \
+                       break;                                         \
+               default:                                               \
+                       return 0;                                      \
+               }                                                      \
+       }                                                              \
+       return 1;                                                      \
+}
+
+#define MAGIC_WRITE(FUNC,CLSTYPEFUN)                                   \
+static ssize_t                                                         \
+FUNC ## _write(struct file *file, const char __user *buf,             \
+                          size_t count, loff_t *ppos)                 \
+{                                                                     \
+       struct rcfs_inode_info *ri =                                   \
+               RCFS_I(file->f_dentry->d_parent->d_inode);             \
+       char *optbuf, *otherstr=NULL, *resname=NULL;                   \
+       int done, rc = 0;                                              \
+       ckrm_core_class_t *core ;                                      \
+                                                                      \
+       core = ri->core;                                               \
+       if (!ckrm_is_core_valid(core))                                 \
+               return -EINVAL;                                        \
+                                                                      \
+       if ((ssize_t) count < 0                                        \
+           || (ssize_t) count > FUNC ## _max_input_size)              \
+               return -EINVAL;                                        \
+                                                                      \
+       if (!access_ok(VERIFY_READ, buf, count))                       \
+               return -EFAULT;                                        \
+                                                                      \
+       down(&(ri->vfs_inode.i_sem));                                  \
+                                                                      \
+       optbuf = kmalloc(FUNC ## _max_input_size, GFP_KERNEL);         \
+       __copy_from_user(optbuf, buf, count);                          \
+       if (optbuf[count-1] == '\n')                                   \
+               optbuf[count-1]='\0';                                  \
+                                                                      \
+       done = FUNC ## _parse(optbuf, &resname, &otherstr);            \
+                                                                      \
+       if (!done) {                                                   \
+               printk(KERN_ERR "Error parsing FUNC \n");              \
+               goto FUNC ## _write_out;                               \
+       }                                                              \
+                                                                      \
+       if (core->classtype-> CLSTYPEFUN) {                            \
+               rc = (*core->classtype->CLSTYPEFUN)                    \
+                       (core, resname, otherstr);                     \
+               if (rc) {                                              \
+                       printk(KERN_ERR "FUNC_write: CLSTYPEFUN error\n");   \
+                       goto FUNC ## _write_out;                       \
+               }                                                      \
+       }                                                              \
+                                                                      \
+FUNC ## _write_out:                                                   \
+       up(&(ri->vfs_inode.i_sem));                                    \
+       kfree(optbuf);                                                 \
+       kfree(otherstr);                                               \
+       kfree(resname);                                                \
+       return rc ? rc : count;                                        \
+}
+                                                                      
+                                                                      
+/* MAGIC_RD_FILEOPS(FUNC) defines and exports FUNC_fileops for a
+ * read-only magic file (seq_file based read path only). */
+#define MAGIC_RD_FILEOPS(FUNC)                                         \
+struct file_operations FUNC ## _fileops = {                            \
+       .open           = FUNC ## _open,                               \
+       .read           = seq_read,                                    \
+       .llseek         = seq_lseek,                                   \
+       .release        = FUNC ## _close,                              \
+};                                                                     \
+EXPORT_SYMBOL(FUNC ## _fileops);
+
+                                                                      
+/* MAGIC_RDWR_FILEOPS(FUNC) is the read/write variant: same read path
+ * plus the FUNC_write() method generated by MAGIC_WRITE(). */
+#define MAGIC_RDWR_FILEOPS(FUNC)                                       \
+struct file_operations FUNC ## _fileops = {                            \
+       .open           = FUNC ## _open,                               \
+       .read           = seq_read,                                    \
+       .llseek         = seq_lseek,                                   \
+       .release        = FUNC ## _close,                              \
+       .write          = FUNC ## _write,                              \
+};                                                                     \
+EXPORT_SYMBOL(FUNC ## _fileops);
+
+
+/********************************************************************************
+ * Target
+ *
+ * pseudo file for manually reclassifying members to a class
+ *
+ *******************************************************************************/
+
+#define TARGET_MAX_INPUT_SIZE 100
+
+/* target_write
+ * Write handler for the "target" magic file: pass the user-supplied
+ * string (typically a pid) to the classtype's forced_reclassify hook
+ * to manually reclassify a member into this class.
+ *
+ * Returns count on success, negative errno otherwise.
+ *
+ * Fixes vs. original: kmalloc() checked, count==0 rejected (would
+ * index optbuf[-1]), buffer always NUL-terminated before use.
+ *
+ * NOTE(review): unlike shares_write(), ri->core is dereferenced here
+ * without a ckrm_is_core_valid() check -- confirm the core cannot be
+ * torn down while the magic file is open.
+ */
+static ssize_t
+target_write(struct file *file, const char __user *buf,
+                          size_t count, loff_t *ppos)
+{
+       struct rcfs_inode_info *ri= RCFS_I(file->f_dentry->d_inode);
+       char *optbuf;
+       int rc = -EINVAL;
+       ckrm_classtype_t *clstype;
+
+
+       if ((ssize_t) count <= 0 || (ssize_t) count > TARGET_MAX_INPUT_SIZE)
+               return -EINVAL;
+       
+       if (!access_ok(VERIFY_READ, buf, count))
+               return -EFAULT;
+       
+       down(&(ri->vfs_inode.i_sem));
+       
+       /* +1 so the copied data can always be NUL-terminated */
+       optbuf = kmalloc(TARGET_MAX_INPUT_SIZE + 1, GFP_KERNEL);
+       if (!optbuf) {
+               up(&(ri->vfs_inode.i_sem));
+               return -ENOMEM;
+       }
+       __copy_from_user(optbuf, buf, count);
+       optbuf[count] = '\0';
+       if (optbuf[count-1] == '\n')
+               optbuf[count-1]='\0';
+
+       clstype = ri->core->classtype;
+       if (clstype->forced_reclassify)
+               rc = (* clstype->forced_reclassify)(ri->core,optbuf);
+
+       up(&(ri->vfs_inode.i_sem));
+       kfree(optbuf);
+       return !rc ? count : rc;
+
+}
+
+/* "target" is write-only: there is nothing meaningful to read back. */
+struct file_operations target_fileops = {
+       .write          = target_write,
+};
+EXPORT_SYMBOL(target_fileops);
+
+
+
+/********************************************************************************
+ * Config
+ *
+ * Set/get configuration parameters of a class. 
+ *
+ *******************************************************************************/
+
+/* Currently there are no per-class config parameters defined.
+ * Use existing code as a template
+ */
+                                                                      
+#define config_max_input_size  300
+
+/* Token ids for the "config" magic file parser (MAGIC_PARSE below);
+ * config_err must stay last so unmatched input maps to it. */
+enum config_token_t {
+         config_str, config_res_type, config_err
+};
+
+/* Input grammar: "res=<name>,config=<string>". */
+static match_table_t config_tokens = {
+       {config_res_type,"res=%s"},
+       {config_str, "config=%s"},
+        {config_err, NULL},
+};
+
+
+/* Generate the parse/write/show/open/close methods and the
+ * read-write file_operations for the "config" magic file. */
+MAGIC_PARSE(config);
+MAGIC_WRITE(config,set_config);
+MAGIC_SHOW(config);
+MAGIC_OPEN(config);
+MAGIC_CLOSE(config);
+
+MAGIC_RDWR_FILEOPS(config);
+
+
+/********************************************************************************
+ * Members
+ *
+ * List members of a class
+ *
+ *******************************************************************************/
+
+/* "members" is read-only, so no _parse/_write methods are generated. */
+MAGIC_SHOW(members);
+MAGIC_OPEN(members);
+MAGIC_CLOSE(members);
+
+MAGIC_RD_FILEOPS(members);
+
+
+/********************************************************************************
+ * Stats
+ *
+ * Get/reset class statistics
+ * No standard set of stats defined. Each resource controller chooses
+ * its own set of statistics to maintain and export.
+ *
+ *******************************************************************************/
+
+#define stats_max_input_size  50
+
+/* Token ids for the "stats" magic file parser; stats_err must stay
+ * last so unmatched input maps to it. */
+enum stats_token_t {
+         stats_res_type, stats_str,stats_err
+};
+
+/* Input grammar: "res=<name>,<string>" (the NULL pattern for
+ * stats_str matches any remaining token). */
+static match_table_t stats_tokens = {
+       {stats_res_type,"res=%s"},
+       {stats_str, NULL},
+        {stats_err, NULL},
+};
+
+
+/* Generate the methods and read-write file_operations for "stats";
+ * writes reset statistics via the classtype's reset_stats hook. */
+MAGIC_PARSE(stats);
+MAGIC_WRITE(stats,reset_stats);
+MAGIC_SHOW(stats);
+MAGIC_OPEN(stats);
+MAGIC_CLOSE(stats);
+
+MAGIC_RDWR_FILEOPS(stats);
+
+
+/********************************************************************************
+ * Shares
+ *
+ * Set/get shares of a taskclass.
+ * Share types and semantics are defined by rcfs and ckrm core 
+ * 
+ *******************************************************************************/
+
+
+#define SHARES_MAX_INPUT_SIZE  300
+
+/* The enums for the share types should match the indices expected by
+   array parameter to ckrm_set_resshare */
+
+/* Note only the first NUM_SHAREVAL enums correspond to share types,
+   the remaining ones are for token matching purposes */
+
+enum share_token_t {
+        MY_GUAR, MY_LIM, TOT_GUAR, MAX_LIM, SHARE_RES_TYPE, SHARE_ERR
+};
+
+/* Token matching for parsing input to this magic file */
+/* Grammar: res=<name>,guarantee=<n>,limit=<n>,
+ *          total_guarantee=<n>,max_limit=<n> (any subset, any order) */
+static match_table_t shares_tokens = {
+       {SHARE_RES_TYPE, "res=%s"},
+        {MY_GUAR, "guarantee=%d"},
+        {MY_LIM,  "limit=%d"},
+       {TOT_GUAR,"total_guarantee=%d"},
+       {MAX_LIM, "max_limit=%d"},
+        {SHARE_ERR, NULL}
+};
+
+
+/* shares_parse
+ * Parse a comma-separated shares specification (see shares_tokens)
+ * into @shares; the "res=" value is match_strdup()'ed into *resstr
+ * (caller frees).  Fields not mentioned in @options are left
+ * untouched, so @shares should be pre-initialized by the caller.
+ * Returns 1 on success (including NULL @options), 0 on an
+ * unrecognized token or a malformed integer.
+ */
+static int
+shares_parse(char *options, char **resstr, struct ckrm_shares *shares)
+{
+       char *p;
+       int option;
+
+       if (!options)
+               return 1;
+       
+       while ((p = strsep(&options, ",")) != NULL) {
+               
+               substring_t args[MAX_OPT_ARGS];
+               int token;
+               
+               if (!*p)
+                       continue;
+
+               token = match_token(p, shares_tokens, args);
+               switch (token) {
+               case SHARE_RES_TYPE:
+                       *resstr = match_strdup(args);
+                       break;
+               case MY_GUAR:
+                       if (match_int(args, &option))
+                               return 0;
+                       shares->my_guarantee = option;
+                       break;
+               case MY_LIM:
+                       if (match_int(args, &option))
+                               return 0;
+                       shares->my_limit = option;
+                       break;
+               case TOT_GUAR:
+                       if (match_int(args, &option))
+                               return 0;
+                       shares->total_guarantee = option;
+                       break;
+               case MAX_LIM:
+                       if (match_int(args, &option))
+                               return 0;
+                       shares->max_limit = option;
+                       break;
+               default:
+                       return 0;
+               }
+
+       }
+       return 1;
+}      
+
+
+/* shares_write
+ * Write handler for the "shares" magic file: parse the user string
+ * into a ckrm_shares (fields default to CKRM_SHARE_UNCHANGED) and
+ * hand it to the classtype's set_shares hook.
+ *
+ * Returns count on success, negative errno otherwise.
+ *
+ * Fixes vs. original: resname is initialized to NULL (it was passed
+ * to kfree() uninitialized when the input carried no "res=" token),
+ * kmalloc() is checked, count==0 is rejected (would index
+ * optbuf[-1]), and the copied data is always NUL-terminated.
+ */
+static ssize_t
+shares_write(struct file *file, const char __user *buf,
+                          size_t count, loff_t *ppos)
+{
+       struct inode *inode = file->f_dentry->d_inode;
+       struct rcfs_inode_info *ri;
+       char *optbuf;
+       int rc = 0;
+       struct ckrm_core_class *core;
+       int done;
+       char *resname = NULL;
+
+       struct ckrm_shares newshares = {
+               CKRM_SHARE_UNCHANGED,
+               CKRM_SHARE_UNCHANGED,
+               CKRM_SHARE_UNCHANGED,
+               CKRM_SHARE_UNCHANGED,
+               CKRM_SHARE_UNCHANGED,
+               CKRM_SHARE_UNCHANGED
+       };
+
+       if ((ssize_t) count <= 0 || (ssize_t) count > SHARES_MAX_INPUT_SIZE)
+               return -EINVAL;
+       
+       if (!access_ok(VERIFY_READ, buf, count))
+               return -EFAULT;
+
+       /* The parent directory represents the class being modified. */
+       ri = RCFS_I(file->f_dentry->d_parent->d_inode);
+
+       if (!ri || !ckrm_is_core_valid((ckrm_core_class_t *)(ri->core))) {
+               printk(KERN_ERR "shares_write: Error accessing core class\n");
+               return -EFAULT;
+       }
+       
+       down(&inode->i_sem);
+       
+       core = ri->core; 
+       /* +1 so optbuf[count] = '\0' below is always in bounds */
+       optbuf = kmalloc(SHARES_MAX_INPUT_SIZE + 1, GFP_KERNEL);
+       if (!optbuf) {
+               up(&inode->i_sem);
+               return -ENOMEM;
+       }
+       __copy_from_user(optbuf, buf, count);
+       optbuf[count] = '\0';
+       if (optbuf[count-1] == '\n')
+               optbuf[count-1]='\0';
+
+       done = shares_parse(optbuf, &resname, &newshares);
+       if (!done) {
+               printk(KERN_ERR "Error parsing shares\n");
+               rc = -EINVAL;
+               goto write_out;
+       }
+
+       if (core->classtype->set_shares) {
+               rc = (*core->classtype->set_shares)(core,resname,&newshares);
+               if (rc) {
+                       printk(KERN_ERR "shares_write: resctlr share set error\n");
+                       goto write_out;
+               }
+       }
+       
+       printk(KERN_ERR "Set %s shares to %d %d %d %d\n",
+              resname,
+              newshares.my_guarantee, 
+              newshares.my_limit, 
+              newshares.total_guarantee,
+              newshares.max_limit);
+      
+       rc = count ;
+
+write_out:     
+
+       up(&inode->i_sem);
+       kfree(optbuf);
+       kfree(resname);
+       return rc;
+}
+
+
+/* shares uses the generic show/open/close, but its write method is
+ * the hand-written shares_write() above (it parses into a struct
+ * rather than two strings, so MAGIC_WRITE does not fit). */
+MAGIC_SHOW(shares);
+MAGIC_OPEN(shares);
+MAGIC_CLOSE(shares);
+
+MAGIC_RDWR_FILEOPS(shares);
+
+
+
+/*
+ * magic file creation/deletion
+ *
+ */
+
+
+/* rcfs_clear_magic
+ * Delete every magic file directly under @parent, identified by the
+ * RCFS_IS_MAGIC marker in d_fsdata (see rcfs_is_magic /
+ * rcfs_create_magic).  Non-magic children are left alone.
+ * Always returns 0; per-entry deletion failures are only logged.
+ */
+int 
+rcfs_clear_magic(struct dentry *parent)
+{
+       struct dentry *mftmp, *mfdentry ;
+
+       list_for_each_entry_safe(mfdentry, mftmp, &parent->d_subdirs, d_child) {
+               
+               if (!rcfs_is_magic(mfdentry))
+                       continue ;
+
+               if (rcfs_delete_internal(mfdentry)) 
+                       printk(KERN_ERR "rcfs_clear_magic: error deleting one\n");
+       }
+
+       return 0;
+  
+}
+EXPORT_SYMBOL(rcfs_clear_magic);
+
+
+/* rcfs_create_magic
+ * Create @count magic files under @parent from the descriptor array
+ * @magf, marking each as magic (d_fsdata) and propagating the
+ * parent's core class to each new inode.  On any failure all magic
+ * files already created under @parent are removed again.
+ *
+ * Returns 0 on success, -ENOMEM on failure.
+ *
+ * Fix vs. original: rcfs_create_internal() can return a dentry whose
+ * inode creation failed (d_inode == NULL) rather than an ERR_PTR;
+ * that case used to fall through and oops on RCFS_I(NULL).
+ */
+int 
+rcfs_create_magic(struct dentry *parent, struct rcfs_magf magf[], int count)
+{
+       int i;
+       struct dentry *mfdentry;
+
+       for (i=0; i<count; i++) {
+               mfdentry = rcfs_create_internal(parent, &magf[i],0);
+               if (IS_ERR(mfdentry) || !mfdentry->d_inode) {
+                       rcfs_clear_magic(parent);
+                       return -ENOMEM;
+               }
+               RCFS_I(mfdentry->d_inode)->core = RCFS_I(parent->d_inode)->core;
+               mfdentry->d_fsdata = &RCFS_IS_MAGIC;
+               if (magf[i].i_fop)
+                       mfdentry->d_inode->i_fop = magf[i].i_fop;
+               if (magf[i].i_op)
+                       mfdentry->d_inode->i_op = magf[i].i_op;
+       }
+       return 0;
+}
+EXPORT_SYMBOL(rcfs_create_magic);
diff --git a/fs/rcfs/rootdir.c b/fs/rcfs/rootdir.c
new file mode 100644 (file)
index 0000000..fe3415d
--- /dev/null
@@ -0,0 +1,244 @@
+/* 
+ * fs/rcfs/rootdir.c 
+ *
+ * Copyright (C)   Vivek Kashyap,   IBM Corp. 2004
+ *           
+ * 
+ * Functions for creating root directories and magic files 
+ * for classtypes and classification engines under rcfs
+ *
+ * Latest version, more details at http://ckrm.sf.net
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+/* Changes
+ *
+ * 08 April 2004
+ *        Created.
+ */
+
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <asm/namei.h>
+#include <linux/namespace.h>
+#include <linux/dcache.h>
+#include <linux/seq_file.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include <linux/parser.h>
+
+#include <asm/uaccess.h>
+
+#include <linux/rcfs.h>
+
+
+
+/* Callbacks into the (single, optional) classification engine,
+ * installed via rcfs_register_engine() below.  Both hooks are NULL
+ * while no engine is registered. */
+rbce_eng_callback_t rcfs_eng_callbacks = {
+       NULL, NULL
+};
+
+/* rcfs_register_engine
+ * Install @rcbs as the classification engine's callback set.
+ * A mkdir hook is mandatory, and only one engine may be registered
+ * at a time.  Returns 0 on success, -EINVAL otherwise.
+ */
+int
+rcfs_register_engine(rbce_eng_callback_t *rcbs)
+{
+       /* Refuse a callback set without a mkdir hook. */
+       if (!rcbs->mkdir)
+               return -EINVAL;
+       /* Refuse to displace an already-registered engine. */
+       if (rcfs_eng_callbacks.mkdir)
+               return -EINVAL;
+
+       rcfs_eng_callbacks = *rcbs;
+       return 0;
+}
+EXPORT_SYMBOL(rcfs_register_engine);
+
+
+
+/* rcfs_unregister_engine
+ * Remove the engine callbacks, but only if @rcbs carries the same
+ * mkdir hook that was registered (i.e. the caller is the owner).
+ * Returns 0 on success, -EINVAL otherwise.
+ */
+int
+rcfs_unregister_engine(rbce_eng_callback_t *rcbs)
+{
+       int mismatch = !rcbs->mkdir
+               || !rcfs_eng_callbacks.mkdir
+               || rcbs->mkdir != rcfs_eng_callbacks.mkdir;
+
+       if (mismatch)
+               return -EINVAL;
+
+       rcfs_eng_callbacks.mkdir = NULL;
+       rcfs_eng_callbacks.rmdir = NULL;
+       return 0;
+}
+EXPORT_SYMBOL(rcfs_unregister_engine);
+
+
+
+
+/* rcfs_mkroot
+ * Create and return a "root" dentry under /rcfs. Also create associated magic files 
+ *
+ * @mfdesc: array of rcfs_magf describing root dir and its magic files
+ * @count: number of entries in mfdesc
+ * @core:  core class to be associated with root
+ * @rootde: output parameter to return the newly created root dentry
+ */
+
+/* rcfs_mkroot
+ * Create a "root" dentry under /rcfs described by mfdesc[0] and
+ * record its full path name in its rcfs_inode_info.  Associated
+ * magic files are created separately (rcfs_create_magic()).
+ *
+ * @mfdesc:  array of rcfs_magf; entry 0 describes the root dir
+ * @mfcount: number of entries in mfdesc
+ * @rootde:  output parameter for the newly created root dentry
+ *
+ * Returns 0 on success, -EINVAL on bad arguments, -ENOMEM on
+ * creation/allocation failure.
+ *
+ * Fixes vs. original: rcfs_create_internal() failures are returned
+ * as ERR_PTRs, which the old "if (!dentry)" check let through; the
+ * duplicated rootdesc initialization is also gone.
+ */
+int 
+rcfs_mkroot(struct rcfs_magf *mfdesc, int mfcount, struct dentry **rootde)
+{
+       int sz;
+       struct rcfs_magf *rootdesc;
+       struct dentry *dentry ;
+       struct rcfs_inode_info *rootri;
+
+       if ((mfcount < 0) || (!mfdesc))
+               return -EINVAL;
+       
+       rootdesc = &mfdesc[0];
+       printk("allocating classtype root <%s>\n",rootdesc->name);
+       dentry = rcfs_create_internal(rcfs_rootde, rootdesc,0);
+       
+       if (!dentry || IS_ERR(dentry)) {
+               printk(KERN_ERR "Could not create %s\n",rootdesc->name);
+               return -ENOMEM;
+       } 
+       
+       rootri = RCFS_I(dentry->d_inode);
+       /* "<RCFS_ROOT>/<name>" plus '/' and terminating NUL */
+       sz = strlen(rootdesc->name) + strlen(RCFS_ROOT) + 2;
+       rootri->name = kmalloc(sz, GFP_KERNEL);
+       if (!rootri->name) {
+               printk(KERN_ERR "Error allocating name for %s\n",
+                      rootdesc->name);
+               rcfs_delete_internal(dentry);
+               return -ENOMEM;
+       }
+       snprintf(rootri->name,sz,"%s/%s",RCFS_ROOT,rootdesc->name);
+       
+       if (rootdesc->i_fop)
+               dentry->d_inode->i_fop = rootdesc->i_fop;
+       if (rootdesc->i_op)
+               dentry->d_inode->i_op = rootdesc->i_op;
+
+       // set output parameters
+       *rootde = dentry;
+
+       return 0;
+}
+EXPORT_SYMBOL(rcfs_mkroot);
+
+
+/* rcfs_rmroot
+ * Tear down a classtype root created by rcfs_mkroot(): remove its
+ * magic files, free the name string allocated by rcfs_mkroot(), and
+ * delete the root dentry itself.
+ * Returns 0 on success, -EINVAL if @rootde is NULL.
+ */
+int 
+rcfs_rmroot(struct dentry *rootde)
+{
+       if (!rootde)
+               return -EINVAL;
+
+       rcfs_clear_magic(rootde);
+       kfree(RCFS_I(rootde->d_inode)->name);
+       rcfs_delete_internal(rootde);
+       return 0;
+}
+EXPORT_SYMBOL(rcfs_rmroot);
+
+
+/* rcfs_register_classtype
+ * Give @clstype a presence under /rcfs: pick its magic-file
+ * descriptor set from the genmfdesc table, create the root directory
+ * (named after the classtype), bind the default class to that root,
+ * and create the per-class magic files.
+ *
+ * Returns 0 on success, or the error from rcfs_mkroot() /
+ * rcfs_create_magic().
+ *
+ * NOTE(review): clstype->mfidx is used to index genmfdesc[] without a
+ * bounds or NULL-entry check -- confirm callers guarantee a valid
+ * index for the configured classtypes.
+ */
+int 
+rcfs_register_classtype(ckrm_classtype_t *clstype)
+{
+       int rc ;
+       struct rcfs_inode_info *rootri;
+       struct rcfs_magf *mfdesc;
+
+       // Initialize mfdesc, mfcount 
+       clstype->mfdesc = (void *) genmfdesc[clstype->mfidx]->rootmf;
+        clstype->mfcount = genmfdesc[clstype->mfidx]->rootmflen;
+
+       mfdesc = (struct rcfs_magf *)clstype->mfdesc;
+       
+       /* rcfs root entry has the same name as the classtype */
+       strncpy(mfdesc[0].name,clstype->name,RCFS_MAGF_NAMELEN) ;
+
+       rc = rcfs_mkroot(mfdesc,clstype->mfcount,
+                               (struct dentry **)&(clstype->rootde));
+       if (rc)
+               return rc;
+
+       /* Bind the classtype's default class to the new root dir. */
+       rootri = RCFS_I(((struct dentry *)(clstype->rootde))->d_inode);
+       rootri->core = clstype->default_class;
+       clstype->default_class->name = rootri->name;
+       ckrm_core_grab(clstype->default_class);
+       
+       // Create magic files under root 
+       if ((rc = rcfs_create_magic(clstype->rootde, &mfdesc[1], 
+                                   clstype->mfcount-1))) {
+               /* Undo the root creation by hand (name was kmalloc'ed
+                * in rcfs_mkroot). */
+               kfree(rootri->name);
+               rcfs_delete_internal(clstype->rootde);
+               return rc;
+       }
+
+       return rc;
+}
+EXPORT_SYMBOL(rcfs_register_classtype);
+
+
+/* rcfs_deregister_classtype
+ * Undo rcfs_register_classtype(): remove the classtype's root from
+ * /rcfs and, on success, detach and drop the default class grabbed
+ * at registration time.  Returns the rcfs_rmroot() result.
+ */
+int 
+rcfs_deregister_classtype(ckrm_classtype_t *clstype)
+{
+       int rc = rcfs_rmroot((struct dentry *)clstype->rootde);
+
+       if (rc == 0) {
+               /* Root is gone; release the default class reference. */
+               clstype->default_class->name = NULL ;
+               ckrm_core_drop(clstype->default_class);
+       }
+       return rc;
+}
+EXPORT_SYMBOL(rcfs_deregister_classtype);
+
+
+
+// Common root and magic file entries.
+// root name, root permissions, magic file names and magic file permissions are needed by
+// all entities (classtypes and classification engines) existing under the rcfs mount point
+
+// The common sets of these attributes are listed here as a table. Individual classtypes and
+// classification engines can simple specify the index into the table to initialize their
+// magf entries. 
+//
+
+#ifdef CONFIG_CKRM_TYPE_TASKCLASS
+extern struct rcfs_mfdesc tc_mfdesc;
+#endif
+
+/* BUGFIX: this extern was guarded by CONFIG_CKRM_TYPE_TASKCLASS,
+ * which left sock_mfdesc undeclared (compile error) in a
+ * socket-classtype-only configuration; the table below tests
+ * CONFIG_CKRM_TYPE_SOCKETCLASS. */
+#ifdef CONFIG_CKRM_TYPE_SOCKETCLASS
+extern struct rcfs_mfdesc sock_mfdesc;
+#endif
+
+// extern struct rcfs_magf rbce_mfdesc;
+
+
+/* Per-classtype root/magic-file descriptor table, indexed by the
+ * classtype's mfidx (see rcfs_register_classtype()).  Unconfigured
+ * classtypes keep a NULL placeholder so the indices stay stable. */
+struct rcfs_mfdesc *genmfdesc[]={
+#ifdef CONFIG_CKRM_TYPE_TASKCLASS
+       &tc_mfdesc,
+#else
+       NULL,
+#endif
+#ifdef CONFIG_CKRM_TYPE_SOCKETCLASS
+       &sock_mfdesc,
+#else
+       NULL,
+#endif
+// Create similar entry for RBCE ? 
+//#ifdef CONFIG_CKRM_CE
+//     &rbce_mfdesc,
+//#else
+//     NULL,
+//#endif
+
+};
+
+
+
+
diff --git a/fs/rcfs/socket_fs.c b/fs/rcfs/socket_fs.c
new file mode 100644 (file)
index 0000000..492fb09
--- /dev/null
@@ -0,0 +1,338 @@
+/* ckrm_socketaq.c 
+ *
+ * Copyright (C) Vivek Kashyap,      IBM Corp. 2004
+ * 
+ * Latest version, more details at http://ckrm.sf.net
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+/* Changes
+ * Initial version
+ */
+
+/*******************************************************************************
+ *  Socket class type
+ *   
+ * Defines the root structure for socket based classes. Currently only inbound
+ * connection control is supported based on prioritized accept queues. 
+ ******************************************************************************/
+
+
+#include <linux/rcfs.h>
+#include <net/tcp.h>
+
+extern int rcfs_create(struct inode *,struct dentry *, int, struct nameidata *);
+extern int rcfs_unlink(struct inode *, struct dentry *);
+extern int  rcfs_symlink(struct inode *, struct dentry *, const char *);
+extern int rcfs_mknod(struct inode *, struct dentry *, int mode, dev_t);
+extern int rcfs_mkdir(struct inode *, struct dentry *, int);
+extern int rcfs_rmdir(struct inode *, struct dentry *);
+extern int rcfs_rename(struct inode *, struct dentry *, struct inode *, 
+               struct dentry *);
+
+extern int rcfs_create_coredir(struct inode *, struct dentry *);
+int sock_mkdir(struct inode *, struct dentry *, int mode);
+int sock_rmdir(struct inode *, struct dentry *);
+
+
+int sock_create_noperm(struct inode *, struct dentry *,int, struct nameidata *);
+int sock_unlink_noperm(struct inode *,struct dentry *);
+int sock_mkdir_noperm(struct inode *,struct dentry *,int);
+int sock_rmdir_noperm(struct inode *,struct dentry *);
+int sock_mknod_noperm(struct inode *,struct dentry *,int, dev_t);
+
+void sock_set_directory(void);
+
+extern struct file_operations config_fileops,
+                       members_fileops,
+                       shares_fileops,
+                       stats_fileops,
+                       target_fileops;
+
+
+/*
+ * Inode ops for the socket classtype root and its magic files: directory
+ * creation/removal go through sock_mkdir/sock_rmdir (which layer accept-queue
+ * magic files on top of the rcfs defaults); everything else uses the generic
+ * rcfs/libfs handlers.
+ */
+struct inode_operations my_iops = {
+               .create         = rcfs_create,
+               .lookup         = simple_lookup,
+               .link           = simple_link,
+               .unlink         = rcfs_unlink,
+               .symlink        = rcfs_symlink,
+               .mkdir          = sock_mkdir,
+               .rmdir          = sock_rmdir,
+               .mknod          = rcfs_mknod,
+               .rename         = rcfs_rename,
+};
+
+/*
+ * Inode ops installed on a class directory once sock_mkdir() has populated
+ * it: create/unlink/mkdir/rmdir/mknod are denied (-EPERM via the *_noperm
+ * stubs) so users cannot alter the auto-generated contents.
+ */
+struct inode_operations class_iops = {
+               .create         = sock_create_noperm,
+               .lookup         = simple_lookup,
+               .link           = simple_link,
+               .unlink         = sock_unlink_noperm,
+               .symlink        = rcfs_symlink,
+               .mkdir          = sock_mkdir_noperm,
+               .rmdir          = sock_rmdir_noperm,
+               .mknod          = sock_mknod_noperm,
+               .rename         = rcfs_rename,
+};
+
+/*
+ * Inode ops for the per-accept-queue subdirectories; identical policy to
+ * class_iops (user modification denied).
+ */
+struct inode_operations sub_iops = {
+               .create         = sock_create_noperm,
+               .lookup         = simple_lookup,
+               .link           = simple_link,
+               .unlink         = sock_unlink_noperm,
+               .symlink        = rcfs_symlink,
+               .mkdir          = sock_mkdir_noperm,
+               .rmdir          = sock_rmdir_noperm,
+               .mknod          = sock_mknod_noperm,
+               .rename         = rcfs_rename,
+};
+
+/*
+ * Template descriptor reused for each numbered accept-queue directory;
+ * sock_mkdir() writes the queue number into .name before each use.
+ * NOTE(review): .name is never initialized here -- confirm rcfs_magf.name
+ * is an embedded array, otherwise the sprintf in sock_mkdir() writes
+ * through a NULL pointer.
+ */
+struct rcfs_magf def_magf = {
+       .mode = RCFS_DEFAULT_DIR_MODE,
+       .i_op = &sub_iops,
+       .i_fop = NULL,
+};
+
+/*
+ * Root-directory descriptor set for the socket classtype: entry 0 is the
+ * root itself (name filled in from the classtype at registration), the
+ * rest are its magic files.
+ */
+struct rcfs_magf sock_rootdesc[] = {
+       {
+       //      .name = should not be set, copy from classtype name,
+               .mode = RCFS_DEFAULT_DIR_MODE,
+               .i_op = &my_iops,
+               //.i_fop   = &simple_dir_operations,
+               .i_fop = NULL,
+       },
+       {
+               .name = "members",
+               .mode = RCFS_DEFAULT_FILE_MODE,
+               .i_op = &my_iops,
+               .i_fop = &members_fileops,
+       },
+       {
+               .name = "target",
+               .mode = RCFS_DEFAULT_FILE_MODE,
+               .i_op = &my_iops,
+               .i_fop = &target_fileops,
+       },
+};
+
+/* Magic files created in every socket class directory by sock_mkdir(). */
+struct rcfs_magf sock_magf[] = {
+       {
+               .name = "config",
+               .mode = RCFS_DEFAULT_FILE_MODE,
+               .i_op = &my_iops,
+               .i_fop = &config_fileops,
+       },
+       {
+               .name = "members",
+               .mode = RCFS_DEFAULT_FILE_MODE,
+               .i_op = &my_iops,
+               .i_fop =&members_fileops,
+       },
+       {
+               .name = "shares",
+               .mode = RCFS_DEFAULT_FILE_MODE,
+               .i_op = &my_iops,
+               .i_fop = &shares_fileops,
+       },
+       {
+               .name = "stats",
+               .mode = RCFS_DEFAULT_FILE_MODE,
+               .i_op = &my_iops,
+               .i_fop = &stats_fileops,
+       },
+       {
+               .name = "target",
+               .mode = RCFS_DEFAULT_FILE_MODE,
+               .i_op = &my_iops,
+               .i_fop = &target_fileops,
+       },
+};
+
+/* Magic files created in each per-accept-queue subdirectory. */
+struct rcfs_magf sub_magf[] = {
+       {
+               .name = "config",
+               .mode = RCFS_DEFAULT_FILE_MODE,
+               .i_op = &my_iops,
+               .i_fop = &config_fileops,
+       },
+       {
+               .name = "shares",
+               .mode = RCFS_DEFAULT_FILE_MODE,
+               .i_op = &my_iops,
+               .i_fop = &shares_fileops,
+       },
+       {
+               .name = "stats",
+               .mode = RCFS_DEFAULT_FILE_MODE,
+               .i_op = &my_iops,
+               .i_fop = &stats_fileops,
+       },
+};
+
+/* Descriptor exported to rcfs core (see genmfdesc[] in rootdir.c). */
+struct rcfs_mfdesc sock_mfdesc = {
+       .rootmf         = sock_rootdesc,
+       .rootmflen      = (sizeof(sock_rootdesc)/sizeof(struct rcfs_magf)),
+};
+
+
+/* Element counts of the magic-file tables above. */
+#define SOCK_MAX_MAGF (sizeof(sock_magf)/sizeof(struct rcfs_magf))
+#define LAQ_MAX_SUBMAGF (sizeof(sub_magf)/sizeof(struct rcfs_magf))
+
+/*
+ * rmdir for socket class directories: first remove the magic
+ * subdirectories (the numbered accept-queue dirs) via rcfs_rmdir(), then
+ * remove the class directory itself.  Always reports success.
+ *
+ * NOTE(review): a negative child dentry would make the
+ * S_ISDIR(mfdentry->d_inode->i_mode) test dereference a NULL d_inode --
+ * confirm every entry on d_subdirs here is instantiated.
+ */
+int 
+sock_rmdir(struct inode *p, struct dentry *me)
+{
+       struct dentry *mftmp, *mfdentry ;
+
+       // delete all magic sub directories
+       list_for_each_entry_safe(mfdentry, mftmp, &me->d_subdirs, d_child) {
+               if (S_ISDIR(mfdentry->d_inode->i_mode))
+                       rcfs_rmdir(me->d_inode, mfdentry);
+       }
+       // delete ourselves
+       rcfs_rmdir(p,me);
+
+       return 0;
+}
+
+#ifdef NUM_ACCEPT_QUEUES
+#define LAQ_NUM_ACCEPT_QUEUES NUM_ACCEPT_QUEUES
+#else
+#define LAQ_NUM_ACCEPT_QUEUES 0
+#endif
+
+/*
+ * mkdir for the socket classtype root: create the class directory, attach
+ * a core class to it, populate the standard magic files (sock_magf) and
+ * one numbered subdirectory per accept queue (each holding sub_magf
+ * files), then lock the directory down with class_iops.
+ * Returns 0 on success, negative errno otherwise.
+ */
+int
+sock_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+       int retval = 0;
+       int i,j;
+       struct dentry *pentry, *mfdentry;
+
+       retval = _rcfs_mknod(dir, dentry, mode | S_IFDIR, 0);
+       if (retval) {
+               /* Fixed: this path previously returned the stale retval (0),
+                * reporting success to the VFS for a dentry that was never
+                * instantiated.  Propagate the _rcfs_mknod error instead. */
+               printk(KERN_ERR "rcfs_mkdir: error reaching parent\n");
+               return retval;
+       }
+       
+       // Needed if only _rcfs_mknod is used instead of i_op->mkdir
+       dir->i_nlink++;
+
+       retval = rcfs_create_coredir(dir, dentry);
+       if (retval) 
+               goto mkdir_err;
+
+       /* create the default set of magic files */
+       /* NOTE(review): rcfs_create_internal() results are dereferenced
+        * unchecked here and below -- confirm it cannot return NULL. */
+       for (i =0; i < SOCK_MAX_MAGF; i++) {
+               mfdentry = rcfs_create_internal(dentry, &sock_magf[i],0);
+               mfdentry->d_fsdata = &RCFS_IS_MAGIC;
+               RCFS_I(mfdentry->d_inode)->core = 
+                               RCFS_I(dentry->d_inode)->core;
+               if (sock_magf[i].i_fop)
+                       mfdentry->d_inode->i_fop = sock_magf[i].i_fop;
+               if (sock_magf[i].i_op)
+                       mfdentry->d_inode->i_op = sock_magf[i].i_op;
+       }
+       
+       /* One subdirectory per accept queue (queue 0 is the class dir
+        * itself); loop body never runs when LAQ_NUM_ACCEPT_QUEUES <= 1. */
+       for (i=1; i < LAQ_NUM_ACCEPT_QUEUES; i++) {
+               j = sprintf(def_magf.name, "%d",i);
+               def_magf.name[j] = '\0';
+
+               pentry = rcfs_create_internal(dentry, &def_magf,0);
+               retval = rcfs_create_coredir(dentry->d_inode, pentry);
+               if (retval)
+                       goto mkdir_err;
+               for (j=0; j < LAQ_MAX_SUBMAGF; j++) {
+                       mfdentry = rcfs_create_internal(pentry, &sub_magf[j],0);
+                       mfdentry->d_fsdata = &RCFS_IS_MAGIC;
+                       RCFS_I(mfdentry->d_inode)->core = 
+                                       RCFS_I(pentry->d_inode)->core;
+                       if (sub_magf[j].i_fop)
+                               mfdentry->d_inode->i_fop = sub_magf[j].i_fop;
+                       if (sub_magf[j].i_op)
+                               mfdentry->d_inode->i_op = sub_magf[j].i_op;
+               }
+               pentry->d_inode->i_op = &sub_iops;
+       }
+       dentry->d_inode->i_op = &class_iops;
+       return 0;
+
+mkdir_err:
+       /* Undo the link count taken for the new directory. */
+       dir->i_nlink--;
+       return retval;
+}
+#ifndef NUM_ACCEPT_QUEUES
+#define NUM_ACCEPT_QUEUES 0
+#endif
+
+/*
+ * Return a pointer to the last path component of the class name
+ * (the text after the final '/').
+ *
+ * NOTE(review): if c->name contains no '/', p stops at c->name and the
+ * final ++p skips the first character -- presumably class names always
+ * begin with '/'; confirm with the callers.
+ */
+char *
+sock_get_name(struct ckrm_core_class *c)
+{
+       char *p = (char *)c->name;
+       
+       while(*p)
+               p++;
+       while( *p != '/' && p != c->name)
+               p--;
+
+       return ++p;
+}
+
+/*
+ * Permission-denying stubs installed in class_iops/sub_iops: users may
+ * not create, remove, or mknod inside auto-generated class directories.
+ */
+int 
+sock_create_noperm(struct inode *dir,struct dentry *dentry,int mode, struct nameidata *nd)
+{
+       return -EPERM;
+}
+
+int 
+sock_unlink_noperm(struct inode *dir,struct dentry *dentry)
+{
+       return -EPERM;
+}
+
+int 
+sock_mkdir_noperm(struct inode *dir,struct dentry *dentry, int mode)
+{
+       return -EPERM;
+}
+
+int 
+sock_rmdir_noperm(struct inode *dir,struct dentry *dentry)
+{
+       return -EPERM;
+}
+
+int 
+sock_mknod_noperm(struct inode *dir,struct dentry *dentry,int mode, dev_t dev)
+{
+       return -EPERM;
+}
+
+#if 0
+void
+sock_set_directory()
+{
+       struct dentry *pentry, *dentry;
+
+       pentry = rcfs_set_magf_byname("listen_aq", (void *)&my_dir_magf[0]);
+       if (pentry) {
+               dentry = rcfs_create_internal(pentry, &my_dir_magf[1],0);
+               if (my_dir_magf[1].i_fop)
+                       dentry->d_inode->i_fop = my_dir_magf[1].i_fop;
+               RCFS_I(dentry->d_inode)->core = 
+                               RCFS_I(pentry->d_inode)->core;
+               dentry = rcfs_create_internal(pentry, &my_dir_magf[2],0);
+               if (my_dir_magf[2].i_fop)
+                       dentry->d_inode->i_fop = my_dir_magf[2].i_fop;
+               RCFS_I(dentry->d_inode)->core = 
+                               RCFS_I(pentry->d_inode)->core;
+       }
+       else  {
+               printk(KERN_ERR "Could not create /rcfs/listen_aq\n"
+                               "Perhaps /rcfs needs to be mounted\n");
+       }
+}
+#endif
+
diff --git a/fs/rcfs/super.c b/fs/rcfs/super.c
new file mode 100644 (file)
index 0000000..d0e78c4
--- /dev/null
@@ -0,0 +1,288 @@
+/* 
+ * fs/rcfs/super.c 
+ *
+ * Copyright (C) Shailabh Nagar,  IBM Corp. 2004
+ *              Vivek Kashyap,   IBM Corp. 2004
+ *           
+ * Super block operations for rcfs
+ * 
+ *
+ * Latest version, more details at http://ckrm.sf.net
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+/* Changes
+ *
+ * 08 Mar 2004
+ *        Created.
+ */
+
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <asm/namei.h>
+#include <linux/namespace.h>
+#include <linux/dcache.h>
+#include <linux/seq_file.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include <linux/parser.h>
+
+#include <asm/uaccess.h>
+
+#include <linux/rcfs.h>
+#include <linux/ckrm.h>
+
+
+static kmem_cache_t *rcfs_inode_cachep;
+
+
+/* Map a VFS inode to its enclosing rcfs_inode_info (container_of idiom;
+ * valid because rcfs inodes are always embedded in rcfs_inode_info). */
+inline struct rcfs_inode_info *RCFS_I(struct inode *inode)
+{
+       return container_of(inode, struct rcfs_inode_info, vfs_inode);
+}
+EXPORT_SYMBOL(RCFS_I);
+
+
+
+/*
+ * super_op: allocate an inode from the rcfs slab cache; the name field
+ * starts NULL and is filled in by the creation paths.
+ */
+static struct inode *
+rcfs_alloc_inode(struct super_block *sb)
+{
+       struct rcfs_inode_info *ri;
+       ri = (struct rcfs_inode_info *) kmem_cache_alloc(rcfs_inode_cachep, 
+                                                        SLAB_KERNEL);
+       if (!ri)
+               return NULL;
+       ri->name = NULL;
+       return &ri->vfs_inode;
+}
+
+/*
+ * super_op: free the inode's name (kfree(NULL) is a no-op) and return the
+ * rcfs_inode_info to the slab cache.
+ */
+static void 
+rcfs_destroy_inode(struct inode *inode)
+{
+       struct rcfs_inode_info *ri = RCFS_I(inode);
+
+       kfree(ri->name);
+       kmem_cache_free(rcfs_inode_cachep, RCFS_I(inode));
+}
+
+/*
+ * Slab constructor: initialize the embedded VFS inode exactly once per
+ * slab object (standard SLAB_CTOR_CONSTRUCTOR pattern of this era).
+ */
+static void 
+rcfs_init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+{
+       struct rcfs_inode_info *ri = (struct rcfs_inode_info *) foo;
+
+       if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+           SLAB_CTOR_CONSTRUCTOR)
+               inode_init_once(&ri->vfs_inode);
+}
+
+/*
+ * Create the rcfs_inode_info slab cache.  Returns 0 or -ENOMEM.
+ */
+int 
+rcfs_init_inodecache(void)
+{
+       rcfs_inode_cachep = kmem_cache_create("rcfs_inode_cache",
+                               sizeof(struct rcfs_inode_info),
+                               0, SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT,
+                               rcfs_init_once, NULL);
+       if (rcfs_inode_cachep == NULL)
+               return -ENOMEM;
+       return 0;
+}
+
+/* Destroy the slab cache at module unload; warns if objects leaked. */
+void rcfs_destroy_inodecache(void)
+{
+       printk(KERN_WARNING "destroy inodecache was called\n");
+       if (kmem_cache_destroy(rcfs_inode_cachep))
+               printk(KERN_INFO "rcfs_inode_cache: not all structures were freed\n");
+}
+
+struct super_operations rcfs_super_ops =
+{
+       .alloc_inode    = rcfs_alloc_inode,
+       .destroy_inode  = rcfs_destroy_inode,
+       .statfs         = simple_statfs,
+       .drop_inode     = generic_delete_inode,
+};
+
+
+struct dentry *rcfs_rootde; /* redundant since one can also get it from sb */
+static struct inode *rcfs_root;
+static struct rcfs_inode_info *rcfs_rootri;
+
+static int rcfs_mounted;
+
+/*
+ * Fill the rcfs superblock: allocate the root inode/dentry, name the
+ * root RCFS_ROOT, record the singleton globals, then register every known
+ * ckrm classtype so its tree appears under the mount point.  Only one
+ * rcfs mount is permitted at a time (rcfs_mounted guard).
+ * Returns 0, -EPERM (already mounted), or -ENOMEM.
+ */
+static int rcfs_fill_super(struct super_block * sb, void * data, int silent)
+{
+       struct inode * inode;
+       struct dentry * root;
+       struct rcfs_inode_info *rootri;
+       struct ckrm_classtype *clstype;
+       int i,rc;
+
+       sb->s_fs_info = NULL;
+       if (rcfs_mounted) {
+               return -EPERM;
+       }
+       rcfs_mounted++;
+
+       sb->s_blocksize = PAGE_CACHE_SIZE;
+       sb->s_blocksize_bits = PAGE_CACHE_SHIFT;        
+       sb->s_magic = RCFS_MAGIC;
+       sb->s_op = &rcfs_super_ops;
+       inode = rcfs_get_inode(sb, S_IFDIR | 0755, 0);
+       if (!inode)
+               return -ENOMEM;
+       inode->i_op = &rcfs_rootdir_inode_operations;
+
+       root = d_alloc_root(inode);
+       if (!root) {
+               iput(inode);
+               return -ENOMEM;
+       }
+       sb->s_root = root;
+
+       
+       // Link inode and core class 
+       rootri = RCFS_I(inode);
+       rootri->name = kmalloc(strlen(RCFS_ROOT) + 1, GFP_KERNEL);
+       if (!rootri->name) {
+               /* NOTE(review): sb->s_root still points at the deleted
+                * dentry and rcfs_mounted is not decremented on this path --
+                * confirm the unmount path tolerates both. */
+               d_delete(root);
+               iput(inode);
+               return -ENOMEM;
+       }
+       strcpy(rootri->name, RCFS_ROOT);
+       rootri->core = NULL;
+
+       /* Fixed: dropped a redundant "rcfs_root = inode;" dead store -- the
+        * statement below assigns rcfs_root and s_fs_info in one step. */
+       sb->s_fs_info = rcfs_root = inode;
+       rcfs_rootde = root ;
+       rcfs_rootri = rootri ;
+
+       // register metatypes
+       for ( i=0; i<CKRM_MAX_CLASSTYPES; i++) {
+               clstype = ckrm_classtypes[i];
+               if (clstype == NULL) 
+                       continue;
+               printk("A non null classtype\n");
+
+               if ((rc = rcfs_register_classtype(clstype)))
+                       continue ;  // could return with an error too 
+       }
+
+       // register CE's with rcfs 
+       // check if CE loaded
+       // call rcfs_register_engine for each classtype
+       // AND rcfs_mkroot (preferably subsume latter in former) 
+
+       return 0;
+}
+
+
+/* get_sb hook: rcfs is a virtual (nodev) filesystem. */
+static struct super_block *rcfs_get_sb(struct file_system_type *fs_type,
+       int flags, const char *dev_name, void *data)
+{
+       return get_sb_nodev(fs_type, flags, data, rcfs_fill_super);
+}
+
+
+/*
+ * kill_sb hook: for the singleton rcfs superblock, deregister every
+ * classtype (logging but not aborting on failure) before the generic
+ * shutdown; any other superblock just gets the generic shutdown.
+ */
+void 
+rcfs_kill_sb(struct super_block *sb)
+{
+       int i,rc;
+       struct ckrm_classtype *clstype;
+
+       if (sb->s_fs_info != rcfs_root) {
+               generic_shutdown_super(sb);
+               return;
+       }
+       rcfs_mounted--;
+
+       for ( i=0; i < CKRM_MAX_CLASSTYPES; i++) {
+
+               clstype = ckrm_classtypes[i];
+               if (clstype == NULL || clstype->rootde == NULL) 
+                       continue;
+
+               if ((rc = rcfs_deregister_classtype(clstype))) {
+                       printk(KERN_ERR "Error removing classtype %s\n",
+                              clstype->name);
+                       // return ;   // can also choose to stop here
+               }
+       }
+       
+       // do not remove comment block until ce directory issue resolved
+       // deregister CE with rcfs
+       // Check if loaded
+       // if ce is in  one directory /rcfs/ce, 
+       //       rcfs_deregister_engine for all classtypes within above 
+       //             codebase 
+       //       followed by
+       //       rcfs_rmroot here
+       // if ce in multiple (per-classtype) directories
+       //       call rbce_deregister_engine within ckrm_deregister_classtype
+
+       // following will automatically clear rcfs root entry including its 
+       //  rcfs_inode_info
+
+       generic_shutdown_super(sb);
+
+       // printk(KERN_ERR "Removed all entries\n");
+}      
+
+
+static struct file_system_type rcfs_fs_type = {
+       .name           = "rcfs",
+       .get_sb         = rcfs_get_sb,
+       .kill_sb        = rcfs_kill_sb,
+};
+
+struct rcfs_functions my_rcfs_fn = {
+       .mkroot               = rcfs_mkroot,
+       .rmroot               = rcfs_rmroot,
+       .register_classtype   = rcfs_register_classtype,
+       .deregister_classtype = rcfs_deregister_classtype,
+};
+
+extern struct rcfs_functions rcfs_fn ;
+
+/*
+ * Module init: register the filesystem, create the inode slab cache,
+ * and publish the rcfs callback table for the ckrm core.  Unwinds the
+ * filesystem registration if cache creation fails.
+ */
+static int __init init_rcfs_fs(void)
+{
+       int ret;
+
+       ret = register_filesystem(&rcfs_fs_type);
+       if (ret)
+               goto init_register_err;
+
+       ret = rcfs_init_inodecache();
+       if (ret)
+               goto init_cache_err;
+
+       rcfs_fn = my_rcfs_fn ;
+       
+       return ret;
+
+init_cache_err:
+       unregister_filesystem(&rcfs_fs_type);
+init_register_err:
+       return ret;
+}
+
+/* Module exit: tear down the slab cache and unregister the filesystem. */
+static void __exit exit_rcfs_fs(void)
+{
+       rcfs_destroy_inodecache();
+       unregister_filesystem(&rcfs_fs_type);
+}
+
+module_init(init_rcfs_fs)
+module_exit(exit_rcfs_fs)
+
+MODULE_LICENSE("GPL");
diff --git a/fs/rcfs/tc_magic.c b/fs/rcfs/tc_magic.c
new file mode 100644 (file)
index 0000000..1686409
--- /dev/null
@@ -0,0 +1,94 @@
+/* 
+ * fs/rcfs/tc_magic.c 
+ *
+ * Copyright (C) Shailabh Nagar,      IBM Corp. 2004
+ *           (C) Vivek Kashyap,       IBM Corp. 2004
+ *           (C) Chandra Seetharaman, IBM Corp. 2004
+ *           (C) Hubertus Franke,     IBM Corp. 2004
+ *           
+ * 
+ * define magic fileops for taskclass classtype
+ *
+ * Latest version, more details at http://ckrm.sf.net
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+/* Changes
+ *
+ * 23 Apr 2004
+ *        Created.
+ *
+ */
+
+#include <linux/rcfs.h>
+#include <linux/ckrm_tc.h>
+
+
+/*******************************************************************************
+ * Taskclass general
+ *
+ * Define structures for taskclass root directory and its magic files 
+ * In taskclasses, there is one set of magic files, created automatically under
+ * the taskclass root (upon classtype registration) and each directory (class) 
+ * created subsequently. However, classtypes can also choose to have different 
+ * sets of magic files created under their root and other directories under root
+ * using their mkdir function. RCFS only provides helper functions for creating 
+ * the root directory and its magic files
+ * 
+ *******************************************************************************/
+
+/* Mode for taskclass magic files: regular, world-readable, owner-writable. */
+#define TC_FILE_MODE (S_IFREG | S_IRUGO | S_IWUSR) 
+       
+#define NR_TCROOTMF  6
+/*
+ * Taskclass root descriptor set: entry 0 is the root directory (name
+ * copied from the classtype at registration), entries 1..5 its magic
+ * files, all using the generic rcfs inode/file operations.
+ */
+struct rcfs_magf tc_rootdesc[NR_TCROOTMF] = {
+       /* First entry must be root */
+       { 
+//             .name    = should not be set, copy from classtype name
+               .mode    = RCFS_DEFAULT_DIR_MODE,
+               .i_op    = &rcfs_dir_inode_operations,
+               .i_fop   = &simple_dir_operations,
+       },
+       /* Rest are root's magic files */
+       { 
+               .name    =  "target", 
+               .mode    = TC_FILE_MODE, 
+               .i_fop   = &target_fileops,
+               .i_op    = &rcfs_file_inode_operations,
+       },
+       { 
+               .name    =  "config", 
+               .mode    = TC_FILE_MODE, 
+               .i_fop   = &config_fileops, 
+               .i_op    = &rcfs_file_inode_operations,
+       },
+       { 
+               .name    =  "members", 
+               .mode    = TC_FILE_MODE, 
+               .i_fop   = &members_fileops,
+               .i_op    = &rcfs_file_inode_operations,
+       },
+       { 
+               .name    =  "stats", 
+               .mode    = TC_FILE_MODE, 
+               .i_fop   = &stats_fileops, 
+               .i_op    = &rcfs_file_inode_operations,
+       },
+       { 
+               .name    =  "shares", 
+               .mode    = TC_FILE_MODE,
+               .i_fop   = &shares_fileops, 
+               .i_op    = &rcfs_file_inode_operations,
+       },
+};
+
+/* Descriptor exported to rcfs core (see genmfdesc[] in rootdir.c). */
+struct rcfs_mfdesc tc_mfdesc = {
+       .rootmf          = tc_rootdesc,
+       .rootmflen       = NR_TCROOTMF,
+};
+
+
diff --git a/fs/relayfs/Makefile b/fs/relayfs/Makefile
new file mode 100644 (file)
index 0000000..09f098a
--- /dev/null
@@ -0,0 +1,8 @@
+#
+# relayfs Makefile
+#
+
+obj-$(CONFIG_RELAYFS_FS) += relayfs.o
+
+relayfs-y := relay.o relay_lockless.o relay_locking.o inode.o resize.o
+relayfs-$(CONFIG_KLOG_CHANNEL) += klog.o
diff --git a/fs/relayfs/inode.c b/fs/relayfs/inode.c
new file mode 100644 (file)
index 0000000..6e87360
--- /dev/null
@@ -0,0 +1,629 @@
+/*
+ * VFS-related code for RelayFS, a high-speed data relay filesystem.
+ *
+ * Copyright (C) 2003 - Tom Zanussi <zanussi@us.ibm.com>, IBM Corp
+ * Copyright (C) 2003 - Karim Yaghmour <karim@opersys.com>
+ *
+ * Based on ramfs, Copyright (C) 2002 - Linus Torvalds
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include <linux/namei.h>
+#include <linux/poll.h>
+#include <asm/uaccess.h>
+#include <asm/relay.h>
+
+#define RELAYFS_MAGIC                  0x26F82121
+
+static struct super_operations         relayfs_ops;
+static struct address_space_operations relayfs_aops;
+static struct inode_operations         relayfs_file_inode_operations;
+static struct file_operations          relayfs_file_operations;
+static struct inode_operations         relayfs_dir_inode_operations;
+
+static struct vfsmount *               relayfs_mount;
+static int                             relayfs_mount_count;
+
+static struct backing_dev_info         relayfs_backing_dev_info = {
+       .ra_pages       = 0,    /* No readahead */
+       .memory_backed  = 1,    /* Does not contribute to dirty memory */
+};
+
+/*
+ * Allocate and initialize a relayfs inode of the given mode: regular
+ * files get the relayfs file ops, directories the libfs dir ops (with
+ * the extra "." link), symlinks the page-based symlink ops, and anything
+ * else is set up as a special inode.  Returns NULL on allocation failure.
+ */
+static struct inode *
+relayfs_get_inode(struct super_block *sb, int mode, dev_t dev)
+{
+       struct inode * inode;
+       
+       inode = new_inode(sb);
+
+       if (inode) {
+               inode->i_mode = mode;
+               inode->i_uid = current->fsuid;
+               inode->i_gid = current->fsgid;
+               inode->i_blksize = PAGE_CACHE_SIZE;
+               inode->i_blocks = 0;
+               inode->i_mapping->a_ops = &relayfs_aops;
+               inode->i_mapping->backing_dev_info = &relayfs_backing_dev_info;
+               inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+               switch (mode & S_IFMT) {
+               default:
+                       init_special_inode(inode, mode, dev);
+                       break;
+               case S_IFREG:
+                       inode->i_op = &relayfs_file_inode_operations;
+                       inode->i_fop = &relayfs_file_operations;
+                       break;
+               case S_IFDIR:
+                       inode->i_op = &relayfs_dir_inode_operations;
+                       inode->i_fop = &simple_dir_operations;
+
+                       /* directory inodes start off with i_nlink == 2 (for "." entry) */
+                       inode->i_nlink++;
+                       break;
+               case S_IFLNK:
+                       inode->i_op = &page_symlink_inode_operations;
+                       break;
+               }
+       }
+       return inode;
+}
+
+/*
+ * File creation. Allocate an inode, and we're done..
+ */
+/* SMP-safe */
+/*
+ * mknod: allocate an inode, instantiate the dentry, and take an extra
+ * dentry reference to pin it in the dcache (ramfs pattern).
+ * Returns 0 or -ENOSPC.
+ */
+static int 
+relayfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
+{
+       struct inode * inode;
+       int error = -ENOSPC;
+
+       inode = relayfs_get_inode(dir->i_sb, mode, dev);
+
+       if (inode) {
+               d_instantiate(dentry, inode);
+               dget(dentry);   /* Extra count - pin the dentry in core */
+               error = 0;
+       }
+       return error;
+}
+
+/* mkdir: mknod with S_IFDIR, bumping the parent's link count for "..". */
+static int 
+relayfs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
+{
+       int retval;
+
+       retval = relayfs_mknod(dir, dentry, mode | S_IFDIR, 0);
+
+       if (!retval)
+               dir->i_nlink++;
+       return retval;
+}
+
+/* create: mknod with S_IFREG; the nameidata argument is unused. */
+static int 
+relayfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd)
+{
+       return relayfs_mknod(dir, dentry, mode | S_IFREG, 0);
+}
+
+/*
+ * symlink: allocate an S_IFLNK inode, store the target via page_symlink(),
+ * and instantiate/pin the dentry; the inode is released on failure.
+ * Returns 0, -ENOSPC, or the page_symlink() error.
+ */
+static int 
+relayfs_symlink(struct inode * dir, struct dentry *dentry, const char * symname)
+{
+       struct inode *inode;
+       int error = -ENOSPC;
+
+       inode = relayfs_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0);
+
+       if (inode) {
+               int l = strlen(symname)+1;
+               error = page_symlink(inode, symname, l);
+               if (!error) {
+                       d_instantiate(dentry, inode);
+                       dget(dentry);
+               } else
+                       iput(inode);
+       }
+       return error;
+}
+
+/**
+ *     relayfs_create_entry - create a relayfs directory or file
+ *     @name: the name of the file to create
+ *     @parent: parent directory
+ *     @dentry: result dentry
+ *     @entry_type: type of file to create (S_IFREG, S_IFDIR)
+ *     @mode: mode
+ *     @data: data to associate with the file
+ *
+ *     Creates a file or directory with the specified permissions.
+ */
+static int 
+relayfs_create_entry(const char * name, struct dentry * parent, struct dentry **dentry, int entry_type, int mode, void * data)
+{
+       struct qstr qname;
+       struct dentry * d;
+       
+       int error = 0;
+
+       /* Pin the internal relayfs mount so the entry has a home even if
+        * nothing else has relayfs mounted. */
+       error = simple_pin_fs("relayfs", &relayfs_mount, &relayfs_mount_count);
+       if (error) {
+               printk(KERN_ERR "Couldn't mount relayfs: errcode %d\n", error);
+               return error;
+       }
+
+       qname.name = name;
+       qname.len = strlen(name);
+       qname.hash = full_name_hash(name, qname.len);
+
+       /* NULL parent means "create in the relayfs root". */
+       if (parent == NULL)
+               if (relayfs_mount && relayfs_mount->mnt_sb)
+                       parent = relayfs_mount->mnt_sb->s_root;
+
+       if (parent == NULL) {
+               simple_release_fs(&relayfs_mount, &relayfs_mount_count);
+               return -EINVAL;
+       }
+
+       /* Serialize against other directory operations on the parent. */
+       parent = dget(parent);
+       down(&parent->d_inode->i_sem);
+       d = lookup_hash(&qname, parent);
+       if (IS_ERR(d)) {
+               error = PTR_ERR(d);
+               goto release_mount;
+       }
+       
+       if (d->d_inode) {
+               /* NOTE(review): the reference lookup_hash() returned on d is
+                * not dropped on this -EEXIST path -- confirm intentional. */
+               error = -EEXIST;
+               goto release_mount;
+       }
+
+       if (entry_type == S_IFREG)
+               error = relayfs_create(parent->d_inode, d, entry_type | mode, NULL);
+       else
+               error = relayfs_mkdir(parent->d_inode, d, entry_type | mode);
+       if (error)
+               goto release_mount;
+
+       if ((entry_type == S_IFREG) && data) {
+               d->d_inode->u.generic_ip = data;
+               goto exit; /* don't release mount for regular files */
+       }
+
+release_mount:
+       simple_release_fs(&relayfs_mount, &relayfs_mount_count);
+exit:  
+       /* NOTE(review): on the IS_ERR(d) path this stores an ERR_PTR into
+        * *dentry -- callers must check the return code before using it. */
+       *dentry = d;
+       up(&parent->d_inode->i_sem);
+       dput(parent);
+
+       return error;
+}
+
+/**
+ *     relayfs_create_file - create a file in the relay filesystem
+ *     @name: the name of the file to create
+ *     @parent: parent directory
+ *     @dentry: result dentry
+ *     @data: data to associate with the file
+ *     @mode: mode, if not specified the default perms are used
+ *
+ *     The file will be created user rw on behalf of current user.
+ */
+int 
+relayfs_create_file(const char * name, struct dentry * parent, struct dentry **dentry, void * data, int mode)
+{
+       /* mode == 0 selects the default: user read/write only. */
+       if (!mode)
+               mode = S_IRUSR | S_IWUSR;
+       
+       return relayfs_create_entry(name, parent, dentry, S_IFREG,
+                                   mode, data);
+}
+
+/**
+ *     relayfs_create_dir - create a directory in the relay filesystem
+ *     @name: the name of the directory to create
+ *     @parent: parent directory
+ *     @dentry: result dentry
+ *
+ *     The directory will be created world rwx on behalf of current user.
+ */
+int 
+relayfs_create_dir(const char * name, struct dentry * parent, struct dentry **dentry)
+{
+       /* Fixed mode: owner rwx, world read+execute. */
+       return relayfs_create_entry(name, parent, dentry, S_IFDIR,
+                                   S_IRWXU | S_IRUGO | S_IXUGO, NULL);
+}
+
+/**
+ *     relayfs_remove_file - remove a file in the relay filesystem
+ *     @dentry: file dentry
+ *
+ *     Remove a file previously created by relayfs_create_file.
+ */
+int 
+relayfs_remove_file(struct dentry *dentry)
+{
+       struct dentry *parent;
+       int is_reg;
+       
+       parent = dentry->d_parent;
+       if (parent == NULL)
+               return -EINVAL;
+
+       /* Record the type before unlinking: only regular files hold a
+        * mount reference (see relayfs_create_entry). */
+       is_reg = S_ISREG(dentry->d_inode->i_mode);
+
+       parent = dget(parent);
+       down(&parent->d_inode->i_sem);
+       if (dentry->d_inode) {
+               simple_unlink(parent->d_inode, dentry);
+               d_delete(dentry);
+       }
+       /* Drop the pin taken at creation time. */
+       dput(dentry);
+       up(&parent->d_inode->i_sem);
+       dput(parent);
+
+       if(is_reg)
+               simple_release_fs(&relayfs_mount, &relayfs_mount_count);
+
+       return 0;
+}
+
+/**
+ *     relayfs_open - open file op for relayfs files
+ *     @inode: the inode
+ *     @filp: the file
+ *
+ *     Associates the channel with the file, and increments the
+ *     channel refcount.  Reads will be 'auto-consuming'.
+ */
+int
+relayfs_open(struct inode *inode, struct file *filp)
+{
+       struct rchan *rchan;
+       struct rchan_reader *reader;
+       int retval = 0;
+
+       /* Files without an attached channel (generic_ip unset) open as
+        * plain files and return 0 here. */
+       if (inode->u.generic_ip) {
+               rchan = (struct rchan *)inode->u.generic_ip;
+               if (rchan == NULL)
+                       return -EACCES;
+               reader = __add_rchan_reader(rchan, filp, 1, 0);
+               if (reader == NULL)
+                       return -ENOMEM;
+               filp->private_data = reader;
+               retval = rchan->callbacks->fileop_notify(rchan->id, filp,
+                                                        RELAY_FILE_OPEN);
+               if (retval == 0)
+                       /* Inc relay channel refcount for file */
+                       rchan_get(rchan->id);
+               else {
+                       /* Client vetoed the open: undo the reader and
+                        * report -EPERM regardless of callback value. */
+                       __remove_rchan_reader(reader);
+                       retval = -EPERM;
+               }
+       }
+
+       return retval;
+}
+
+/**
+ *	relayfs_mmap - mmap file op for relayfs files
+ *	@filp: the file
+ *	@vma: the vma describing what to map
+ *
+ *	Calls upon relay_mmap_buffer to map the file into user space.
+ */
+int 
+relayfs_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	/* The reader was attached to the file in relayfs_open(). */
+	struct rchan_reader *reader = (struct rchan_reader *)filp->private_data;
+
+	return __relay_mmap_buffer(reader->rchan, vma);
+}
+
+/**
+ *	relayfs_file_read - read file op for relayfs files
+ *	@filp: the file
+ *	@buf: user buf to read into
+ *	@count: bytes requested
+ *	@offset: offset into file
+ *
+ *	Reads count bytes from the channel, or as much as is available within
+ *	the sub-buffer currently being read.  Reads are 'auto-consuming'.
+ *	See relay_read() for details.
+ *
+ *	Returns bytes read on success, 0 or -EAGAIN if nothing available,
+ *	negative otherwise.
+ */
+ssize_t 
+relayfs_file_read(struct file *filp, char * buf, size_t count, loff_t *offset)
+{
+	struct rchan_reader *reader;
+	int blocking;
+	u32 end_pos; /* out-param unused here; all VFS readers auto-consume */
+
+	/* pread / explicit seeking are not supported on channel files */
+	if (offset != &filp->f_pos)
+		return -ESPIPE;
+
+	if (!count)
+		return 0;
+
+	blocking = (filp->f_flags & (O_NDELAY | O_NONBLOCK)) ? 0 : 1;
+	reader = (struct rchan_reader *)filp->private_data;
+
+	return relay_read(reader, buf, count, blocking, &end_pos);
+}
+
+/**
+ *	relayfs_file_write - write file op for relayfs files
+ *	@filp: the file
+ *	@buf: user buf to write from
+ *	@count: bytes to write
+ *	@offset: offset into file
+ *
+ *	Reserves a slot in the relay buffer and writes count bytes
+ *	into it.  The current limit for a single write is 2 pages
+ *	worth.  The user_deliver() channel callback will be invoked
+ *	after a successful write.
+ *	
+ *	Returns bytes written on success, 0 or -EAGAIN if nothing available,
+ *	negative otherwise.
+ */
+ssize_t 
+relayfs_file_write(struct file *filp, const char *buf, size_t count, loff_t *offset)
+{
+	int write_count;
+	char * write_buf;
+	struct rchan *rchan;
+	int err = 0;
+	void *wrote_pos;
+	struct rchan_reader *reader;
+
+	reader = (struct rchan_reader *)filp->private_data;
+	if (reader == NULL)
+		return -EPERM;
+
+	rchan = reader->rchan;
+	if (rchan == NULL)
+		return -EPERM;
+
+	if (count == 0)
+		return 0;
+
+	/* Change this if need to write more than 2 pages at once */
+	if (count > 2 * PAGE_SIZE)
+		return -EINVAL;
+	
+	/* Bounce buffer: order-1 allocation == 2 pages, matching the limit. */
+	write_buf = (char *)__get_free_pages(GFP_KERNEL, 1);
+	if (write_buf == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(write_buf, buf, count)) {
+		err = -EFAULT;
+		goto out_free;
+	}
+
+	if (filp->f_flags & (O_NDELAY | O_NONBLOCK)) {
+		write_count = relay_write(rchan->id, write_buf, count, -1, &wrote_pos);
+		if (write_count == 0) {
+			err = -EAGAIN;
+			goto out_free;
+		}
+	} else {
+		err = wait_event_interruptible(rchan->write_wait,
+		 (write_count = relay_write(rchan->id, write_buf, count, -1, &wrote_pos)));
+		if (err)
+			goto out_free;
+	}
+	
+	/* wrote_pos points into the channel buffer, not write_buf, so the
+	   bounce buffer can be freed before notifying the client. */
+	free_pages((unsigned long)write_buf, 1);
+	
+	rchan->callbacks->user_deliver(rchan->id, wrote_pos, write_count);
+
+	return write_count;
+
+out_free:
+	/* Error paths previously leaked the 2-page bounce buffer. */
+	free_pages((unsigned long)write_buf, 1);
+	return err;
+}
+
+/**
+ *	relayfs_ioctl - ioctl file op for relayfs files
+ *	@inode: the inode
+ *	@filp: the file
+ *	@cmd: the command
+ *	@arg: command arg
+ *
+ *	Passes the specified cmd/arg to the kernel client.  arg may be a 
+ *	pointer to user-space data, in which case the kernel client is 
+ *	responsible for copying the data to/from user space appropriately.
+ *	The kernel client is also responsible for returning a meaningful
+ *	return value for ioctl calls.
+ *	
+ *	Returns result of relay channel callback, -EPERM if unsuccessful.
+ */
+int
+relayfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	struct rchan_reader *reader = (struct rchan_reader *)filp->private_data;
+	struct rchan *rchan;
+
+	/* No reader or no channel means this file can't accept ioctls. */
+	if (reader == NULL)
+		return -EPERM;
+	rchan = reader->rchan;
+	if (rchan == NULL)
+		return -EPERM;
+
+	/* Delegate entirely to the kernel client's callback. */
+	return rchan->callbacks->ioctl(rchan->id, cmd, arg);
+}
+
+/**
+ *	relayfs_poll - poll file op for relayfs files
+ *	@filp: the file
+ *	@wait: poll table
+ *
+ *	Poll implementation.  Reports readable when the channel has
+ *	unconsumed data, writable when it is not full, POLLERR once
+ *	the channel has been finalized.
+ */
+static unsigned int
+relayfs_poll(struct file *filp, poll_table *wait)
+{
+	unsigned int mask = 0;
+	struct rchan_reader *reader = (struct rchan_reader *)filp->private_data;
+	struct rchan *rchan = reader->rchan;
+
+	if (rchan->finalized)
+		return POLLERR;
+
+	if (filp->f_mode & FMODE_READ) {
+		poll_wait(filp, &rchan->read_wait, wait);
+		if (!rchan_empty(reader))
+			mask |= POLLIN | POLLRDNORM;
+	}
+
+	if (filp->f_mode & FMODE_WRITE) {
+		poll_wait(filp, &rchan->write_wait, wait);
+		if (!rchan_full(reader))
+			mask |= POLLOUT | POLLWRNORM;
+	}
+
+	return mask;
+}
+
+/**
+ *	relayfs_release - release file op for relayfs files
+ *	@inode: the inode
+ *	@filp: the file
+ *
+ *	Decrements the channel refcount, as the filesystem is
+ *	no longer using it.
+ */
+int
+relayfs_release(struct inode *inode, struct file *filp)
+{
+	struct rchan_reader *reader = (struct rchan_reader *)filp->private_data;
+	struct rchan *rchan;
+
+	/* Nothing to tear down if open() never attached a reader/channel. */
+	if (reader == NULL || reader->rchan == NULL)
+		return 0;
+	rchan = reader->rchan;
+
+	rchan->callbacks->fileop_notify(rchan->id, filp,
+					RELAY_FILE_CLOSE);
+	__remove_rchan_reader(reader);
+	/* The channel is no longer in use as far as this file is concerned */
+	rchan_put(rchan);
+
+	return 0;
+}
+
+/* Address-space ops: delegate page handling to libfs helpers. */
+static struct address_space_operations relayfs_aops = {
+	.readpage	= simple_readpage,
+	.prepare_write	= simple_prepare_write,
+	.commit_write	= simple_commit_write
+};
+
+/* File ops for relay channel files; relay-specific handlers defined above. */
+static struct file_operations relayfs_file_operations = {
+	.open		= relayfs_open,
+	.read		= relayfs_file_read,
+	.write		= relayfs_file_write,
+	.ioctl		= relayfs_ioctl,
+	.poll		= relayfs_poll,
+	.mmap		= relayfs_mmap,
+	.fsync		= simple_sync_file,
+	.release	= relayfs_release,
+};
+
+/* Inode ops for regular files. */
+static struct inode_operations relayfs_file_inode_operations = {
+	.getattr	= simple_getattr,
+};
+
+/* Inode ops for directories; creation entry points are relayfs-specific. */
+static struct inode_operations relayfs_dir_inode_operations = {
+	.create		= relayfs_create,
+	.lookup		= simple_lookup,
+	.link		= simple_link,
+	.unlink		= simple_unlink,
+	.symlink	= relayfs_symlink,
+	.mkdir		= relayfs_mkdir,
+	.rmdir		= simple_rmdir,
+	.mknod		= relayfs_mknod,
+	.rename		= simple_rename,
+};
+
+/* Superblock ops: in-core fs; inodes are deleted on last reference. */
+static struct super_operations relayfs_ops = {
+	.statfs		= simple_statfs,
+	.drop_inode	= generic_delete_inode,
+};
+
+/*
+ * relayfs_fill_super - initialize a relayfs superblock with a root dir.
+ * Returns 0 on success, -ENOMEM if inode or root dentry allocation fails.
+ */
+static int 
+relayfs_fill_super(struct super_block * sb, void * data, int silent)
+{
+	struct inode *root_inode;
+	struct dentry *root_dentry;
+
+	sb->s_blocksize = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_magic = RELAYFS_MAGIC;
+	sb->s_op = &relayfs_ops;
+
+	root_inode = relayfs_get_inode(sb, S_IFDIR | 0755, 0);
+	if (root_inode == NULL)
+		return -ENOMEM;
+
+	root_dentry = d_alloc_root(root_inode);
+	if (root_dentry == NULL) {
+		iput(root_inode);
+		return -ENOMEM;
+	}
+	sb->s_root = root_dentry;
+
+	return 0;
+}
+
+/* All relayfs mounts share a single superblock (get_sb_single). */
+static struct super_block *
+relayfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return get_sb_single(fs_type, flags, data, relayfs_fill_super);
+}
+
+/* Filesystem type registration record for "relayfs". */
+static struct file_system_type relayfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "relayfs",
+	.get_sb		= relayfs_get_sb,
+	.kill_sb	= kill_litter_super,
+};
+
+/* Module init: register the filesystem, then optionally the klog channel. */
+static int __init 
+init_relayfs_fs(void)
+{
+	int err;
+
+	err = register_filesystem(&relayfs_fs_type);
+#ifdef CONFIG_KLOG_CHANNEL
+	if (err == 0)
+		create_klog_channel();
+#endif
+	return err;
+}
+
+/* Module exit: tear down klog (if configured) before unregistering. */
+static void __exit 
+exit_relayfs_fs(void)
+{
+#ifdef CONFIG_KLOG_CHANNEL
+	remove_klog_channel();
+#endif
+	unregister_filesystem(&relayfs_fs_type);
+}
+
+module_init(init_relayfs_fs)
+module_exit(exit_relayfs_fs)
+
+MODULE_AUTHOR("Tom Zanussi <zanussi@us.ibm.com> and Karim Yaghmour <karim@opersys.com>");
+MODULE_DESCRIPTION("Relay Filesystem");
+MODULE_LICENSE("GPL");
+
diff --git a/fs/relayfs/klog.c b/fs/relayfs/klog.c
new file mode 100644 (file)
index 0000000..3f2d31d
--- /dev/null
@@ -0,0 +1,206 @@
+/*
+ * KLOG                Generic Logging facility built upon the relayfs infrastructure
+ *
+ * Authors:    Hubertus Franke  (frankeh@us.ibm.com)
+ *             Tom Zanussi  (zanussi@us.ibm.com)
+ *
+ *             Please direct all questions/comments to zanussi@us.ibm.com
+ *
+ *             Copyright (C) 2003, IBM Corp
+ *
+ *             This program is free software; you can redistribute it and/or
+ *             modify it under the terms of the GNU General Public License
+ *             as published by the Free Software Foundation; either version
+ *             2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/smp_lock.h>
+#include <linux/console.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/config.h>
+#include <linux/delay.h>
+#include <linux/smp.h>
+#include <linux/sysctl.h>
+#include <linux/relayfs_fs.h>
+#include <linux/klog.h>
+
+/* klog channel id; -1 until create_klog_channel() succeeds */
+static int klog_channel = -1;
+
+/* maximum size of klog formatting buffer beyond which truncation will occur */
+#define KLOG_BUF_SIZE (512)
+/* per-cpu klog formatting buffer; klog() disables local irqs while
+   formatting into the current CPU's slot, so no further locking needed */
+static char buf[NR_CPUS][KLOG_BUF_SIZE];
+
+/*
+ *	klog_enabled determines whether klog()/klog_raw() actually do write
+ *	to the klog channel at any given time. If klog_enabled == 1 they do,
+ *	otherwise they don't.  Settable using sysctl fs.relayfs.klog_enabled.
+ */
+#ifdef CONFIG_KLOG_CHANNEL_AUTOENABLE
+static int klog_enabled = 1;
+#else
+static int klog_enabled = 0;
+#endif
+
+/**
+ *	klog - write a formatted string into the klog channel
+ *	@fmt: format string
+ *
+ *	Returns number of bytes written, negative number on failure.
+ */
+int klog(const char *fmt, ...)
+{
+	va_list args;
+	int len, err;
+	char *cbuf;
+	unsigned long flags;
+
+	if (!klog_enabled || klog_channel < 0) 
+		return 0;
+
+	/* irqs off: protects this CPU's formatting buffer from reentry */
+	local_irq_save(flags);
+	cbuf = buf[smp_processor_id()];
+
+	va_start(args, fmt);
+	len = vsnprintf(cbuf, KLOG_BUF_SIZE, fmt, args);
+	va_end(args);
+
+	if (len < 0) {
+		local_irq_restore(flags);
+		return len;
+	}
+	/* vsnprintf returns the would-be (untruncated) length; clamp it so
+	   relay_write never reads past the end of the per-cpu buffer */
+	if (len >= KLOG_BUF_SIZE)
+		len = KLOG_BUF_SIZE - 1;
+	
+	err = relay_write(klog_channel, cbuf, len, -1, NULL);
+	local_irq_restore(flags);
+
+	return err;
+}
+
+/**
+ *	klog_raw - directly write into the klog channel
+ *	@buf: buffer containing data to write
+ *	@len: # bytes to write
+ *
+ *	Returns number of bytes written, negative number on failure.
+ */
+int klog_raw(const char *buf,int len)
+{
+	/* Silently ignore writes while disabled or before channel creation. */
+	if (!klog_enabled || klog_channel < 0)
+		return 0;
+
+	return relay_write(klog_channel, buf, len, -1, NULL);
+}
+
+/**
+ *	relayfs sysctl data
+ *
+ *	Only sys/fs/relayfs/klog_enabled for now.
+ */
+#define CTL_ENABLE_KLOG		100
+#define CTL_RELAYFS		100
+
+/* handle returned by register_sysctl_table(), used for unregistration */
+static struct ctl_table_header *relayfs_ctl_table_header;
+
+/* leaf entry: fs.relayfs.klog_enabled, plain int via proc_dointvec */
+static struct ctl_table relayfs_table[] =
+{
+	{
+		.ctl_name	= CTL_ENABLE_KLOG,
+		.procname	= "klog_enabled",
+		.data		= &klog_enabled,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		0
+	}
+};
+
+/* "relayfs" directory containing the table above */
+static struct ctl_table relayfs_dir_table[] =
+{
+	{
+		.ctl_name	= CTL_RELAYFS,
+		.procname	= "relayfs",
+		.data		= NULL,
+		.maxlen		= 0,
+		.mode		= 0555,
+		.child		= relayfs_table,
+	},
+	{
+		0
+	}
+};
+
+/* root: attach the "relayfs" directory under the top-level "fs" entry */
+static struct ctl_table relayfs_root_table[] =
+{
+	{
+		.ctl_name	= CTL_FS,
+		.procname	= "fs",
+		.data		= NULL,
+		.maxlen		= 0,
+		.mode		= 0555,
+		.child		= relayfs_dir_table,
+	},
+	{
+		0
+	}
+};
+
+/**
+ *	create_klog_channel - creates channel /mnt/relay/klog
+ *
+ *	Opens a 4-sub-buffer packet-delivery relay channel and, on
+ *	success, registers the fs.relayfs sysctl entries.
+ *
+ *	Returns channel id on success, negative otherwise.
+ */
+int 
+create_klog_channel(void)
+{
+	u32 bufsize, nbufs;
+	u32 channel_flags;
+
+	channel_flags = RELAY_DELIVERY_PACKET | RELAY_USAGE_GLOBAL;
+	channel_flags |= RELAY_SCHEME_ANY | RELAY_TIMESTAMP_ANY;
+
+	/* total channel size is 1 << CONFIG_KLOG_CHANNEL_SHIFT, split 4 ways */
+	bufsize = 1 << (CONFIG_KLOG_CHANNEL_SHIFT - 2);
+	nbufs = 4;
+
+	klog_channel = relay_open("klog",
+				  bufsize,
+				  nbufs,
+				  channel_flags,
+				  NULL,
+				  0,
+				  0,
+				  0,
+				  0,
+				  0,
+				  0,
+				  NULL,
+				  0);
+
+	/* printk with explicit log levels (kernel convention) */
+	if (klog_channel < 0)
+		printk(KERN_ERR "klog channel creation failed, errcode: %d\n", klog_channel);
+	else {
+		printk(KERN_INFO "klog channel created (%u bytes)\n", 1 << CONFIG_KLOG_CHANNEL_SHIFT);
+		relayfs_ctl_table_header = register_sysctl_table(relayfs_root_table, 1);
+	}
+
+	return klog_channel;
+}
+
+/**
+ *	remove_klog_channel - destroys channel /mnt/relay/klog
+ *
+ *	Returns 0, negative otherwise.
+ */
+int
+remove_klog_channel(void)
+{
+	/* Drop the sysctl entries first; header is NULL if never registered. */
+	if (relayfs_ctl_table_header != NULL)
+		unregister_sysctl_table(relayfs_ctl_table_header);
+
+	return relay_close(klog_channel);
+}
+
+EXPORT_SYMBOL(klog);
+EXPORT_SYMBOL(klog_raw);
+
diff --git a/fs/relayfs/relay.c b/fs/relayfs/relay.c
new file mode 100644 (file)
index 0000000..11f4636
--- /dev/null
@@ -0,0 +1,1911 @@
+/*
+ * Public API and common code for RelayFS.
+ *
+ * Please see Documentation/filesystems/relayfs.txt for API description.
+ * 
+ * Copyright (C) 2002, 2003 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
+ * Copyright (C) 1999, 2000, 2001, 2002 - Karim Yaghmour (karim@opersys.com)
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/time.h>
+#include <linux/page-flags.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/delay.h>
+
+#include <asm/io.h>
+#include <asm/current.h>
+#include <asm/uaccess.h>
+#include <asm/bitops.h>
+#include <asm/pgtable.h>
+#include <asm/relay.h>
+#include <asm/hardirq.h>
+
+#include "relay_lockless.h"
+#include "relay_locking.h"
+#include "resize.h"
+
+/* Relay channel table, indexed by channel id; guarded by rchan_table_lock */
+static struct rchan *	rchan_table[RELAY_MAX_CHANNELS];
+static rwlock_t		rchan_table_lock = RW_LOCK_UNLOCKED;
+
+/* Relay operation structs, one per scheme */
+static struct relay_ops lockless_ops = {
+	.reserve = lockless_reserve,
+	.commit = lockless_commit,
+	.get_offset = lockless_get_offset,
+	.finalize = lockless_finalize,
+	.reset = lockless_reset,
+	.reset_index = lockless_reset_index
+};
+
+/* Fallback scheme for architectures without cmpxchg (see check_attribute_flags) */
+static struct relay_ops locking_ops = {
+	.reserve = locking_reserve,
+	.commit = locking_commit,
+	.get_offset = locking_get_offset,
+	.finalize = locking_finalize,
+	.reset = locking_reset,
+	.reset_index = locking_reset_index
+};
+
+/*
+ * Low-level relayfs kernel API.  These functions should not normally be 
+ * used by clients.  See high-level kernel API below.
+ */
+
+/**
+ *	rchan_get - get channel associated with id, incrementing refcount 
+ *	@rchan_id: the channel id
+ *
+ *	Returns channel if successful, NULL otherwise.
+ */
+struct rchan *
+rchan_get(int rchan_id)
+{
+	struct rchan *rchan = NULL;
+
+	if (rchan_id >= 0 && rchan_id < RELAY_MAX_CHANNELS) {
+		/* Take the ref under the table lock so the channel can't
+		   be released between lookup and increment. */
+		read_lock(&rchan_table_lock);
+		rchan = rchan_table[rchan_id];
+		if (rchan != NULL)
+			atomic_inc(&rchan->refcount);
+		read_unlock(&rchan_table_lock);
+	}
+
+	return rchan;
+}
+
+/**
+ *	clear_readers - clear non-VFS readers
+ *	@rchan: the channel
+ *
+ *	Clear the channel pointers of all non-VFS readers open on the channel.
+ */
+static inline void
+clear_readers(struct rchan *rchan)
+{
+	struct rchan_reader *reader;
+	struct list_head *pos;
+
+	read_lock(&rchan->open_readers_lock);
+	list_for_each(pos, &rchan->open_readers) {
+		reader = list_entry(pos, struct rchan_reader, list);
+		if (reader->vfs_reader)
+			continue;	/* VFS readers keep their channel */
+		reader->rchan = NULL;
+	}
+	read_unlock(&rchan->open_readers_lock);
+}
+
+/**
+ *	rchan_alloc_id - reserve a channel id and store associated channel
+ *	@rchan: the channel
+ *
+ *	Returns channel id if successful, -1 otherwise.
+ */
+static inline int
+rchan_alloc_id(struct rchan *rchan)
+{
+	int slot;
+	int id = -1;
+
+	if (rchan == NULL)
+		return -1;
+
+	write_lock(&rchan_table_lock);
+	/* First free slot becomes the channel id; ref taken for the table. */
+	for (slot = 0; slot < RELAY_MAX_CHANNELS; slot++) {
+		if (rchan_table[slot] != NULL)
+			continue;
+		rchan_table[slot] = rchan;
+		rchan->id = id = slot;
+		atomic_inc(&rchan->refcount);
+		break;
+	}
+	write_unlock(&rchan_table_lock);
+
+	return id;
+}
+
+/**
+ *	rchan_free_id - revoke a channel id and remove associated channel
+ *	@rchan_id: the channel id
+ *
+ *	Note: does not drop the table's reference; the caller
+ *	(relay_release path) owns the channel's lifetime here.
+ */
+static inline void
+rchan_free_id(int rchan_id)
+{
+	if ((rchan_id < 0) || (rchan_id >= RELAY_MAX_CHANNELS))
+		return;
+
+	/* Removed the unused local fetch of rchan_table[rchan_id]. */
+	write_lock(&rchan_table_lock);
+	rchan_table[rchan_id] = NULL;
+	write_unlock(&rchan_table_lock);
+}
+
+/**
+ *	rchan_destroy_buf - destroy the current channel buffer
+ *	@rchan: the channel
+ */
+static inline void
+rchan_destroy_buf(struct rchan *rchan)
+{
+	/* Never free a client-supplied (init_buf) buffer. */
+	if (rchan->buf == NULL || rchan->init_buf)
+		return;
+
+	free_rchan_buf(rchan->buf,
+		       rchan->buf_page_array,
+		       rchan->buf_page_count);
+}
+
+/**
+ *	relay_release - perform end-of-buffer processing for last buffer
+ *	@rchan: the channel
+ *
+ *	Returns 0 if successful, negative otherwise.
+ *
+ *	Releases the channel buffer, destroys the channel, and removes the
+ *	relay file from the relayfs filesystem.  Should only be called from 
+ *	rchan_put().  If we're here, it means by definition refcount is 0.
+ */
+static int 
+relay_release(struct rchan *rchan)
+{
+	if (rchan == NULL)
+		return -EBADF;
+
+	/* Order: buffer, id, VFS file, then detach remaining readers. */
+	rchan_destroy_buf(rchan);
+	rchan_free_id(rchan->id);
+	relayfs_remove_file(rchan->dentry);
+	clear_readers(rchan);
+	kfree(rchan);
+
+	return 0;
+}
+
+/**
+ *	rchan_put - decrement channel refcount, releasing it if 0
+ *	@rchan: the channel
+ *
+ *	If the refcount reaches 0, the channel will be destroyed.
+ */
+void 
+rchan_put(struct rchan *rchan)
+{
+	if (atomic_dec_and_test(&rchan->refcount))
+		relay_release(rchan);
+}
+
+/**
+ *	relay_reserve -  reserve a slot in the channel buffer
+ *	@rchan: the channel
+ *	@len: the length of the slot to reserve
+ *	@ts: timestamp recorded for this reserve (scheme-dependent)
+ *	@td: the time delta between buffer start and current write, or TSC
+ *	@err: receives the result flags
+ *	@interrupting: 1 if interrupting previous, used only in locking scheme
+ *
+ *	Returns pointer to the beginning of the reserved slot, NULL if error.
+ *
+ *	The errcode value contains the result flags and is an ORed combination 
+ *	of the following:
+ *
+ *	RELAY_BUFFER_SWITCH_NONE - no buffer switch occurred
+ *	RELAY_EVENT_DISCARD_NONE - event should not be discarded
+ *	RELAY_BUFFER_SWITCH - buffer switch occurred
+ *	RELAY_EVENT_DISCARD - event should be discarded (all buffers are full)
+ *	RELAY_EVENT_TOO_LONG - event won't fit into even an empty buffer
+ *
+ *	buffer_start and buffer_end callbacks are triggered at this point
+ *	if applicable.
+ */
+char *
+relay_reserve(struct rchan *rchan,
+	      u32 len,
+	      struct timeval *ts,
+	      u32 *td,
+	      int *err,
+	      int *interrupting)
+{
+	if (rchan == NULL)
+		return NULL;
+	
+	*interrupting = 0;
+
+	/* Dispatch to the lockless or locking scheme bound at open time. */
+	return rchan->relay_ops->reserve(rchan, len, ts, td, err, interrupting);
+}
+
+
+/**
+ *     wakeup_readers - wake up VFS readers waiting on a channel
+ *     @private: the channel
+ *
+ *     This is the work function used to defer reader waking.  The
+ *     reason waking is deferred is that calling directly from commit
+ *     causes problems if you're writing from say the scheduler.
+ */
+static void 
+wakeup_readers(void *private)
+{
+       struct rchan *rchan = (struct rchan *)private;
+
+       wake_up_interruptible(&rchan->read_wait);
+}
+
+
+/**
+ *     relay_commit - commit a reserved slot in the buffer
+ *     @rchan: the channel
+ *     @from: commit the length starting here
+ *     @len: length committed
+ *     @interrupting: 1 if interrupting previous, used only in locking scheme
+ *
+ *      After the write into the reserved buffer has been complted, this
+ *      function must be called in order for the relay to determine whether 
+ *      buffers are complete and to wake up VFS readers.
+ *
+ *     delivery callback is triggered at this point if applicable.
+ */
+void
+relay_commit(struct rchan *rchan,
+            char *from,
+            u32 len,
+            int reserve_code,
+            int interrupting)
+{
+       int deliver;
+
+       if (rchan == NULL)
+               return;
+       
+       deliver = packet_delivery(rchan) || 
+                  (reserve_code & RELAY_BUFFER_SWITCH);
+
+       rchan->relay_ops->commit(rchan, from, len, deliver, interrupting);
+
+       /* The params are always the same, so no worry about re-queuing */
+       if (deliver &&  waitqueue_active(&rchan->read_wait)) {
+               PREPARE_WORK(&rchan->wake_readers, wakeup_readers, rchan);
+               schedule_delayed_work(&rchan->wake_readers, 1);
+       }
+}
+
+/**
+ *	relay_get_offset - get current and max channel buffer offsets
+ *	@rchan: the channel
+ *	@max_offset: receives the maximum channel offset
+ *
+ *	Returns the current channel buffer offset; the maximum is
+ *	written through @max_offset by the scheme implementation.
+ */
+u32
+relay_get_offset(struct rchan *rchan, u32 *max_offset)
+{
+	return rchan->relay_ops->get_offset(rchan, max_offset);
+}
+
+/**
+ *	reset_index - try once to reset the current channel index
+ *	@rchan: the channel
+ *	@old_index: the index read before reset
+ *
+ *	Attempts to reset the channel index to 0.  It tries once, and
+ *	if it fails, returns negative, 0 otherwise.
+ */
+int
+reset_index(struct rchan *rchan, u32 old_index)
+{
+	return rchan->relay_ops->reset_index(rchan, old_index);
+}
+
+/*
+ * close() vm_op implementation for relayfs file mapping.
+ * Drops the mapped count and notifies the kernel client of the unmap.
+ */
+static void
+relay_file_mmap_close(struct vm_area_struct *vma)
+{
+	struct file *filp = vma->vm_file;
+	struct rchan_reader *reader = (struct rchan_reader *)filp->private_data;
+	struct rchan *rchan = reader->rchan;
+
+	atomic_dec(&rchan->mapped);
+
+	rchan->callbacks->fileop_notify(rchan->id, filp,
+					RELAY_FILE_UNMAP);
+}
+
+/*
+ * vm_ops for relay file mappings.  Only close is needed, to keep
+ * the channel's mapped count accurate.
+ */
+static struct vm_operations_struct relay_file_mmap_ops = {
+	.close = relay_file_mmap_close
+};
+
+/* \begin{Code inspired from BTTV driver} */
+/* Translate a vmalloc'd kernel virtual address to a physical address. */
+static inline unsigned long 
+kvirt_to_pa(unsigned long adr)
+{
+	unsigned long kva;
+
+	/* page_address gives the direct-mapped base; re-add the page offset */
+	kva = (unsigned long) page_address(vmalloc_to_page((void *) adr));
+	kva |= adr & (PAGE_SIZE - 1);
+
+	return __pa(kva);
+}
+
+/*
+ * relay_mmap_region - remap a vmalloc'd channel buffer into a vma,
+ * one page at a time.  Returns 0 on success, -EAGAIN if remap fails.
+ */
+static int
+relay_mmap_region(struct vm_area_struct *vma,
+		  const char *adr,
+		  const char *start_pos,
+		  unsigned long size)
+{
+	unsigned long start = (unsigned long) adr;
+	unsigned long page, pos;
+
+	pos = (unsigned long) start_pos;
+
+	while (size > 0) {
+		page = kvirt_to_pa(pos);
+		if (remap_page_range(vma, start, page, PAGE_SIZE, PAGE_SHARED))
+			return -EAGAIN;
+		start += PAGE_SIZE;
+		pos += PAGE_SIZE;
+		/* Guard against unsigned underflow if size is not a
+		   multiple of PAGE_SIZE (would loop ~forever). */
+		if (size < PAGE_SIZE)
+			break;
+		size -= PAGE_SIZE;
+	}
+
+	return 0;
+}
+/* \end{Code inspired from BTTV driver} */
+
+/**
+ *	__relay_mmap_buffer - mmap channel buffer to process address space
+ *	@rchan: the relay channel
+ *	@vma: vm_area_struct describing memory to be mapped
+ *
+ *	Returns:
+ *	0 if ok
+ *	-EAGAIN, when remap failed
+ *	-EBADF, if channel is NULL
+ *	-EPERM, if buffer was client-supplied (init_buf)
+ *	-EINVAL, invalid requested length
+ *
+ *	Caller should already have grabbed mmap_sem.
+ */
+int 
+__relay_mmap_buffer(struct rchan *rchan,
+		    struct vm_area_struct *vma)
+{
+	int err = 0;
+	unsigned long length = vma->vm_end - vma->vm_start;
+	struct file *filp = vma->vm_file;
+
+	if (rchan == NULL) {
+		err = -EBADF;
+		goto exit;
+	}
+
+	/* Client-supplied buffers are not remappable. */
+	if (rchan->init_buf) {
+		err = -EPERM;
+		goto exit;
+	}
+	
+	/* The whole buffer must be mapped, nothing more, nothing less. */
+	if (length != (unsigned long)rchan->alloc_size) {
+		err = -EINVAL;
+		goto exit;
+	}
+
+	err = relay_mmap_region(vma,
+				(char *)vma->vm_start,
+				rchan->buf,
+				rchan->alloc_size);
+
+	if (err == 0) {
+		vma->vm_ops = &relay_file_mmap_ops;
+		err = rchan->callbacks->fileop_notify(rchan->id, filp,
+						      RELAY_FILE_MAP);
+		if (err == 0)
+			atomic_inc(&rchan->mapped);
+	}
+exit:	
+	return err;
+}
+
+/*
+ * High-level relayfs kernel API.  See Documentation/filesystems/relafys.txt.
+ */
+
+/*
+ * rchan_callback implementations defining default channel behavior.  Used
+ * in place of corresponding NULL values in client callback struct.
+ */
+
+/*
+ * buffer_end() default callback.  Does nothing.
+ */
+static int 
+buffer_end_default_callback(int rchan_id,
+			    char *current_write_pos,
+			    char *end_of_buffer,
+			    struct timeval end_time,
+			    u32 end_tsc,
+			    int using_tsc) 
+{
+	return 0;
+}
+
+/*
+ * buffer_start() default callback.  Does nothing.
+ */
+static int 
+buffer_start_default_callback(int rchan_id,
+			      char *current_write_pos,
+			      u32 buffer_id,
+			      struct timeval start_time,
+			      u32 start_tsc,
+			      int using_tsc)
+{
+	return 0;
+}
+
+/*
+ * deliver() default callback.  Does nothing.
+ */
+static void 
+deliver_default_callback(int rchan_id, char *from, u32 len)
+{
+}
+
+/*
+ * user_deliver() default callback.  Does nothing.
+ */
+static void 
+user_deliver_default_callback(int rchan_id, char *from, u32 len)
+{
+}
+
+/*
+ * needs_resize() default callback.  Does nothing.
+ */
+static void
+needs_resize_default_callback(int rchan_id,
+			      int resize_type,
+			      u32 suggested_buf_size,
+			      u32 suggested_n_bufs)
+{
+}
+
+/*
+ * fileop_notify() default callback.  Does nothing, allows the fileop.
+ */
+static int
+fileop_notify_default_callback(int rchan_id,
+			       struct file *filp,
+			       enum relay_fileop fileop)
+{
+	return 0;
+}
+
+/*
+ * ioctl() default callback.  Does nothing.
+ */
+static int
+ioctl_default_callback(int rchan_id,
+		       unsigned int cmd,
+		       unsigned long arg)
+{
+	return 0;
+}
+
+/* relay channel default callbacks; any NULL client callback falls back here */
+static struct rchan_callbacks default_channel_callbacks = {
+	.buffer_start = buffer_start_default_callback,
+	.buffer_end = buffer_end_default_callback,
+	.deliver = deliver_default_callback,
+	.user_deliver = user_deliver_default_callback,
+	.needs_resize = needs_resize_default_callback,
+	.fileop_notify = fileop_notify_default_callback,
+	.ioctl = ioctl_default_callback,
+};
+
+/**
+ *	check_attribute_flags - check sanity of channel attributes
+ *	@attribute_flags: channel attributes (in/out - may be adjusted here)
+ *	@resizeable: 1 if true
+ *
+ *	Returns 0 if successful, negative otherwise.
+ */
+static int
+check_attribute_flags(u32 *attribute_flags, int resizeable)
+{
+	u32 flags = *attribute_flags;
+	
+	if (!(flags & RELAY_DELIVERY_BULK) && !(flags & RELAY_DELIVERY_PACKET))
+		return -EINVAL; /* Delivery mode must be specified */
+	
+	if (!(flags & RELAY_USAGE_SMP) && !(flags & RELAY_USAGE_GLOBAL))
+		return -EINVAL; /* Usage must be specified */
+	
+	if (resizeable) {  /* Resizeable can never be continuous */
+		*attribute_flags &= ~RELAY_MODE_CONTINUOUS;
+		*attribute_flags |= RELAY_MODE_NO_OVERWRITE;
+	}
+	
+	if ((flags & RELAY_MODE_CONTINUOUS) &&
+	    (flags & RELAY_MODE_NO_OVERWRITE))
+		return -EINVAL; /* Can't have it both ways */
+	
+	if (!(flags & RELAY_MODE_CONTINUOUS) &&
+	    !(flags & RELAY_MODE_NO_OVERWRITE))
+		*attribute_flags |= RELAY_MODE_CONTINUOUS; /* Default to continuous */
+	
+	/* Prefer lockless when the arch has cmpxchg, else fall back to locking */
+	if (!(flags & RELAY_SCHEME_ANY))
+		return -EINVAL; /* One or both must be specified */
+	else if (flags & RELAY_SCHEME_LOCKLESS) {
+		if (have_cmpxchg())
+			*attribute_flags &= ~RELAY_SCHEME_LOCKING;
+		else if (flags & RELAY_SCHEME_LOCKING)
+			*attribute_flags &= ~RELAY_SCHEME_LOCKLESS;
+		else
+			return -EINVAL; /* Locking scheme not an alternative */
+	}
+	
+	/* Prefer TSC timestamps when available, else gettimeofday */
+	if (!(flags & RELAY_TIMESTAMP_ANY))
+		return -EINVAL; /* One or both must be specified */
+	else if (flags & RELAY_TIMESTAMP_TSC) {
+		if (have_tsc())
+			*attribute_flags &= ~RELAY_TIMESTAMP_GETTIMEOFDAY;
+		else if (flags & RELAY_TIMESTAMP_GETTIMEOFDAY)
+			*attribute_flags &= ~RELAY_TIMESTAMP_TSC;
+		else
+			return -EINVAL; /* gettimeofday not an alternative */
+	}
+
+	return 0;
+}
+
+/*
+ * High-level API functions.
+ */
+
+/**
+ *     __relay_reset - internal reset function
+ *     @rchan: the channel
+ *     @init: 1 if this is a first-time channel initialization
+ *
+ *     See relay_reset for description of effect.
+ */
+void
+__relay_reset(struct rchan *rchan, int init)
+{
+       int i;
+       
+       if (init) {
+               /* First-time-only state: wait queues, locks and the
+                  refcount are set up once so later resets don't
+                  invalidate them for existing users. */
+               rchan->version = RELAYFS_CHANNEL_VERSION;
+               init_MUTEX(&rchan->resize_sem);
+               init_waitqueue_head(&rchan->read_wait);
+               init_waitqueue_head(&rchan->write_wait);
+               atomic_set(&rchan->refcount, 0);
+               INIT_LIST_HEAD(&rchan->open_readers);
+               rchan->open_readers_lock = RW_LOCK_UNLOCKED;
+       }
+       
+       /* Reset buffer positions and produced/consumed accounting. */
+       rchan->buf_id = rchan->buf_idx = 0;
+       atomic_set(&rchan->suspended, 0);
+       atomic_set(&rchan->mapped, 0);
+       rchan->half_switch = 0;
+       rchan->bufs_produced = 0;
+       rchan->bufs_consumed = 0;
+       rchan->bytes_consumed = 0;
+       rchan->initialized = 0;
+       rchan->finalized = 0;
+       /* Clear all resize bookkeeping, including pending page arrays. */
+       rchan->resize_min = rchan->resize_max = 0;
+       rchan->resizing = 0;
+       rchan->replace_buffer = 0;
+       rchan->resize_buf = NULL;
+       rchan->resize_buf_size = 0;
+       rchan->resize_alloc_size = 0;
+       rchan->resize_n_bufs = 0;
+       rchan->resize_err = 0;
+       rchan->resize_failures = 0;
+       rchan->resize_order = 0;
+
+       rchan->expand_page_array = NULL;
+       rchan->expand_page_count = 0;
+       rchan->shrink_page_array = NULL;
+       rchan->shrink_page_count = 0;
+       rchan->resize_page_array = NULL;
+       rchan->resize_page_count = 0;
+       rchan->old_buf_page_array = NULL;
+       rchan->expand_buf_id = 0;
+
+       /* Work structs get their real handlers (wakeup_readers /
+          wakeup_writers) via PREPARE_WORK at the point of scheduling. */
+       INIT_WORK(&rchan->wake_readers, NULL, NULL);
+       INIT_WORK(&rchan->wake_writers, NULL, NULL);
+
+       for (i = 0; i < RELAY_MAX_BUFS; i++)
+               rchan->unused_bytes[i] = 0;
+       
+       /* Let the locking/lockless scheme reset its own state too. */
+       rchan->relay_ops->reset(rchan, init);
+}
+
+/**
+ *     relay_reset - reset the channel
+ *     @rchan: the channel
+ *
+ *     Returns 0 if successful, negative if not.
+ *
+ *     This has the effect of erasing all data from the buffer and
+ *     restarting the channel in its initial state.  The buffer itself
+ *     is not freed, so any mappings are still in effect.
+ *
+ *     NOTE: Care should be taken that the channnel isn't actually
+ *     being used by anything when this call is made.
+ */
+int
+relay_reset(int rchan_id)
+{
+       /* Pin the channel for the duration of the reset. */
+       struct rchan *rchan = rchan_get(rchan_id);
+
+       if (!rchan)
+               return -EBADF;
+
+       __relay_reset(rchan, 0);
+       update_readers_consumed(rchan, 0, 0);
+       rchan_put(rchan);
+
+       return 0;
+}
+
+/**
+ *     check_init_buf - check the sanity of init_buf, if present
+ *     @init_buf: the initbuf
+ *     @init_buf_size: the total initbuf size
+ *     @bufsize: the channel's sub-buffer size
+ *     @nbufs: the number of sub-buffers in the channel
+ *
+ *     Returns 0 if ok, negative otherwise.
+ */
+static int
+check_init_buf(char *init_buf, u32 init_buf_size, u32 bufsize, u32 nbufs)
+{
+       /* No caller-supplied buffer: nothing to validate. */
+       if (!init_buf)
+               return 0;
+
+       /* A supplied buffer must span more than one sub-buffer and be
+          exactly bufsize * nbufs bytes. */
+       if (nbufs == 1 || bufsize * nbufs != init_buf_size)
+               return -EINVAL;
+
+       return 0;
+}
+
+/**
+ *     rchan_create_buf - allocate the initial channel buffer
+ *     @rchan: the channel
+ *     @size_alloc: the total size of the channel buffer
+ *
+ *     Returns 0 if successful, negative otherwise.
+ */
+static inline int
+rchan_create_buf(struct rchan *rchan, int size_alloc)
+{
+       struct page **page_array;
+       int page_count;
+
+       /* alloc_rchan_buf also returns the page array backing the buffer;
+          both are stashed on the channel (consumers not visible here --
+          presumably mmap/resize paths; confirm). */
+       if ((rchan->buf = (char *)alloc_rchan_buf(size_alloc, &page_array, &page_count)) == NULL) {
+               rchan->buf_page_array = NULL;
+               rchan->buf_page_count = 0;
+               return -ENOMEM;
+       }
+
+       rchan->buf_page_array = page_array;
+       rchan->buf_page_count = page_count;
+
+       return 0;
+}
+
+/**
+ *     rchan_create - allocate and initialize a channel, including buffer
+ *     @chanpath: path specifying the relayfs channel file to create
+ *     @bufsize: the size of the sub-buffers within the channel buffer
+ *     @nbufs: the number of sub-buffers within the channel buffer
+ *     @rchan_flags: flags specifying buffer attributes
+ *     @err: err code
+ *
+ *     Returns channel if successful, NULL otherwise, err receives errcode.
+ *
+ *     Allocates a struct rchan representing a relay channel, according
+ *     to the attributes passed in via rchan_flags.  Does some basic sanity
+ *     checking but doesn't try to do anything smart.  In particular, the
+ *     number of buffers must be a power of 2, and if the lockless scheme
+ *     is being used, the sub-buffer size must also be a power of 2.  The
+ *     locking scheme can use buffers of any size.
+ */
+static struct rchan *
+rchan_create(const char *chanpath, 
+            int bufsize, 
+            int nbufs, 
+            u32 rchan_flags,
+            char *init_buf,
+            u32 init_buf_size,
+            int *err)
+{
+       int size_alloc;
+       struct rchan *rchan = NULL;
+
+       *err = 0;
+
+       rchan = (struct rchan *)kmalloc(sizeof(struct rchan), GFP_KERNEL);
+       if (rchan == NULL) {
+               *err = -ENOMEM;
+               return NULL;
+       }
+       rchan->buf = rchan->init_buf = NULL;
+
+       *err = check_init_buf(init_buf, init_buf_size, bufsize, nbufs);
+       if (*err)
+               goto exit;
+       
+       /* Single sub-buffer channels skip the power-of-2/range checks
+          below and are allocated exactly as requested. */
+       if (nbufs == 1 && bufsize) {
+               rchan->n_bufs = nbufs;
+               rchan->buf_size = bufsize;
+               size_alloc = bufsize;
+               goto alloc;
+       }
+       
+       /* nbufs must be a power of 2 (hweight32 == 1) within range; the
+          lockless scheme additionally needs a power-of-2 bufsize. */
+       if (bufsize <= 0 ||
+           (rchan_flags & RELAY_SCHEME_LOCKLESS && hweight32(bufsize) != 1) ||
+           hweight32(nbufs) != 1 ||
+           nbufs < RELAY_MIN_BUFS ||
+           nbufs > RELAY_MAX_BUFS) {
+               *err = -EINVAL;
+               goto exit;
+       }
+
+       size_alloc = FIX_SIZE(bufsize * nbufs);
+       if (size_alloc > RELAY_MAX_BUF_SIZE) {
+               *err = -EINVAL;
+               goto exit;
+       }
+       rchan->n_bufs = nbufs;
+       rchan->buf_size = bufsize;
+
+       /* Precompute the offset/bufno bit fields used by the lockless
+          scheme's index arithmetic. */
+       if (rchan_flags & RELAY_SCHEME_LOCKLESS) {
+               offset_bits(rchan) = ffs(bufsize) - 1;
+               offset_mask(rchan) =  RELAY_BUF_OFFSET_MASK(offset_bits(rchan));
+               bufno_bits(rchan) = ffs(nbufs) - 1;
+       }
+alloc:
+       if (rchan_alloc_id(rchan) == -1) {
+               *err = -ENOMEM;
+               goto exit;
+       }
+
+       /* Use the caller-provided initial buffer if any; otherwise
+          allocate the channel buffer now. */
+       if (init_buf == NULL) {
+               *err = rchan_create_buf(rchan, size_alloc);
+               if (*err) {
+                       rchan_free_id(rchan->id);
+                       goto exit;
+               }
+       } else
+               rchan->buf = rchan->init_buf = init_buf;
+       
+       rchan->alloc_size = size_alloc;
+
+       if (rchan_flags & RELAY_SCHEME_LOCKLESS)
+               rchan->relay_ops = &lockless_ops;
+       else
+               rchan->relay_ops = &locking_ops;
+
+exit:
+       /* Single cleanup point: on any error the struct is freed and
+          NULL returned; *err carries the reason. */
+       if (*err) {
+               kfree(rchan);
+               rchan = NULL;
+       }
+
+       return rchan;
+}
+
+
+/* Scratch buffer for path components in rchan_create_dir().
+   NOTE(review): shared static with no locking -- concurrent channel
+   creation would race on it; confirm callers serialize creation. */
+static char tmpname[NAME_MAX];
+
+/**
+ *     rchan_create_dir - create directory for file
+ *     @chanpath: path to file, including filename
+ *     @residual: filename remaining after parse
+ *     @topdir: the directory filename should be created in
+ *
+ *     Returns 0 if successful, negative otherwise.
+ *
+ *     Inspired by xlate_proc_name() in procfs.  Given a file path which
+ *     includes the filename, creates any and all directories necessary 
+ *     to create the file.
+ */
+static int 
+rchan_create_dir(const char * chanpath, 
+                const char **residual, 
+                struct dentry **topdir)
+{
+       const char *cp = chanpath, *next;
+       struct dentry *parent = NULL;
+       int len, err = 0;
+       
+       while (1) {
+               next = strchr(cp, '/');
+               if (!next)
+                       break;
+
+               len = next - cp;
+
+               /* Bound the copy: tmpname is NAME_MAX bytes and
+                  'tmpname[len] = 0' below writes one byte past the
+                  copied component, so an overlong path component
+                  previously overflowed the buffer. */
+               if (len >= NAME_MAX)
+                       return -ENAMETOOLONG;
+
+               strncpy(tmpname, cp, len);
+               tmpname[len] = '\0';
+               /* -EEXIST is fine: the directory was created earlier. */
+               err = relayfs_create_dir(tmpname, parent, &parent);
+               if (err && (err != -EEXIST))
+                       return err;
+               cp += len + 1;
+       }
+
+       /* Hand back the final component and its parent directory. */
+       *residual = cp;
+       *topdir = parent;
+
+       return err;
+}
+
+/**
+ *     rchan_create_file - create file, including parent directories
+ *     @chanpath: path to file, including filename
+ *     @dentry: result dentry
+ *     @data: data to associate with the file
+ *     @mode: the perms to be given to the relayfs file
+ *
+ *     Returns 0 if successful, negative otherwise.
+ */
+static int 
+rchan_create_file(const char * chanpath, 
+                 struct dentry **dentry, 
+                 struct rchan * data,
+                 int mode)
+{
+       int err;
+       const char * fname;
+       struct dentry *topdir;
+
+       /* -EEXIST from directory creation is fine: reuse it. */
+       err = rchan_create_dir(chanpath, &fname, &topdir);
+       if (err && (err != -EEXIST))
+               return err;
+
+       err = relayfs_create_file(fname, topdir, dentry, (void *)data, mode);
+
+       return err;
+}
+
+/**
+ *     relay_open - create a new file/channel buffer in relayfs
+ *     @chanpath: name of file to create, including path
+ *     @bufsize: size of sub-buffers
+ *     @nbufs: number of sub-buffers
+ *     @flags: channel attributes
+ *     @callbacks: client callback functions
+ *     @start_reserve: number of bytes to reserve at start of each sub-buffer
+ *     @end_reserve: number of bytes to reserve at end of each sub-buffer
+ *     @rchan_start_reserve: additional reserve at start of first sub-buffer
+ *     @resize_min: minimum total buffer size, if set
+ *     @resize_max: maximum total buffer size, if set
+ *     @mode: the perms to be given to the relayfs file, 0 to accept defaults
+ *     @init_buf: initial memory buffer to start out with, NULL if N/A
+ *     @init_buf_size: initial memory buffer size to start out with, 0 if N/A
+ *
+ *     Returns channel id if successful, negative otherwise.
+ *
+ *     Creates a relay channel using the sizes and attributes specified.
+ *     The default permissions, used if mode == 0 are S_IRUSR | S_IWUSR.  See
+ *     Documentation/filesystems/relayfs.txt for details.
+ */
+int
+relay_open(const char *chanpath,
+          int bufsize,
+          int nbufs,
+          u32 flags,
+          struct rchan_callbacks *channel_callbacks,
+          u32 start_reserve,
+          u32 end_reserve,
+          u32 rchan_start_reserve,
+          u32 resize_min,
+          u32 resize_max,
+          int mode,
+          char *init_buf,
+          u32 init_buf_size)
+{
+       int err;
+       struct rchan *rchan;
+       struct dentry *dentry;
+       struct rchan_callbacks *callbacks = NULL;
+
+       if (chanpath == NULL)
+               return -EINVAL;
+
+       /* Single sub-buffer channels bypass attribute sanity checking
+          (matching rchan_create's special case for nbufs == 1). */
+       if (nbufs != 1) {
+               err = check_attribute_flags(&flags, resize_min ? 1 : 0);
+               if (err)
+                       return err;
+       }
+
+       rchan = rchan_create(chanpath, bufsize, nbufs, flags, init_buf, init_buf_size, &err);
+
+       if (err < 0)
+               return err;
+
+       /* Create file in fs */
+       if ((err = rchan_create_file(chanpath, &dentry, rchan, mode)) < 0) {
+               rchan_destroy_buf(rchan);
+               rchan_free_id(rchan->id);
+               kfree(rchan);
+               return err;
+       }
+
+       rchan->dentry = dentry;
+
+       if (channel_callbacks == NULL)
+               callbacks = &default_channel_callbacks;
+       else
+               callbacks = channel_callbacks;
+
+       /* NOTE(review): this fills NULL members in the *caller's* struct
+          in place -- a callbacks struct shared across channels would be
+          mutated; confirm callers pass per-channel structs. */
+       if (callbacks->buffer_end == NULL)
+               callbacks->buffer_end = buffer_end_default_callback;
+       if (callbacks->buffer_start == NULL)
+               callbacks->buffer_start = buffer_start_default_callback;
+       if (callbacks->deliver == NULL)
+               callbacks->deliver = deliver_default_callback;
+       if (callbacks->user_deliver == NULL)
+               callbacks->user_deliver = user_deliver_default_callback;
+       if (callbacks->needs_resize == NULL)
+               callbacks->needs_resize = needs_resize_default_callback;
+       if (callbacks->fileop_notify == NULL)
+               callbacks->fileop_notify = fileop_notify_default_callback;
+       if (callbacks->ioctl == NULL)
+               callbacks->ioctl = ioctl_default_callback;
+       rchan->callbacks = callbacks;
+
+       /* Just to let the client know the sizes used */
+       rchan->callbacks->needs_resize(rchan->id,
+                                      RELAY_RESIZE_REPLACED,
+                                      rchan->buf_size,
+                                      rchan->n_bufs);
+
+       rchan->flags = flags;
+       rchan->start_reserve = start_reserve;
+       rchan->end_reserve = end_reserve;
+       rchan->rchan_start_reserve = rchan_start_reserve;
+
+       __relay_reset(rchan, 1);
+
+       /* Auto-shrink only runs when both bounds are set and sane;
+          relay_close() uses resize_min != 0 to know the timer exists. */
+       if (resize_min > 0 && resize_max > 0 && 
+          resize_max < RELAY_MAX_TOTAL_BUF_SIZE) {
+               rchan->resize_min = resize_min;
+               rchan->resize_max = resize_max;
+               init_shrink_timer(rchan);
+       }
+
+       /* Hold a reference for the lifetime of the open channel;
+          dropped in relay_close(). */
+       rchan_get(rchan->id);
+
+       return rchan->id;
+}
+
+/**
+ *     relay_discard_init_buf - alloc channel buffer and copy init_buf into it
+ *     @rchan_id: the channel id
+ *
+ *     Returns 0 if successful, negative otherwise.
+ *
+ *     NOTE: May sleep.  Should also be called only when the channel isn't
+ *     actively being written into.
+ */
+int
+relay_discard_init_buf(int rchan_id)
+{
+       struct rchan *rchan;
+       int err = 0;
+       
+       rchan = rchan_get(rchan_id);
+       if (rchan == NULL)
+               return -EBADF;
+
+       /* Only meaningful for channels started on a caller-supplied
+          initial buffer. */
+       if (rchan->init_buf == NULL) {
+               err = -EINVAL;
+               goto out;
+       }
+       
+       /* Allocate a real channel buffer of the same total size. */
+       err = rchan_create_buf(rchan, rchan->alloc_size);
+       if (err)
+               goto out;
+       
+       /* Preserve existing contents, then drop the reference to the
+          init buffer -- it is not freed here; the caller owns it. */
+       memcpy(rchan->buf, rchan->init_buf, rchan->n_bufs * rchan->buf_size);
+       rchan->init_buf = NULL;
+out:
+       rchan_put(rchan);
+       
+       return err;
+}
+
+/**
+ *     relay_finalize - perform end-of-buffer processing for last buffer
+ *     @rchan_id: the channel id
+ *
+ *     Returns 0 if successful, negative otherwise.
+ *
+ *     Idempotent: finalization of the last sub-buffer happens only once.
+ */
+static int 
+relay_finalize(int rchan_id)
+{
+       struct rchan *rchan = rchan_get(rchan_id);
+       if (rchan == NULL)
+               return -EBADF;
+
+       if (rchan->finalized == 0) {
+               rchan->relay_ops->finalize(rchan);
+               rchan->finalized = 1;
+       }
+
+       /* Wake blocked readers so they can observe the finalized state;
+          deferred to a work item rather than woken inline. */
+       if (waitqueue_active(&rchan->read_wait)) {
+               PREPARE_WORK(&rchan->wake_readers, wakeup_readers, rchan);
+               schedule_delayed_work(&rchan->wake_readers, 1);
+       }
+
+       rchan_put(rchan);
+
+       return 0;
+}
+
+/**
+ *     restore_callbacks - restore default channel callbacks
+ *     @rchan: the channel
+ *
+ *     Restore callbacks to the default versions.
+ */
+static inline void
+restore_callbacks(struct rchan *rchan)
+{
+       /* Unconditional store: assigning the default pointer when it is
+          already in place is a harmless no-op, so no compare is needed. */
+       rchan->callbacks = &default_channel_callbacks;
+}
+
+/**
+ *     relay_close - close the channel
+ *     @rchan_id: relay channel id
+ *     
+ *     Finalizes the last sub-buffer and marks the channel as finalized.
+ *     The channel buffer and channel data structure are then freed
+ *     automatically when the last reference to the channel is given up.
+ */
+int 
+relay_close(int rchan_id)
+{
+       int err;
+       struct rchan *rchan;
+
+       if ((rchan_id < 0) || (rchan_id >= RELAY_MAX_CHANNELS))
+               return -EBADF;
+
+       err = relay_finalize(rchan_id);
+
+       if (!err) {
+               read_lock(&rchan_table_lock);
+               rchan = rchan_table[rchan_id];
+               read_unlock(&rchan_table_lock);
+
+               if (rchan) {
+                       restore_callbacks(rchan);
+                       /* resize_min != 0 implies relay_open() started
+                          the auto-shrink timer; stop it now. */
+                       if (rchan->resize_min)
+                               del_timer(&rchan->shrink_timer);
+                       /* Drop the reference taken in relay_open(); the
+                          channel is freed when the last ref goes away. */
+                       rchan_put(rchan);
+               }
+       }
+       
+       return err;
+}
+
+/**
+ *     relay_write - reserve a slot in the channel and write data into it
+ *     @rchan_id: relay channel id
+ *     @data_ptr: data to be written into reserved slot
+ *     @count: number of bytes to write
+ *     @td_offset: optional offset where time delta should be written
+ *     @wrote_pos: optional ptr returning buf pos written to, ignored if NULL 
+ *
+ *     Returns the number of bytes written, 0 or negative on failure.
+ *
+ *     Reserves space in the channel and writes count bytes of data_ptr
+ *     to it.  Automatically performs any necessary locking, depending
+ *     on the scheme and SMP usage in effect (no locking is done for the
+ *     lockless scheme regardless of usage). 
+ *
+ *     If td_offset is >= 0, the internal time delta calculated when
+ *     slot was reserved will be written at that offset.
+ *
+ *     If wrote_pos is non-NULL, it will receive the location the data
+ *     was written to, which may be needed for some applications but is not
+ *     normally interesting.
+ */
+int
+relay_write(int rchan_id, 
+           const void *data_ptr, 
+           size_t count,
+           int td_offset,
+           void **wrote_pos)
+{
+       unsigned long flags;
+       char *reserved, *write_pos;
+       int bytes_written = 0;
+       int reserve_code, interrupting;
+       struct timeval ts;
+       u32 td;
+       struct rchan *rchan;
+       
+       rchan = rchan_get(rchan_id);
+       if (rchan == NULL)
+               return -EBADF;
+
+       relay_lock_channel(rchan, flags); /* nop for lockless */
+
+       write_pos = reserved = relay_reserve(rchan, count, &ts, &td, 
+                                            &reserve_code, &interrupting);
+
+       if (reserved != NULL) {
+               relay_write_direct(write_pos, data_ptr, count);
+               /* Write the time delta only if it fits entirely inside
+                  the reserved slot.  The old test
+                  'td_offset < count - sizeof(td)' underflowed in size_t
+                  arithmetic when count < sizeof(td), allowing a write
+                  past the slot, and also wrongly rejected the last
+                  valid offset (count - sizeof(td)). */
+               if ((td_offset >= 0) &&
+                   ((size_t)td_offset + sizeof(td) <= count))
+                       *((u32 *)(reserved + td_offset)) = td;
+               bytes_written = count;
+       } else if (reserve_code == RELAY_WRITE_TOO_LONG)
+               bytes_written = -EINVAL;
+
+       if (bytes_written > 0)
+               relay_commit(rchan, reserved, bytes_written, reserve_code, interrupting);
+
+       relay_unlock_channel(rchan, flags); /* nop for lockless */
+
+       rchan_put(rchan);
+
+       /* Report the written position even on failure (NULL then). */
+       if (wrote_pos)
+               *wrote_pos = reserved;
+       
+       return bytes_written;
+}
+
+/**
+ *     wakeup_writers - wake up VFS writers waiting on a channel
+ *     @private: the channel
+ *
+ *     This is the work function used to defer writer waking.  The
+ *     reason waking is deferred is that calling directly from 
+ *     buffers_consumed causes problems if you're writing from say 
+ *     the scheduler.
+ */
+static void 
+wakeup_writers(void *private)
+{
+       /* Deferred-work handler: private is the channel (see
+          __relay_buffers_consumed). */
+       wake_up_interruptible(&((struct rchan *)private)->write_wait);
+}
+
+
+/**
+ *     __relay_buffers_consumed - internal version of relay_buffers_consumed
+ *     @rchan: the relay channel
+ *     @bufs_consumed: number of buffers to add to current count for channel
+ *     
+ *     Internal - updates the channel's consumed buffer count.
+ */
+static void
+__relay_buffers_consumed(struct rchan *rchan, u32 bufs_consumed)
+{
+       rchan->bufs_consumed += bufs_consumed;
+       
+       /* Consumed can never run ahead of produced. */
+       if (rchan->bufs_consumed > rchan->bufs_produced)
+               rchan->bufs_consumed = rchan->bufs_produced;
+       
+       atomic_set(&rchan->suspended, 0);
+
+       /* Wake writers from a work item; waking directly here causes
+          problems when called from e.g. the scheduler (see the
+          wakeup_writers comment). */
+       PREPARE_WORK(&rchan->wake_writers, wakeup_writers, rchan);
+       schedule_delayed_work(&rchan->wake_writers, 1);
+}
+
+/**
+ *     __reader_buffers_consumed - update reader/channel consumed buffer count
+ *     @reader: channel reader
+ *     @bufs_consumed: number of buffers to add to current count for channel
+ *     
+ *     Internal - updates the reader's consumed buffer count.  If the reader's
+ *     resulting total is greater than the channel's, update the channel's.
+*/
+static void
+__reader_buffers_consumed(struct rchan_reader *reader, u32 bufs_consumed)
+{
+       reader->bufs_consumed += bufs_consumed;
+       
+       /* NOTE(review): when the reader pulls ahead, the full delta is
+          applied to the channel rather than the overshoot difference;
+          the channel count is clamped to bufs_produced downstream, but
+          confirm this cannot over-advance it. */
+       if (reader->bufs_consumed > reader->rchan->bufs_consumed)
+               __relay_buffers_consumed(reader->rchan, bufs_consumed);
+}
+
+/**
+ *     relay_buffers_consumed - add to the # buffers consumed for the channel
+ *     @reader: channel reader
+ *     @bufs_consumed: number of buffers to add to current count for channel
+ *     
+ *     Adds to the channel's consumed buffer count.  buffers_consumed should
+ *     be the number of buffers newly consumed, not the total number consumed.
+ *
+ *     NOTE: kernel clients don't need to call this function if the reader
+ *     is auto-consuming or the channel is MODE_CONTINUOUS.
+ */
+void 
+relay_buffers_consumed(struct rchan_reader *reader, u32 bufs_consumed)
+{
+       /* Tolerate a NULL or unattached reader: nothing to account. */
+       if (!reader || !reader->rchan)
+               return;
+
+       __reader_buffers_consumed(reader, bufs_consumed);
+}
+
+/**
+ *     __relay_bytes_consumed - internal version of relay_bytes_consumed 
+ *     @rchan: the relay channel
+ *     @bytes_consumed: number of bytes to add to current count for channel
+ *     @read_offset: where the bytes were consumed from
+ *     
+ *     Internal - updates the channel's consumed count.
+*/
+static void
+__relay_bytes_consumed(struct rchan *rchan, u32 bytes_consumed, u32 read_offset)
+{
+       u32 consuming_idx;
+       u32 unused;
+
+       /* Sub-buffer the bytes came from, clamped to the last one. */
+       consuming_idx = read_offset / rchan->buf_size;
+
+       if (consuming_idx >= rchan->n_bufs)
+               consuming_idx = rchan->n_bufs - 1;
+       rchan->bytes_consumed += bytes_consumed;
+
+       unused = rchan->unused_bytes[consuming_idx];
+       
+       /* Once consumed bytes plus that sub-buffer's unused padding
+          cover a whole sub-buffer, count one buffer consumed and
+          restart the byte count. */
+       if (rchan->bytes_consumed + unused >= rchan->buf_size) {
+               __relay_buffers_consumed(rchan, 1);
+               rchan->bytes_consumed = 0;
+       }
+}
+
+/**
+ *     __reader_bytes_consumed - update reader/channel consumed count
+ *     @reader: channel reader
+ *     @bytes_consumed: number of bytes to add to current count for channel
+ *     @read_offset: where the bytes were consumed from
+ *     
+ *     Internal - updates the reader's consumed count.  If the reader's
+ *     resulting total is greater than the channel's, update the channel's.
+*/
+static void
+__reader_bytes_consumed(struct rchan_reader *reader, u32 bytes_consumed, u32 read_offset)
+{
+       u32 consuming_idx;
+       u32 unused;
+
+       /* Sub-buffer the bytes came from, clamped to the last one. */
+       consuming_idx = read_offset / reader->rchan->buf_size;
+
+       if (consuming_idx >= reader->rchan->n_bufs)
+               consuming_idx = reader->rchan->n_bufs - 1;
+
+       reader->bytes_consumed += bytes_consumed;
+       
+       unused = reader->rchan->unused_bytes[consuming_idx];
+       
+       /* Reader finished a sub-buffer once its bytes plus the buffer's
+          unused padding reach the sub-buffer size. */
+       if (reader->bytes_consumed + unused >= reader->rchan->buf_size) {
+               reader->bufs_consumed++;
+               reader->bytes_consumed = 0;
+       }
+
+       /* Propagate to the channel only when this reader is now ahead
+          of the channel-wide consumed position. */
+       if ((reader->bufs_consumed > reader->rchan->bufs_consumed) ||
+           ((reader->bufs_consumed == reader->rchan->bufs_consumed) &&
+            (reader->bytes_consumed > reader->rchan->bytes_consumed)))
+               __relay_bytes_consumed(reader->rchan, bytes_consumed, read_offset);
+}
+
+/**
+ *     relay_bytes_consumed - add to the # bytes consumed for the channel
+ *     @reader: channel reader
+ *     @bytes_consumed: number of bytes to add to current count for channel
+ *     @read_offset: where the bytes were consumed from
+ *     
+ *     Adds to the channel's consumed count.  bytes_consumed should be the
+ *     number of bytes actually read e.g. return value of relay_read() and
+ *     the read_offset should be the actual offset the bytes were read from
+ *     e.g. the actual_read_offset set by relay_read(). See
+ *     Documentation/filesystems/relayfs.txt for more details.
+ *
+ *     NOTE: kernel clients don't need to call this function if the reader
+ *     is auto-consuming or the channel is MODE_CONTINUOUS.
+ */
+void
+relay_bytes_consumed(struct rchan_reader *reader, u32 bytes_consumed, u32 read_offset)
+{
+       /* Tolerate a NULL or unattached reader: nothing to account. */
+       if (!reader || !reader->rchan)
+               return;
+
+       __reader_bytes_consumed(reader, bytes_consumed, read_offset);
+}
+
+/**
+ *     update_readers_consumed - apply offset change to reader
+ *     @rchan: the channel
+ *     @bufs_consumed: consumed buffer count to set on each reader
+ *     @bytes_consumed: consumed byte count to set on each reader
+ *
+ *     Apply the consumed counts to all readers open on the channel.
+ *     Also rewinds each reader's file position to 0 and flags the
+ *     offset change so blocked readers can notice it.
+ */
+void
+update_readers_consumed(struct rchan *rchan, u32 bufs_consumed, u32 bytes_consumed)
+{
+       struct list_head *p;
+       struct rchan_reader *reader;
+       
+       read_lock(&rchan->open_readers_lock);
+       list_for_each(p, &rchan->open_readers) {
+               reader = list_entry(p, struct rchan_reader, list);
+               reader->bufs_consumed = bufs_consumed;
+               reader->bytes_consumed = bytes_consumed;
+               /* VFS readers keep their position in the struct file;
+                  kernel readers keep it in the reader itself. */
+               if (reader->vfs_reader) 
+                       reader->pos.file->f_pos = 0;
+               else
+                       reader->pos.f_pos = 0;
+               reader->offset_changed = 1;
+       }
+       read_unlock(&rchan->open_readers_lock);
+}
+
+/**
+ *     do_read - utility function to do the actual read to user
+ *     @rchan: the channel
+ *     @buf: user buf to read into, NULL if just getting info
+ *     @count: bytes requested
+ *     @read_offset: offset into channel
+ *     @new_offset: new offset into channel after read
+ *     @actual_read_offset: read offset actually used
+ *
+ *     Returns the number of bytes read, 0 if none.
+ */
+static ssize_t
+do_read(struct rchan *rchan, char *buf, size_t count, u32 read_offset, u32 *new_offset, u32 *actual_read_offset)
+{
+       u32 read_bufno, cur_bufno;
+       u32 avail_offset, cur_idx, max_offset, buf_end_offset;
+       u32 avail_count, buf_size;
+       int unused_bytes = 0;
+       size_t read_count = 0;
+       u32 last_buf_byte_offset;
+
+       *actual_read_offset = read_offset;
+       
+       buf_size = rchan->buf_size;
+       if (unlikely(!buf_size)) BUG();
+
+       read_bufno = read_offset / buf_size;
+       if (unlikely(read_bufno >= RELAY_MAX_BUFS)) BUG();
+       unused_bytes = rchan->unused_bytes[read_bufno];
+
+       avail_offset = cur_idx = relay_get_offset(rchan, &max_offset);
+
+       if (cur_idx == read_offset) {
+               /* Reader has caught up with the writer.  If the channel
+                  is suspended, step past the current position (wrapping
+                  at max_offset); otherwise there is nothing to read. */
+               if (atomic_read(&rchan->suspended) == 1) {
+                       read_offset += 1;
+                       if (read_offset >= max_offset)
+                               read_offset = 0;
+                       *actual_read_offset = read_offset;
+               } else {
+                       *new_offset = read_offset;
+                       return 0;
+               }
+       } else {
+               /* Positioned on the last byte of a sub-buffer: step into
+                  the next one unless exactly one byte remains unused
+                  there.  NOTE(review): the unused_bytes == 1 special
+                  case is not obvious from this file -- confirm against
+                  the reserve/commit paths. */
+               last_buf_byte_offset = (read_bufno + 1) * buf_size - 1;
+               if (read_offset == last_buf_byte_offset) {
+                       if (unused_bytes != 1) {
+                               read_offset += 1;
+                               if (read_offset >= max_offset)
+                                       read_offset = 0;
+                               *actual_read_offset = read_offset;
+                       }
+               }
+       }
+
+       /* Recompute the sub-buffer after any adjustment above. */
+       read_bufno = read_offset / buf_size;
+       if (unlikely(read_bufno >= RELAY_MAX_BUFS)) BUG();
+       unused_bytes = rchan->unused_bytes[read_bufno];
+
+       cur_bufno = cur_idx / buf_size;
+
+       /* Limit the read to the usable end of this sub-buffer. */
+       buf_end_offset = (read_bufno + 1) * buf_size - unused_bytes;
+       if (avail_offset > buf_end_offset)
+               avail_offset = buf_end_offset;
+       else if (avail_offset < read_offset)
+               avail_offset = buf_end_offset;
+       avail_count = avail_offset - read_offset;
+       read_count = avail_count >= count ? count : avail_count;
+
+       /* buf == NULL means the caller only wants the byte count. */
+       if (read_count && buf != NULL)
+               if (copy_to_user(buf, rchan->buf + read_offset, read_count))
+                       return -EFAULT;
+
+       /* Reading within the writer's current sub-buffer and reaching
+          its usable end: resume exactly at the writer's position. */
+       if (read_bufno == cur_bufno)
+               if (read_count && (read_offset + read_count >= buf_end_offset) && (read_offset + read_count <= cur_idx)) {
+                       *new_offset = cur_idx;
+                       return read_count;
+               }
+
+       /* Otherwise skip the sub-buffer's unused tail, wrapping to 0 at
+          the end of the channel buffer. */
+       if (read_offset + read_count + unused_bytes > max_offset)
+               *new_offset = 0;
+       else if (read_offset + read_count >= buf_end_offset)
+               *new_offset = read_offset + read_count + unused_bytes;
+       else
+               *new_offset = read_offset + read_count;
+
+       return read_count;
+}
+
+/**
+ *     __relay_read - read bytes from channel, relative to current reader pos
+ *     @reader: channel reader
+ *     @buf: user buf to read into, NULL if just getting info
+ *     @count: bytes requested
+ *     @read_offset: offset into channel
+ *     @new_offset: new offset into channel after read
+ *     @actual_read_offset: read offset actually used
+ *     @wait: if non-zero, wait for something to read
+ *
+ *     Internal - see relay_read() for details.
+ *
+ *     Returns the number of bytes read, 0 if none, negative on failure.
+ */
+static ssize_t
+__relay_read(struct rchan_reader *reader, char *buf, size_t count, u32 read_offset, u32 *new_offset, u32 *actual_read_offset, int wait)
+{
+       int err = 0;
+       /* Must be signed: do_read() can return -EFAULT, and the former
+          'size_t' declaration made the 'read_count < 0' test below
+          unconditionally false, masking copy_to_user failures. */
+       ssize_t read_count = 0;
+       struct rchan *rchan = reader->rchan;
+
+       if (!wait && !rchan->initialized)
+               return -EAGAIN;
+
+       if (using_lockless(rchan))
+               read_offset &= idx_mask(rchan);
+
+       /* Offset outside the channel buffer: reset the caller's
+          position before reporting the error. */
+       if (read_offset >= rchan->n_bufs * rchan->buf_size) {
+               *new_offset = 0;
+               if (!wait)
+                       return -EAGAIN;
+               else
+                       return -EINTR;
+       }
+       
+       if (buf != NULL && wait) {
+               /* Block until there is data, the channel is finalized,
+                  or it is suspended. */
+               err = wait_event_interruptible(rchan->read_wait,
+                      ((rchan->finalized == 1) ||
+                       (atomic_read(&rchan->suspended) == 1) ||
+                       (relay_get_offset(rchan, NULL) != read_offset)));
+
+               if (rchan->finalized)
+                       return 0;
+
+               /* Our position was moved (e.g. by a reset) while we
+                  slept; tell the caller to retry. */
+               if (reader->offset_changed) {
+                       reader->offset_changed = 0;
+                       return -EINTR;
+               }
+               
+               if (err)
+                       return err;
+       }
+
+       read_count = do_read(rchan, buf, count, read_offset, new_offset, actual_read_offset);
+
+       if (read_count < 0)
+               err = read_count;
+       
+       if (err)
+               return err;
+       else
+               return read_count;
+}
+
+/**
+ *	relay_read - read bytes from channel, relative to current reader pos
+ *	@reader: channel reader
+ *	@buf: user buf to read into, NULL if just getting info
+ *	@count: bytes requested
+ *	@wait: if non-zero, wait for something to read
+ *	@actual_read_offset: set read offset actually used, must not be NULL
+ *
+ *	Reads count bytes from the channel, or as much as is available within
+ *	the sub-buffer currently being read.  The read offset that will be
+ *	read from is the position contained within the reader object.  If the
+ *	wait flag is set, buf is non-NULL, and there is nothing available,
+ *	it will wait until there is.  If the wait flag is 0 and there is
+ *	nothing available, -EAGAIN is returned.  If buf is NULL, the value
+ *	returned is the number of bytes that would have been read.
+ *	actual_read_offset is the value that should be passed as the read
+ *	offset to relay_bytes_consumed, needed only if the reader is not
+ *	auto-consuming and the channel is MODE_NO_OVERWRITE, but in any case,
+ *	it must not be NULL.  See Documentation/filesystems/relayfs.txt for
+ *	more details.
+ */
+ssize_t
+relay_read(struct rchan_reader *reader, char *buf, size_t count, int wait, u32 *actual_read_offset)
+{
+	u32 new_offset;
+	u32 read_offset;
+	ssize_t read_count;
+	
+	if (reader == NULL || reader->rchan == NULL)
+		return -EBADF;
+
+	if (actual_read_offset == NULL)
+		return -EINVAL;
+
+	/* VFS readers track their position in the file struct; non-VFS
+	 * readers keep a private f_pos in the reader object */
+	if (reader->vfs_reader)
+		read_offset = (u32)(reader->pos.file->f_pos);
+	else
+		read_offset = reader->pos.f_pos;
+	*actual_read_offset = read_offset;
+	
+	read_count = __relay_read(reader, buf, count, read_offset,
+				  &new_offset, actual_read_offset, wait);
+
+	if (read_count < 0)
+		return read_count;
+
+	/* advance the reader's position to wherever the read ended up */
+	if (reader->vfs_reader)
+		reader->pos.file->f_pos = new_offset;
+	else
+		reader->pos.f_pos = new_offset;
+
+	/* auto-consuming readers immediately mark what was read as consumed */
+	if (reader->auto_consume && ((read_count) || (new_offset != read_offset)))
+		__reader_bytes_consumed(reader, read_count, *actual_read_offset);
+
+	if (read_count == 0 && !wait)
+		return -EAGAIN;
+	
+	return read_count;
+}
+
+/**
+ *	relay_bytes_avail - number of bytes available in current sub-buffer
+ *	@reader: channel reader
+ *	
+ *	Returns the number of bytes available relative to the reader's
+ *	current read position within the corresponding sub-buffer, 0 if
+ *	there is nothing available.  See Documentation/filesystems/relayfs.txt
+ *	for more details.
+ */
+ssize_t
+relay_bytes_avail(struct rchan_reader *reader)
+{
+	u32 f_pos;
+	u32 new_offset;
+	u32 actual_read_offset;
+	ssize_t bytes_read;
+	
+	if (reader == NULL || reader->rchan == NULL)
+		return -EBADF;
+	
+	if (reader->vfs_reader)
+		f_pos = (u32)reader->pos.file->f_pos;
+	else
+		f_pos = reader->pos.f_pos;
+	new_offset = f_pos;
+
+	/* NULL buf and wait == 0: __relay_read only computes counts,
+	 * no data is copied and the reader position is not advanced */
+	bytes_read = __relay_read(reader, NULL, reader->rchan->buf_size,
+				  f_pos, &new_offset, &actual_read_offset, 0);
+
+	/* map -EINTR/0 with a moved offset to -EAGAIN, and any other
+	 * error (except -EAGAIN) to "nothing available" */
+	if ((new_offset != f_pos) &&
+	    ((bytes_read == -EINTR) || (bytes_read == 0)))
+		bytes_read = -EAGAIN;
+	else if ((bytes_read < 0) && (bytes_read != -EAGAIN))
+		bytes_read = 0;
+
+	return bytes_read;
+}
+
+/**
+ *	rchan_empty - boolean, is the channel empty wrt reader?
+ *	@reader: channel reader
+ *	
+ *	Returns 1 if the channel is empty, 0 otherwise.
+ */
+int
+rchan_empty(struct rchan_reader *reader)
+{
+	ssize_t avail_count;
+	u32 buffers_ready;
+	struct rchan *rchan = reader->rchan;
+	u32 cur_idx, curbuf_bytes;
+	int mapped;
+
+	/* a suspended channel is full, hence never empty */
+	if (atomic_read(&rchan->suspended) == 1)
+		return 0;
+
+	mapped = atomic_read(&rchan->mapped);
+	
+	/* mapped bulk-delivery readers see data only in whole buffers */
+	if (mapped && bulk_delivery(rchan)) {
+		buffers_ready = rchan->bufs_produced - rchan->bufs_consumed;
+		return buffers_ready ? 0 : 1;
+	}
+
+	if (mapped && packet_delivery(rchan)) {
+		buffers_ready = rchan->bufs_produced - rchan->bufs_consumed;
+		if (buffers_ready)
+			return 0;
+		else {
+			/* no complete buffer ready: empty only if everything
+			 * written into the current buffer has been consumed */
+			cur_idx = relay_get_offset(rchan, NULL);
+			curbuf_bytes = cur_idx % rchan->buf_size;
+			return curbuf_bytes == rchan->bytes_consumed ? 1 : 0;
+		}
+	}
+
+	avail_count = relay_bytes_avail(reader);
+
+	return avail_count ? 0 : 1;
+}
+
+/**
+ *	rchan_full - boolean, is the channel full wrt consuming reader?
+ *	@reader: channel reader
+ *	
+ *	Returns 1 if the channel is full, 0 otherwise.
+ *	A continuous-mode channel overwrites old data and is never full.
+ */
+int
+rchan_full(struct rchan_reader *reader)
+{
+	u32 buffers_ready;
+	struct rchan *rchan = reader->rchan;
+
+	if (mode_continuous(rchan))
+		return 0;
+
+	buffers_ready = rchan->bufs_produced - rchan->bufs_consumed;
+
+	/* full when all but the buffer currently being written are ready */
+	return buffers_ready > reader->rchan->n_bufs - 1 ? 1 : 0;
+}
+
+/**
+ *	relay_info - get status and other information about a relay channel
+ *	@rchan_id: relay channel id
+ *	@rchan_info: pointer to the rchan_info struct to be filled in
+ *	
+ *	Fills in an rchan_info struct with channel status and attribute 
+ *	information.  See Documentation/filesystems/relayfs.txt for details.
+ *
+ *	Returns 0 if successful, negative otherwise.
+ */
+int 
+relay_info(int rchan_id, struct rchan_info *rchan_info)
+{
+	int i;
+	struct rchan *rchan;
+
+	/* rchan_get takes a reference; released by rchan_put below */
+	rchan = rchan_get(rchan_id);
+	if (rchan == NULL)
+		return -EBADF;
+
+	rchan_info->flags = rchan->flags;
+	rchan_info->buf_size = rchan->buf_size;
+	rchan_info->buf_addr = rchan->buf;
+	rchan_info->alloc_size = rchan->alloc_size;
+	rchan_info->n_bufs = rchan->n_bufs;
+	rchan_info->cur_idx = relay_get_offset(rchan, NULL);
+	rchan_info->bufs_produced = rchan->bufs_produced;
+	rchan_info->bufs_consumed = rchan->bufs_consumed;
+	rchan_info->buf_id = rchan->buf_id;
+
+	for (i = 0; i < rchan->n_bufs; i++) {
+		rchan_info->unused_bytes[i] = rchan->unused_bytes[i];
+		/* a lockless sub-buffer is complete once its fill count
+		 * reaches buf_size; the locking scheme has no such counter */
+		if (using_lockless(rchan))
+			rchan_info->buffer_complete[i] = (atomic_read(&fill_count(rchan, i)) == rchan->buf_size);
+		else
+			rchan_info->buffer_complete[i] = 0;
+	}
+
+	rchan_put(rchan);
+
+	return 0;
+}
+
+/**
+ *	__add_rchan_reader - creates and adds a reader to a channel
+ *	@rchan: relay channel
+ *	@filp: the file associated with rchan, if applicable
+ *	@auto_consume: boolean, whether reader's reads automatically consume
+ *	@map_reader: boolean, whether reader's reading via a channel mapping
+ *
+ *	Returns a pointer to the reader object created, NULL if unsuccessful
+ *	(i.e. the kmalloc failed).
+ *
+ *	Creates and initializes an rchan_reader object for reading the channel.
+ *	If filp is non-NULL, the reader is a VFS reader, otherwise not.
+ *
+ *	If the reader is a map reader, it isn't considered a VFS reader for
+ *	our purposes.  Also, map_readers can't be auto-consuming.
+ */
+struct rchan_reader *
+__add_rchan_reader(struct rchan *rchan, struct file *filp, int auto_consume, int map_reader)
+{
+	struct rchan_reader *reader;
+	u32 will_read;
+	
+	reader = kmalloc(sizeof(struct rchan_reader), GFP_KERNEL);
+
+	if (reader) {
+		write_lock(&rchan->open_readers_lock);
+		reader->rchan = rchan;
+		if (filp) {
+			reader->vfs_reader = 1;
+			reader->pos.file = filp;
+		} else {
+			reader->vfs_reader = 0;
+			reader->pos.f_pos = 0;
+		}
+		reader->map_reader = map_reader;
+		reader->auto_consume = auto_consume;
+
+		if (!map_reader) {
+			/* start the reader just behind the buffers already
+			 * produced, so existing unread data stays readable */
+			will_read = rchan->bufs_produced % rchan->n_bufs;
+			if (!will_read && atomic_read(&rchan->suspended))
+				will_read = rchan->n_bufs;
+			reader->bufs_consumed = rchan->bufs_produced - will_read;
+			rchan->bufs_consumed = reader->bufs_consumed;
+			rchan->bytes_consumed = reader->bytes_consumed = 0;
+			reader->offset_changed = 0;
+		}
+		
+		list_add(&reader->list, &rchan->open_readers);
+		write_unlock(&rchan->open_readers_lock);
+	}
+
+	return reader;
+}
+
+/**
+ *	add_rchan_reader - create a reader for a channel
+ *	@rchan_id: relay channel handle
+ *	@auto_consume: boolean, whether reader's reads automatically consume
+ *
+ *	Returns a pointer to the reader object created, NULL if unsuccessful
+ *
+ *	Creates and initializes an rchan_reader object for reading the channel.
+ *	This function is useful only for non-VFS readers.
+ *
+ *	NOTE(review): the rchan_get() reference taken here appears to be
+ *	dropped in remove_rchan_reader() -- confirm callers always pair them.
+ */
+struct rchan_reader *
+add_rchan_reader(int rchan_id, int auto_consume)
+{
+	struct rchan *rchan = rchan_get(rchan_id);
+	if (rchan == NULL)
+		return NULL;
+
+	return __add_rchan_reader(rchan, NULL, auto_consume, 0);
+}
+
+/**
+ *	add_map_reader - create a map reader for a channel
+ *	@rchan_id: relay channel handle
+ *
+ *	Returns a pointer to the reader object created, NULL if unsuccessful
+ *
+ *	Creates and initializes an rchan_reader object for reading the channel.
+ *	This function is useful only for map readers.  Map readers are never
+ *	auto-consuming (auto_consume is passed as 0).
+ */
+struct rchan_reader *
+add_map_reader(int rchan_id)
+{
+	struct rchan *rchan = rchan_get(rchan_id);
+	if (rchan == NULL)
+		return NULL;
+
+	return __add_rchan_reader(rchan, NULL, 0, 1);
+}
+
+/**
+ *	__remove_rchan_reader - destroy a channel reader
+ *	@reader: channel reader
+ *
+ *	Internal - removes reader from the open readers list, and frees it.
+ *	If the reader is not on the list, nothing is removed or freed.
+ */
+void
+__remove_rchan_reader(struct rchan_reader *reader)
+{
+	struct list_head *p;
+	struct rchan_reader *found_reader = NULL;
+	
+	write_lock(&reader->rchan->open_readers_lock);
+	list_for_each(p, &reader->rchan->open_readers) {
+		struct rchan_reader *cur = list_entry(p, struct rchan_reader, list);
+		/* BUGFIX: only remember the entry on an exact match.
+		 * Previously found_reader was assigned unconditionally each
+		 * iteration, so if the reader wasn't on the list the last
+		 * entry examined was freed instead. */
+		if (cur == reader) {
+			found_reader = cur;
+			list_del(&found_reader->list);
+			break;
+		}
+	}
+	write_unlock(&reader->rchan->open_readers_lock);
+
+	if (found_reader)
+		kfree(found_reader);
+}
+
+/**
+ *	remove_rchan_reader - destroy a channel reader
+ *	@reader: channel reader
+ *
+ *	Finds and removes the given reader from the channel.  This function
+ *	is useful only for non-VFS readers.
+ *
+ *	Returns 0 if successful, negative otherwise.
+ *
+ *	NOTE(review): rchan_put() drops the channel reference *before*
+ *	__remove_rchan_reader() dereferences reader->rchan to take the
+ *	open_readers_lock -- confirm the channel cannot be freed by the
+ *	put, otherwise this is a potential use-after-free.
+ */
+int 
+remove_rchan_reader(struct rchan_reader *reader)
+{
+	int err = 0;
+	
+	if (reader) {
+		rchan_put(reader->rchan);
+		__remove_rchan_reader(reader);
+	} else
+		err = -EINVAL;
+
+	return err;
+}
+
+/**
+ *	remove_map_reader - destroy a map reader
+ *	@reader: channel reader
+ *
+ *	Finds and removes the given map reader from the channel.  This function
+ *	is useful only for map readers.  Thin wrapper: identical in behavior
+ *	to remove_rchan_reader(), provided for API symmetry with
+ *	add_map_reader().
+ *
+ *	Returns 0 if successful, negative otherwise.
+ */
+int 
+remove_map_reader(struct rchan_reader *reader)
+{
+	return remove_rchan_reader(reader);
+}
+
+/* Public relay API, exported for use by kernel modules */
+EXPORT_SYMBOL(relay_open);
+EXPORT_SYMBOL(relay_close);
+EXPORT_SYMBOL(relay_reset);
+EXPORT_SYMBOL(relay_reserve);
+EXPORT_SYMBOL(relay_commit);
+EXPORT_SYMBOL(relay_read);
+EXPORT_SYMBOL(relay_write);
+EXPORT_SYMBOL(relay_bytes_avail);
+EXPORT_SYMBOL(relay_buffers_consumed);
+EXPORT_SYMBOL(relay_bytes_consumed);
+EXPORT_SYMBOL(relay_info);
+EXPORT_SYMBOL(relay_discard_init_buf);
+
+
diff --git a/fs/relayfs/relay_locking.c b/fs/relayfs/relay_locking.c
new file mode 100644 (file)
index 0000000..718f149
--- /dev/null
@@ -0,0 +1,322 @@
+/*
+ * RelayFS locking scheme implementation.
+ *
+ * Copyright (C) 1999, 2000, 2001, 2002 - Karim Yaghmour (karim@opersys.com)
+ * Copyright (C) 2002, 2003 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
+ *
+ * This file is released under the GPL.
+ */
+
+#include <asm/relay.h>
+#include "relay_locking.h"
+#include "resize.h"
+
+/**
+ *	switch_buffers - switches between read and write buffers.
+ *	@cur_time: current time.
+ *	@cur_tsc: the TSC associated with current_time, if applicable
+ *	@rchan: the channel
+ *	@finalizing: if true, don't start a new buffer 
+ *	@resetting: if true, reset the channel back to its first buffer and
+ *		mark everything produced as consumed
+ *	@finalize_buffer_only: if true, only finalize the current buffer,
+ *		don't switch to a new one
+ *
+ *	This should be called with interrupts disabled.
+ */
+static void 
+switch_buffers(struct timeval cur_time,
+	       u32 cur_tsc,
+	       struct rchan *rchan,
+	       int finalizing,
+	       int resetting,
+	       int finalize_buffer_only)
+{
+	char *chan_buf_end;
+	int bytes_written;
+
+	/* half_switch set means the buffer-end event was already written
+	 * by a previous partial switch; don't write it twice */
+	if (!rchan->half_switch) {
+		bytes_written = rchan->callbacks->buffer_end(rchan->id,
+			     cur_write_pos(rchan), write_buf_end(rchan),
+			     cur_time, cur_tsc, using_tsc(rchan));
+		if (bytes_written == 0)
+			rchan->unused_bytes[rchan->buf_idx % rchan->n_bufs] = 
+				write_buf_end(rchan) - cur_write_pos(rchan);
+	}
+
+	if (finalize_buffer_only) {
+		rchan->bufs_produced++;
+		return;
+	}
+	
+	/* advance to the next sub-buffer, wrapping (or rewinding on reset) */
+	chan_buf_end = rchan->buf + rchan->n_bufs * rchan->buf_size;
+	if((write_buf(rchan) + rchan->buf_size >= chan_buf_end) || resetting)
+		write_buf(rchan) = rchan->buf;
+	else
+		write_buf(rchan) += rchan->buf_size;
+	write_buf_end(rchan) = write_buf(rchan) + rchan->buf_size;
+	write_limit(rchan) = write_buf_end(rchan) - rchan->end_reserve;
+	cur_write_pos(rchan) = write_buf(rchan);
+
+	rchan->buf_start_time = cur_time;
+	rchan->buf_start_tsc = cur_tsc;
+
+	if (resetting)
+		rchan->buf_idx = 0;
+	else
+		rchan->buf_idx++;
+	rchan->buf_id++;
+
+	if (!packet_delivery(rchan))
+		rchan->unused_bytes[rchan->buf_idx % rchan->n_bufs] = 0;
+
+	if (resetting) {
+		/* round bufs_produced up to a multiple of n_bufs and mark
+		 * it all consumed, so readers see an empty channel */
+		rchan->bufs_produced = rchan->bufs_produced + rchan->n_bufs;
+		rchan->bufs_produced -= rchan->bufs_produced % rchan->n_bufs;
+		rchan->bufs_consumed = rchan->bufs_produced;
+		rchan->bytes_consumed = 0;
+		update_readers_consumed(rchan, rchan->bufs_consumed, rchan->bytes_consumed);
+	} else if (!rchan->half_switch)
+		rchan->bufs_produced++;
+
+	rchan->half_switch = 0;
+	
+	if (!finalizing) {
+		bytes_written = rchan->callbacks->buffer_start(rchan->id, cur_write_pos(rchan), rchan->buf_id, cur_time, cur_tsc, using_tsc(rchan));
+		cur_write_pos(rchan) += bytes_written;
+	}
+}
+
+/**
+ *	locking_reserve - reserve a slot in the buffer for an event.
+ *	@rchan: the channel
+ *	@slot_len: the length of the slot to reserve
+ *	@ts: variable that will receive the time the slot was reserved
+ *	@tsc: the timestamp counter associated with time
+ *	@err: receives the result flags
+ *	@interrupting: if this write is interrupting another, set to non-zero 
+ *
+ *	Returns pointer to the beginning of the reserved slot, NULL if error.
+ *
+ *	The err value contains the result flags and is an ORed combination 
+ *	of the following:
+ *
+ *	RELAY_BUFFER_SWITCH_NONE - no buffer switch occurred
+ *	RELAY_BUFFER_SWITCH - buffer switch occurred
+ *	RELAY_WRITE_DISCARD - write should be discarded (all buffers are full)
+ *	RELAY_WRITE_TOO_LONG - write won't fit into even an empty buffer
+ */
+inline char *
+locking_reserve(struct rchan *rchan,
+		u32 slot_len,
+		struct timeval *ts,
+		u32 *tsc,
+		int *err,
+		int *interrupting)
+{
+	u32 buffers_ready;
+	int bytes_written;
+
+	*err = RELAY_BUFFER_SWITCH_NONE;
+
+	if (slot_len >= rchan->buf_size) {
+		*err = RELAY_WRITE_DISCARD | RELAY_WRITE_TOO_LONG;
+		return NULL;
+	}
+
+	/* first write to the channel: emit the buffer-start event */
+	if (rchan->initialized == 0) {
+		rchan->initialized = 1;
+		get_timestamp(&rchan->buf_start_time, 
+			      &rchan->buf_start_tsc, rchan);
+		rchan->unused_bytes[0] = 0;
+		bytes_written = rchan->callbacks->buffer_start(
+			rchan->id, cur_write_pos(rchan), 
+			rchan->buf_id, rchan->buf_start_time, 
+			rchan->buf_start_tsc, using_tsc(rchan));
+		cur_write_pos(rchan) += bytes_written;
+		*tsc = get_time_delta(ts, rchan);
+		return cur_write_pos(rchan);
+	}
+
+	*tsc = get_time_delta(ts, rchan);
+
+	/* another write is already in progress; reserve our slot past it
+	 * and note that we're interrupting */
+	if (in_progress_event_size(rchan)) {
+		interrupted_pos(rchan) = cur_write_pos(rchan);
+		cur_write_pos(rchan) = in_progress_event_pos(rchan) 
+			+ in_progress_event_size(rchan) 
+			+ interrupting_size(rchan);
+		*interrupting = 1;
+	} else {
+		in_progress_event_pos(rchan) = cur_write_pos(rchan);
+		in_progress_event_size(rchan) = slot_len;
+		interrupting_size(rchan) = 0;
+	}
+
+	if (cur_write_pos(rchan) + slot_len > write_limit(rchan)) {
+		/* channel suspended: all buffers full, drop this write */
+		if (atomic_read(&rchan->suspended) == 1) {
+			in_progress_event_pos(rchan) = NULL;
+			in_progress_event_size(rchan) = 0;
+			interrupting_size(rchan) = 0;
+			*err = RELAY_WRITE_DISCARD;
+			return NULL;
+		}
+
+		buffers_ready = rchan->bufs_produced - rchan->bufs_consumed;
+		if (buffers_ready == rchan->n_bufs - 1) {
+			if (!mode_continuous(rchan)) {
+				/* last free buffer: suspend the channel,
+				 * finalize the current buffer only, and
+				 * leave a half-switched state behind */
+				atomic_set(&rchan->suspended, 1);
+				in_progress_event_pos(rchan) = NULL;
+				in_progress_event_size(rchan) = 0;
+				interrupting_size(rchan) = 0;
+				get_timestamp(ts, tsc, rchan);
+				switch_buffers(*ts, *tsc, rchan, 0, 0, 1);
+				recalc_time_delta(ts, tsc, rchan);
+				rchan->half_switch = 1;
+
+				cur_write_pos(rchan) = write_buf_end(rchan) - 1;
+				*err = RELAY_BUFFER_SWITCH | RELAY_WRITE_DISCARD;
+				return NULL;
+			}
+		}
+
+		get_timestamp(ts, tsc, rchan);
+		switch_buffers(*ts, *tsc, rchan, 0, 0, 0);
+		recalc_time_delta(ts, tsc, rchan);
+		*err = RELAY_BUFFER_SWITCH;
+	}
+
+	return cur_write_pos(rchan);
+}
+
+/**
+ *	locking_commit - commit a reserved slot in the buffer
+ *	@rchan: the channel
+ *	@from: commit the length starting here
+ *	@len: length committed
+ *	@deliver: boolean, if non-zero call the deliver callback
+ *	@interrupting: non-zero if this commit ends a write that interrupted
+ *		another in-progress write (as flagged by locking_reserve)
+ *
+ *      Commits len bytes and calls deliver callback if applicable.
+ */
+inline void
+locking_commit(struct rchan *rchan,
+	       char *from,
+	       u32 len, 
+	       int deliver, 
+	       int interrupting)
+{
+	cur_write_pos(rchan) += len;
+	
+	if (interrupting) {
+		/* restore the interrupted write's position; account our
+		 * bytes so it can skip over them when it commits */
+		cur_write_pos(rchan) = interrupted_pos(rchan);
+		interrupting_size(rchan) += len;
+	} else {
+		in_progress_event_size(rchan) = 0;
+		if (interrupting_size(rchan)) {
+			cur_write_pos(rchan) += interrupting_size(rchan);
+			interrupting_size(rchan) = 0;
+		}
+	}
+
+	if (deliver) {
+		/* bulk delivery hands over the whole sub-buffer written so
+		 * far, not just this event */
+		if (bulk_delivery(rchan)) {
+			u32 cur_idx = cur_write_pos(rchan) - rchan->buf;
+			u32 cur_bufno = cur_idx / rchan->buf_size;
+			from = rchan->buf + cur_bufno * rchan->buf_size;
+			len = cur_idx - cur_bufno * rchan->buf_size;
+		}
+		rchan->callbacks->deliver(rchan->id, from, len);
+		expand_check(rchan);
+	}
+}
+
+/**
+ *	locking_finalize: - finalize last buffer at end of channel use
+ *	@rchan: the channel
+ *
+ *	Writes the final buffer-end event with irqs disabled.
+ */
+inline void 
+locking_finalize(struct rchan *rchan)
+{
+	unsigned long int flags;
+	struct timeval time;
+	u32 tsc;
+
+	local_irq_save(flags);
+	get_timestamp(&time, &tsc, rchan);
+	/* finalizing == 1: close the current buffer, don't start a new one */
+	switch_buffers(time, tsc, rchan, 1, 0, 0);
+	local_irq_restore(flags);
+}
+
+/**
+ *	locking_get_offset - get current and max 'file' offsets for VFS
+ *	@rchan: the channel
+ *	@max_offset: maximum channel offset, filled in if non-NULL
+ *
+ *	Returns the current and maximum buffer offsets in VFS terms.
+ */
+u32
+locking_get_offset(struct rchan *rchan,
+		   u32 *max_offset)
+{
+	if (max_offset)
+		*max_offset = rchan->buf_size * rchan->n_bufs - 1;
+
+	return cur_write_pos(rchan) - rchan->buf;
+}
+
+/**
+ *	locking_reset - reset the channel
+ *	@rchan: the channel
+ *	@init: 1 if this is a first-time channel initialization
+ *
+ *	Rewinds the write state to the start of the first sub-buffer and
+ *	clears any in-progress/interrupted write bookkeeping.
+ */
+void locking_reset(struct rchan *rchan, int init)
+{
+	if (init)
+		channel_lock(rchan) = SPIN_LOCK_UNLOCKED;
+	write_buf(rchan) = rchan->buf;
+	write_buf_end(rchan) = write_buf(rchan) + rchan->buf_size;
+	cur_write_pos(rchan) = write_buf(rchan);
+	write_limit(rchan) = write_buf_end(rchan) - rchan->end_reserve;
+	in_progress_event_pos(rchan) = NULL;
+	in_progress_event_size(rchan) = 0;
+	interrupted_pos(rchan) = NULL;
+	interrupting_size(rchan) = 0;
+}
+
+/**
+ *	locking_reset_index - atomically set channel index to the beginning
+ *	@rchan: the channel
+ *	@old_idx: the channel index the caller last observed
+ *
+ *	If this fails, it means that something else just logged something
+ *	and therefore we probably no longer want to do this.  It's up to the
+ *	caller anyway...
+ *
+ *	Returns 0 if the index was successfully set, negative otherwise
+ */
+int
+locking_reset_index(struct rchan *rchan, u32 old_idx)
+{
+	unsigned long flags;
+	struct timeval time;
+	u32 tsc;
+	u32 cur_idx;
+	
+	relay_lock_channel(rchan, flags);
+	/* compare-and-switch: bail out if a write moved the index since
+	 * the caller sampled old_idx */
+	cur_idx = locking_get_offset(rchan, NULL);
+	if (cur_idx != old_idx) {
+		relay_unlock_channel(rchan, flags);
+		return -1;
+	}
+
+	get_timestamp(&time, &tsc, rchan);
+	/* resetting == 1: rewind to the first sub-buffer */
+	switch_buffers(time, tsc, rchan, 0, 1, 0);
+
+	relay_unlock_channel(rchan, flags);
+
+	return 0;
+}
+
+
+
+
+
+
+
diff --git a/fs/relayfs/relay_locking.h b/fs/relayfs/relay_locking.h
new file mode 100644 (file)
index 0000000..3dde7df
--- /dev/null
@@ -0,0 +1,34 @@
+#ifndef _RELAY_LOCKING_H
+#define _RELAY_LOCKING_H
+
+/* Interface of the RelayFS "locking" write scheme (relay_locking.c). */
+
+extern char *
+locking_reserve(struct rchan *rchan,
+		u32 slot_len, 
+		struct timeval *time_stamp,
+		u32 *tsc,
+		int *err,
+		int *interrupting);
+
+extern void 
+locking_commit(struct rchan *rchan,
+	       char *from,
+	       u32 len, 
+	       int deliver, 
+	       int interrupting);
+
+/* NOTE(review): declared here but no definition is visible in
+ * relay_locking.c -- confirm locking_resume exists somewhere. */
+extern void 
+locking_resume(struct rchan *rchan);
+
+extern void 
+locking_finalize(struct rchan *rchan);
+
+extern u32 
+locking_get_offset(struct rchan *rchan, u32 *max_offset);
+
+extern void 
+locking_reset(struct rchan *rchan, int init);
+
+extern int
+locking_reset_index(struct rchan *rchan, u32 old_idx);
+
+#endif /* _RELAY_LOCKING_H */
diff --git a/fs/relayfs/relay_lockless.c b/fs/relayfs/relay_lockless.c
new file mode 100644 (file)
index 0000000..98524bf
--- /dev/null
@@ -0,0 +1,541 @@
+/*
+ * RelayFS lockless scheme implementation.
+ *
+ * Copyright (C) 1999, 2000, 2001, 2002 - Karim Yaghmour (karim@opersys.com)
+ * Copyright (C) 2002, 2003 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
+ * Copyright (C) 2002, 2003 - Bob Wisniewski (bob@watson.ibm.com), IBM Corp
+ *
+ * This file is released under the GPL.
+ */
+
+#include <asm/relay.h>
+#include "relay_lockless.h"
+#include "resize.h"
+
+/**
+ *	compare_and_store_volatile - atomic compare-and-swap on *ptr
+ *	@ptr: ptr to the word that will receive the new value
+ *	@oval: the value we think is currently in *ptr
+ *	@nval: the value *ptr will get if we were right
+ *
+ *	Returns non-zero if the swap happened (i.e. *ptr held oval).
+ */
+inline int 
+compare_and_store_volatile(volatile u32 *ptr, 
+			   u32 oval,
+			   u32 nval)
+{
+	u32 prev;
+
+	/* barriers keep the compiler from reordering accesses around
+	 * the cmpxchg */
+	barrier();
+	prev = cmpxchg(ptr, oval, nval);
+	barrier();
+
+	return (prev == oval);
+}
+
+/**
+ *	atomic_set_volatile - atomically set the value in ptr to nval.
+ *	@ptr: ptr to the word that will receive the new value
+ *	@nval: the new value
+ */
+inline void 
+atomic_set_volatile(atomic_t *ptr,
+		    u32 nval)
+{
+	/* compiler barriers on either side prevent reordering */
+	barrier();
+	atomic_set(ptr, (int)nval);
+	barrier();
+}
+
+/**
+ *	atomic_add_volatile - atomically add val to the value at ptr.
+ *	@ptr: ptr to the word that will receive the addition
+ *	@val: the value to add to *ptr
+ */
+inline void 
+atomic_add_volatile(atomic_t *ptr, u32 val)
+{
+	/* compiler barriers on either side prevent reordering */
+	barrier();
+	atomic_add((int)val, ptr);
+	barrier();
+}
+
+/**
+ *	atomic_sub_volatile - atomically subtract val from the value at ptr.
+ *	@ptr: ptr to the word that will receive the subtraction
+ *	@val: the value to subtract from *ptr
+ */
+inline void 
+atomic_sub_volatile(atomic_t *ptr, s32 val)
+{
+	/* compiler barriers on either side prevent reordering */
+	barrier();
+	atomic_sub((int)val, ptr);
+	barrier();
+}
+
+/**
+ *	lockless_commit - commit a reserved slot in the buffer
+ *	@rchan: the channel
+ *	@from: commit the length starting here
+ *	@len: length committed
+ *	@deliver: boolean, if non-zero call the deliver callback
+ *	@interrupting: not used
+ *
+ *      Commits len bytes and calls deliver callback if applicable.
+ */
+inline void 
+lockless_commit(struct rchan *rchan,
+		char *from,
+		u32 len, 
+		int deliver, 
+		int interrupting)
+{
+	u32 bufno, idx;
+	
+	idx = from - rchan->buf;
+
+	/* account the committed bytes to the sub-buffer's fill count */
+	if (len > 0) {
+		bufno = RELAY_BUFNO_GET(idx, offset_bits(rchan));
+		atomic_add_volatile(&fill_count(rchan, bufno), len);
+	}
+
+	if (deliver) {
+		u32 mask = offset_mask(rchan);
+		/* bulk delivery hands over the sub-buffer from its start
+		 * through the end of this commit */
+		if (bulk_delivery(rchan)) {
+			from = rchan->buf + RELAY_BUF_OFFSET_CLEAR(idx, mask);
+			len += RELAY_BUF_OFFSET_GET(idx, mask);
+		}
+		rchan->callbacks->deliver(rchan->id, from, len);
+		expand_check(rchan);
+	}
+}
+
+/**
+ *	get_buffer_end - get the address of the end of buffer 
+ *	@rchan: the channel
+ *	@buf_idx: index into channel corresponding to address
+ *
+ *	Returns one past the last byte of the sub-buffer containing buf_idx.
+ */
+static inline char * 
+get_buffer_end(struct rchan *rchan, u32 buf_idx)
+{
+	return rchan->buf
+		+ RELAY_BUF_OFFSET_CLEAR(buf_idx, offset_mask(rchan))
+		+ RELAY_BUF_SIZE(offset_bits(rchan));
+}
+
+
+/**
+ *	finalize_buffer - utility function consolidating end-of-buffer tasks.
+ *	@rchan: the channel
+ *	@end_idx: index into buffer to write the end-buffer event at
+ *	@size_lost: number of unused bytes at the end of the buffer
+ *	@time_stamp: the time of the end-buffer event
+ *	@tsc: the timestamp counter associated with time
+ *	@resetting: are we resetting the channel?
+ *
+ *	This function must be called with local irqs disabled.
+ */
+static inline void 
+finalize_buffer(struct rchan *rchan,
+		u32 end_idx,
+		u32 size_lost, 
+		struct timeval *time_stamp,
+		u32 *tsc, 
+		int resetting)
+{
+	char* cur_write_pos;
+	char* write_buf_end;
+	u32 bufno;
+	int bytes_written;
+	
+	cur_write_pos = rchan->buf + end_idx;
+	/* end_idx - 1 keeps us inside the buffer being finalized even when
+	 * end_idx sits exactly on a sub-buffer boundary */
+	write_buf_end = get_buffer_end(rchan, end_idx - 1);
+
+	bytes_written = rchan->callbacks->buffer_end(rchan->id, cur_write_pos, 
+		     write_buf_end, *time_stamp, *tsc, using_tsc(rchan));
+	if (bytes_written == 0)
+		rchan->unused_bytes[rchan->buf_idx % rchan->n_bufs] = size_lost;
+	
+        bufno = RELAY_BUFNO_GET(end_idx, offset_bits(rchan));
+        atomic_add_volatile(&fill_count(rchan, bufno), size_lost);
+	if (resetting) {
+		/* round bufs_produced up to a multiple of n_bufs and mark
+		 * it all consumed, so readers see an empty channel */
+		rchan->bufs_produced = rchan->bufs_produced + rchan->n_bufs;
+		rchan->bufs_produced -= rchan->bufs_produced % rchan->n_bufs;
+		rchan->bufs_consumed = rchan->bufs_produced;
+		rchan->bytes_consumed = 0;
+		update_readers_consumed(rchan, rchan->bufs_consumed, rchan->bytes_consumed);
+	} else
+		rchan->bufs_produced++;
+}
+
+/**
+ *	lockless_finalize: - finalize last buffer at end of channel use
+ *	@rchan: the channel
+ *
+ *	Writes the final buffer-end event with irqs disabled.
+ */
+inline void
+lockless_finalize(struct rchan *rchan)
+{
+	u32 event_end_idx;
+	u32 size_lost;
+	unsigned long int flags;
+	struct timeval time;
+	u32 tsc;
+
+	/* size_lost: bytes left between the current write offset and the
+	 * end of the current sub-buffer */
+	event_end_idx = RELAY_BUF_OFFSET_GET(idx(rchan), offset_mask(rchan));
+	size_lost = RELAY_BUF_SIZE(offset_bits(rchan)) - event_end_idx;
+
+	local_irq_save(flags);
+	get_timestamp(&time, &tsc, rchan);
+	finalize_buffer(rchan, idx(rchan) & idx_mask(rchan), size_lost, 
+			&time, &tsc, 0);
+	local_irq_restore(flags);
+}
+
+/**
+ *	discard_check: - determine whether a write should be discarded
+ *	@rchan: the channel
+ *	@old_idx: index into buffer where check for space should begin
+ *	@write_len: the length of the write to check
+ *	@time_stamp: the time of the end-buffer event
+ *	@tsc: the timestamp counter associated with time
+ *
+ *	The return value contains the result flags and is an ORed combination 
+ *	of the following:
+ *
+ *	RELAY_WRITE_DISCARD_NONE - write should not be discarded
+ *	RELAY_BUFFER_SWITCH - buffer switch occurred
+ *	RELAY_WRITE_DISCARD - write should be discarded (all buffers are full)
+ *	RELAY_WRITE_TOO_LONG - write won't fit into even an empty buffer
+ */
+static inline int
+discard_check(struct rchan *rchan,
+	      u32 old_idx,
+	      u32 write_len, 
+	      struct timeval *time_stamp,
+	      u32 *tsc)
+{
+	u32 buffers_ready;
+	u32 offset_mask = offset_mask(rchan);
+	u8 offset_bits = offset_bits(rchan);
+	u32 idx_mask = idx_mask(rchan);
+	u32 size_lost;
+	unsigned long int flags;
+
+	if (write_len > RELAY_BUF_SIZE(offset_bits))
+		return RELAY_WRITE_DISCARD | RELAY_WRITE_TOO_LONG;
+
+	/* continuous mode overwrites old data and never discards */
+	if (mode_continuous(rchan))
+		return RELAY_WRITE_DISCARD_NONE;
+	
+	local_irq_save(flags);
+	if (atomic_read(&rchan->suspended) == 1) {
+		local_irq_restore(flags);
+		return RELAY_WRITE_DISCARD;
+	}
+	/* a half-switch is pending; let the write proceed into it */
+	if (rchan->half_switch) {
+		local_irq_restore(flags);
+		return RELAY_WRITE_DISCARD_NONE;
+	}
+	buffers_ready = rchan->bufs_produced - rchan->bufs_consumed;
+	if (buffers_ready == rchan->n_bufs - 1) {
+		/* last free buffer: suspend, finalize the current buffer,
+		 * and park the index at the end of this sub-buffer */
+		atomic_set(&rchan->suspended, 1);
+		size_lost = RELAY_BUF_SIZE(offset_bits)
+			- RELAY_BUF_OFFSET_GET(old_idx, offset_mask);
+		finalize_buffer(rchan, old_idx & idx_mask, size_lost, 
+				time_stamp, tsc, 0);
+		rchan->half_switch = 1;
+		idx(rchan) = RELAY_BUF_OFFSET_CLEAR((old_idx & idx_mask), offset_mask(rchan)) + RELAY_BUF_SIZE(offset_bits) - 1;
+		local_irq_restore(flags);
+
+		return RELAY_BUFFER_SWITCH | RELAY_WRITE_DISCARD;
+	}
+	local_irq_restore(flags);
+
+	return RELAY_WRITE_DISCARD_NONE;
+}
+
+/**
+ *	switch_buffers - switch over to a new sub-buffer
+ *	@rchan: the channel
+ *	@slot_len: the length of the slot needed for the current write
+ *	@offset: the offset calculated for the new index
+ *	@ts: timestamp
+ *	@tsc: the timestamp counter associated with time
+ *	@new_idx: the new calculated value of the buffer control index
+ *	@old_idx: the value of the buffer control index when we were called
+ *	@resetting: are we resetting the channel?
+ */
+static inline void
+switch_buffers(struct rchan *rchan,
+	       u32 slot_len,
+	       u32 offset,
+	       struct timeval *ts,
+	       u32 *tsc,
+	       u32 new_idx,
+	       u32 old_idx,
+	       int resetting)
+{
+	u32 size_lost = rchan->end_reserve;
+	unsigned long int flags;
+	u32 idx_mask = idx_mask(rchan);
+	u8 offset_bits = offset_bits(rchan);
+	char *cur_write_pos;
+	u32 new_buf_no;
+	u32 start_reserve = rchan->start_reserve;
+	
+	/* On reset, everything after old_idx in the sub-buffer is lost. */
+	if (resetting)
+		size_lost = RELAY_BUF_SIZE(offset_bits(rchan)) - old_idx % rchan->buf_size;
+
+	if (offset > 0)
+		size_lost += slot_len - offset;
+	else
+		old_idx += slot_len;
+
+	/* Finalize the old sub-buffer unless discard_check() already did
+	   (half_switch), and record the new sub-buffer's start time. */
+	local_irq_save(flags);
+	if (!rchan->half_switch)
+		finalize_buffer(rchan, old_idx & idx_mask, size_lost,
+				ts, tsc, resetting);
+	rchan->half_switch = 0;
+	rchan->buf_start_time = *ts;
+	rchan->buf_start_tsc = *tsc;
+	local_irq_restore(flags);
+
+	cur_write_pos = rchan->buf + RELAY_BUF_OFFSET_CLEAR((new_idx
+					     & idx_mask), offset_mask(rchan));
+	if (resetting)
+		rchan->buf_idx = 0;
+	else
+		rchan->buf_idx++;
+	rchan->buf_id++;
+	
+	rchan->unused_bytes[rchan->buf_idx % rchan->n_bufs] = 0;
+
+	rchan->callbacks->buffer_start(rchan->id, cur_write_pos, 
+			       rchan->buf_id, *ts, *tsc, using_tsc(rchan));
+	/* The new sub-buffer is empty apart from its start reserve;
+	   adjust its fill count accordingly (never below the reserve). */
+	new_buf_no = RELAY_BUFNO_GET(new_idx & idx_mask, offset_bits);
+	atomic_sub_volatile(&fill_count(rchan, new_buf_no),
+			    RELAY_BUF_SIZE(offset_bits) - start_reserve);
+	if (atomic_read(&fill_count(rchan, new_buf_no)) < start_reserve)
+		atomic_set_volatile(&fill_count(rchan, new_buf_no), 
+				    start_reserve);
+}
+
+/**
+ *	lockless_reserve_slow - the slow reserve path in the lockless scheme
+ *	@rchan: the channel
+ *	@slot_len: the length of the slot to reserve
+ *	@ts: variable that will receive the time the slot was reserved
+ *	@tsc: the timestamp counter associated with time
+ *	@old_idx: the value of the buffer control index when we were called
+ *	@err: receives the result flags
+ *
+ *	Returns pointer to the beginning of the reserved slot, NULL if error.
+ *
+ *	err values same as for lockless_reserve.
+ */
+static inline char *
+lockless_reserve_slow(struct rchan *rchan,
+		      u32 slot_len,
+		      struct timeval *ts,
+		      u32 *tsc,
+		      u32 old_idx,
+		      int *err)
+{
+	u32 new_idx, offset;
+	unsigned long int flags;
+	u32 offset_mask = offset_mask(rchan);
+	u32 idx_mask = idx_mask(rchan);
+	u32 start_reserve = rchan->start_reserve;
+	u32 end_reserve = rchan->end_reserve;
+	int discard_event;
+	u32 reserved_idx;
+	char *cur_write_pos;
+	int initializing = 0;
+
+	*err = RELAY_BUFFER_SWITCH_NONE;
+
+	/* Bail out early if the write must be discarded (channel
+	   suspended, all buffers full, or write too long). */
+	discard_event = discard_check(rchan, old_idx, slot_len, ts, tsc);
+	if (discard_event != RELAY_WRITE_DISCARD_NONE) {
+		*err = discard_event;
+		return NULL;
+	}
+
+	/* First write to this channel: start past the reserved space. */
+	local_irq_save(flags);
+	if (rchan->initialized == 0) {
+		rchan->initialized = initializing = 1;
+		idx(rchan) = rchan->start_reserve + rchan->rchan_start_reserve;
+	}
+	local_irq_restore(flags);
+
+	/* Claim a slot via compare-and-store; recompute the target index
+	   each iteration in case a concurrent writer moved it.  A slot
+	   that would straddle the end reserve is bumped to the start of
+	   the next sub-buffer. */
+	do {
+		old_idx = idx(rchan);
+		new_idx = old_idx + slot_len;
+
+		offset = RELAY_BUF_OFFSET_GET(new_idx + end_reserve,
+					      offset_mask);
+		if ((offset < slot_len) && (offset > 0)) {
+			reserved_idx = RELAY_BUF_OFFSET_CLEAR(new_idx 
+				+ end_reserve, offset_mask) + start_reserve;
+			new_idx = reserved_idx + slot_len;
+		} else if (offset < slot_len) {
+			reserved_idx = old_idx;
+			new_idx = RELAY_BUF_OFFSET_CLEAR(new_idx
+			      + end_reserve, offset_mask) + start_reserve;
+		} else
+			reserved_idx = old_idx;
+		get_timestamp(ts, tsc, rchan);
+	} while (!compare_and_store_volatile(&idx(rchan), old_idx, new_idx));
+
+	reserved_idx &= idx_mask;
+
+	/* First-ever write: fire the buffer_start callback for buffer 0. */
+	if (initializing == 1) {
+		cur_write_pos = rchan->buf 
+			+ RELAY_BUF_OFFSET_CLEAR((old_idx & idx_mask),
+						 offset_mask(rchan));
+		rchan->buf_start_time = *ts;
+		rchan->buf_start_tsc = *tsc;
+		rchan->unused_bytes[0] = 0;
+
+		rchan->callbacks->buffer_start(rchan->id, cur_write_pos, 
+			       rchan->buf_id, *ts, *tsc, using_tsc(rchan));
+	}
+
+	if (offset < slot_len) {
+		switch_buffers(rchan, slot_len, offset, ts, tsc, new_idx,
+			       old_idx, 0);
+		*err = RELAY_BUFFER_SWITCH;
+	}
+
+	/* If not using TSC, need to calc time delta */
+	recalc_time_delta(ts, tsc, rchan);
+
+	return rchan->buf + reserved_idx;
+}
+
+/**
+ *	lockless_reserve - reserve a slot in the buffer for an event.
+ *	@rchan: the channel
+ *	@slot_len: the length of the slot to reserve
+ *	@ts: variable that will receive the time the slot was reserved
+ *	@tsc: the timestamp counter associated with time
+ *	@err: receives the result flags
+ *	@interrupting: not used
+ *
+ *	Returns pointer to the beginning of the reserved slot, NULL if error.
+ *
+ *	The err value contains the result flags and is an ORed combination 
+ *	of the following:
+ *
+ *	RELAY_BUFFER_SWITCH_NONE - no buffer switch occurred
+ *	RELAY_EVENT_DISCARD_NONE - event should not be discarded
+ *	RELAY_BUFFER_SWITCH - buffer switch occurred
+ *	RELAY_EVENT_DISCARD - event should be discarded (all buffers are full)
+ *	RELAY_EVENT_TOO_LONG - event won't fit into even an empty buffer
+ *
+ *	NOTE(review): keep the (err, interrupting) parameter order here in
+ *	sync with the extern declaration in relay_lockless.h.
+ */
+inline char * 
+lockless_reserve(struct rchan *rchan,
+		 u32 slot_len,
+		 struct timeval *ts,
+		 u32 *tsc,
+		 int *err,
+		 int *interrupting)
+{
+	u32 old_idx, new_idx, offset;
+	u32 offset_mask = offset_mask(rchan);
+
+	/* Fast path: claim the slot with a compare-and-store as long as
+	   it fits in the current sub-buffer; otherwise drop to the slow
+	   path, which handles buffer switching and discards. */
+	do {
+		old_idx = idx(rchan);
+		new_idx = old_idx + slot_len;
+
+		offset = RELAY_BUF_OFFSET_GET(new_idx + rchan->end_reserve, 
+					      offset_mask);
+		if (offset < slot_len)
+			return lockless_reserve_slow(rchan, slot_len, 
+				     ts, tsc, old_idx, err);
+		get_time_or_tsc(ts, tsc, rchan);
+	} while (!compare_and_store_volatile(&idx(rchan), old_idx, new_idx));
+
+	/* If not using TSC, need to calc time delta */
+	recalc_time_delta(ts, tsc, rchan);
+
+	*err = RELAY_BUFFER_SWITCH_NONE;
+
+	return rchan->buf + (old_idx & idx_mask(rchan));
+}
+
+/**
+ *	lockless_get_offset - get current and max channel offsets
+ *	@rchan: the channel
+ *	@max_offset: if non-NULL, receives the maximum channel offset
+ *
+ *	Returns the current channel offset (0 if the channel has never
+ *	been written to).
+ */
+u32 
+lockless_get_offset(struct rchan *rchan,
+			u32 *max_offset)
+{
+	u32 cur_offset = 0;
+
+	if (max_offset)
+		*max_offset = rchan->buf_size * rchan->n_bufs - 1;
+
+	if (rchan->initialized)
+		cur_offset = idx(rchan) & idx_mask(rchan);
+
+	return cur_offset;
+}
+
+/**
+ *	lockless_reset - reset the channel
+ *	@rchan: the channel
+ *	@init: 1 if this is a first-time channel initialization
+ */
+void lockless_reset(struct rchan *rchan, int init)
+{
+	int i;
+	
+	/* Start first buffer at 0 - (end_reserve + 1) so that it
+	   gets initialized via buffer_start callback as well.
+	   (Relies on unsigned wraparound: the first reserve overflows
+	   past 0 and is routed through the slow path.) */
+	idx(rchan) =  0UL - (rchan->end_reserve + 1);
+	idx_mask(rchan) =
+		(1UL << (bufno_bits(rchan) + offset_bits(rchan))) - 1;
+	/* Buffer 0 starts with only its reserved bytes filled ... */
+	atomic_set(&fill_count(rchan, 0), 
+		   (int)rchan->start_reserve + 
+		   (int)rchan->rchan_start_reserve);
+	/* ... all other buffers are marked completely full until the
+	   writer switches into them. */
+	for (i = 1; i < rchan->n_bufs; i++)
+		atomic_set(&fill_count(rchan, i),
+			   (int)RELAY_BUF_SIZE(offset_bits(rchan)));
+}
+
+/**
+ *	lockless_reset_index - atomically set channel index to the beginning
+ *	@rchan: the channel
+ *	@old_idx: the current index
+ *
+ *	If this fails, it means that something else just logged something
+ *	and therefore we probably no longer want to do this.  It's up to the
+ *	caller anyway...
+ *
+ *	Returns 0 if the index was successfully set, negative otherwise
+ */
+int
+lockless_reset_index(struct rchan *rchan, u32 old_idx)
+{
+	struct timeval ts;
+	u32 tsc;
+	u32 new_idx;
+
+	/* Get a valid timestamp up front: ts/tsc were previously passed
+	   to switch_buffers() uninitialized, where they get stored as
+	   the new buffer's start time. */
+	get_timestamp(&ts, &tsc, rchan);
+
+	if (compare_and_store_volatile(&idx(rchan), old_idx, 0)) {
+		new_idx = rchan->start_reserve;
+		switch_buffers(rchan, 0, 0, &ts, &tsc, new_idx, old_idx, 1);
+		return 0;
+	} else
+		return -1;
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/fs/relayfs/relay_lockless.h b/fs/relayfs/relay_lockless.h
new file mode 100644 (file)
index 0000000..8d4189e
--- /dev/null
@@ -0,0 +1,34 @@
+#ifndef _RELAY_LOCKLESS_H
+#define _RELAY_LOCKLESS_H
+
+/*
+ * Entry points for the lockless relay scheme, defined in
+ * relay_lockless.c.  Note: the last two lockless_reserve() parameters
+ * are (err, interrupting), matching the definition.
+ */
+extern char *
+lockless_reserve(struct rchan *rchan,
+		 u32 slot_len,
+		 struct timeval *time_stamp,
+		 u32 *tsc,
+		 int *err,
+		 int *interrupting);
+
+extern void 
+lockless_commit(struct rchan *rchan,
+		char * from,
+		u32 len, 
+		int deliver, 
+		int interrupting);
+
+extern void 
+lockless_resume(struct rchan *rchan);
+
+extern void 
+lockless_finalize(struct rchan *rchan);
+
+extern u32 
+lockless_get_offset(struct rchan *rchan, u32 *max_offset);
+
+extern void
+lockless_reset(struct rchan *rchan, int init);
+
+extern int
+lockless_reset_index(struct rchan *rchan, u32 old_idx);
+
+#endif /* _RELAY_LOCKLESS_H */
diff --git a/fs/relayfs/resize.c b/fs/relayfs/resize.c
new file mode 100644 (file)
index 0000000..25f00bf
--- /dev/null
@@ -0,0 +1,1091 @@
+/*
+ * RelayFS buffer management and resizing code.
+ *
+ * Copyright (C) 2002, 2003 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
+ * Copyright (C) 1999, 2000, 2001, 2002 - Karim Yaghmour (karim@opersys.com)
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <asm/relay.h>
+#include "resize.h"
+
+/**
+ *	alloc_page_array - alloc array to hold pages, but not pages
+ *	@size: the total size of the memory represented by the page array
+ *	@page_count: the number of pages the array can hold
+ *	@err: 0 on success, negative otherwise
+ *
+ *	Returns a pointer to the zeroed page array if successful, NULL
+ *	otherwise.  The pages themselves are not allocated here.
+ */
+static struct page **
+alloc_page_array(int size, int *page_count, int *err)
+{
+	struct page **array;
+	int n_pages;
+	int array_size;
+
+	*err = 0;
+
+	n_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
+	array_size = n_pages * sizeof(struct page *);
+
+	array = kmalloc(array_size, GFP_KERNEL);
+	if (!array) {
+		*err = -ENOMEM;
+		return NULL;
+	}
+	memset(array, 0, array_size);
+	*page_count = n_pages;
+
+	return array;
+}
+
+/**
+ *	free_page_array - free array to hold pages, but not pages
+ *	@page_array: pointer to the page array
+ *
+ *	Frees only the array itself; the pages it points to must be
+ *	released separately (see depopulate_page_array()).
+ */
+static inline void
+free_page_array(struct page **page_array)
+{
+	kfree(page_array);
+}
+
+/**
+ *	depopulate_page_array - free and unreserve all pages in the array
+ *	@page_array: pointer to the page array
+ *	@page_count: number of pages to free
+ *
+ *	Clears the reserved bit on each page before returning it to the
+ *	page allocator.  The array itself is not freed.
+ */
+static void
+depopulate_page_array(struct page **page_array, int page_count)
+{
+	struct page **p = page_array;
+	struct page **end = page_array + page_count;
+
+	while (p < end) {
+		ClearPageReserved(*p);
+		__free_page(*p);
+		p++;
+	}
+}
+
+/**
+ *	populate_page_array - allocate and reserve pages
+ *	@page_array: pointer to the page array
+ *	@page_count: number of pages to allocate
+ *
+ *	Fills @page_array with @page_count freshly allocated, reserved
+ *	pages.  On failure everything allocated so far is released.
+ *
+ *	Returns 0 if successful, negative otherwise.
+ */
+static int
+populate_page_array(struct page **page_array, int page_count)
+{
+	int n;
+
+	for (n = 0; n < page_count; n++) {
+		struct page *page = alloc_page(GFP_KERNEL);
+
+		if (unlikely(page == NULL)) {
+			/* Undo the n pages allocated so far. */
+			depopulate_page_array(page_array, n);
+			return -ENOMEM;
+		}
+		SetPageReserved(page);
+		page_array[n] = page;
+	}
+
+	return 0;
+}
+
+/**
+ *	alloc_rchan_buf - allocate the initial channel buffer
+ *	@size: total size of the buffer
+ *	@page_array: receives a pointer to the buffer's page array
+ *	@page_count: receives the number of pages allocated
+ *
+ *	Allocates a page array, populates it with reserved pages, maps
+ *	them into a contiguous virtual range and zeroes it.
+ *
+ *	Returns a pointer to the resulting buffer, NULL if unsuccessful
+ *	(in which case *page_array is reset to NULL).
+ */
+void *
+alloc_rchan_buf(unsigned long size, struct page ***page_array, int *page_count)
+{
+	void *mem;
+	int err;
+
+	*page_array = alloc_page_array(size, page_count, &err);
+	if (!*page_array)
+		return NULL;
+
+	err = populate_page_array(*page_array, *page_count);
+	if (err)
+		goto free_array;
+
+	mem = vmap(*page_array, *page_count, GFP_KERNEL, PAGE_KERNEL);
+	if (!mem)
+		goto free_pages;
+
+	memset(mem, 0, size);
+	return mem;
+
+free_pages:
+	depopulate_page_array(*page_array, *page_count);
+free_array:
+	free_page_array(*page_array);
+	*page_array = NULL;
+	return NULL;
+}
+
+/**
+ *	free_rchan_buf - free a channel buffer
+ *	@buf: pointer to the buffer to free
+ *	@page_array: pointer to the buffer's page array
+ *	@page_count: number of pages in page array
+ *
+ *	Reverses alloc_rchan_buf(): unmaps the virtual range, releases
+ *	the pages, then frees the array.
+ */
+void
+free_rchan_buf(void *buf, struct page **page_array, int page_count)
+{
+	vunmap(buf);
+	depopulate_page_array(page_array, page_count);
+	free_page_array(page_array);
+}
+
+/**
+ *	expand_check - check whether the channel needs expanding
+ *	@rchan: the channel
+ *
+ *	If the channel needs expanding, the needs_resize callback is
+ *	called with RELAY_RESIZE_EXPAND and a suggested new sub-buffer
+ *	count (double the current count).  Nothing is returned.
+ */
+void
+expand_check(struct rchan *rchan)
+{
+	u32 active_bufs;
+	u32 new_n_bufs = 0;
+	u32 threshold = rchan->n_bufs * RESIZE_THRESHOLD;
+
+	/* Channels using a caller-supplied initial buffer, channels with
+	   auto-resize disabled, and channels already mid-resize are
+	   never expanded. */
+	if (rchan->init_buf)
+		return;
+
+	if (rchan->resize_min == 0)
+		return;
+
+	if (rchan->resizing || rchan->replace_buffer)
+		return;
+	
+	active_bufs = rchan->bufs_produced - rchan->bufs_consumed + 1;
+
+	if (rchan->resize_max && active_bufs == threshold) {
+		new_n_bufs = rchan->n_bufs * 2;
+	}
+
+	/* Only suggest the expansion if it stays within resize_max. */
+	if (new_n_bufs && (new_n_bufs * rchan->buf_size <= rchan->resize_max))
+		rchan->callbacks->needs_resize(rchan->id,
+					       RELAY_RESIZE_EXPAND,
+					       rchan->buf_size, 
+					       new_n_bufs);
+}
+
+/**
+ *	can_shrink - check whether the channel can shrink
+ *	@rchan: the channel
+ *	@cur_idx: the current channel index
+ *
+ *	Returns the suggested number of sub-buffers for the new
+ *	buffer, 0 if the buffer is not shrinkable.
+ */
+static inline u32
+can_shrink(struct rchan *rchan, u32 cur_idx)
+{
+	u32 active_bufs = rchan->bufs_produced - rchan->bufs_consumed + 1;
+	u32 unconsumed = cur_idx % rchan->buf_size;
+
+	/* Shrinking disabled, or already at/below the minimum size. */
+	if (rchan->resize_min == 0)
+		return 0;
+	if (rchan->resize_min >= rchan->n_bufs * rchan->buf_size)
+		return 0;
+
+	/* Only shrink a quiet channel whose readers are caught up. */
+	if (active_bufs > 1)
+		return 0;
+	if (unconsumed != rchan->bytes_consumed)
+		return 0;
+
+	return rchan->resize_min / rchan->buf_size;
+}
+
+/**
+ *	shrink_check: - timer function checking whether the channel can shrink
+ *	@data: the channel pointer, cast to unsigned long
+ *
+ *	Every SHRINK_TIMER_SECS, check whether the channel is shrinkable.
+ *	If so, we attempt to atomically reset the channel to the beginning.
+ *	The needs_resize callback is then called with RELAY_RESIZE_SHRINK.
+ *	If the reset fails, it means we really shouldn't be shrinking now
+ *	and need to wait until the next time around.
+ */
+static void
+shrink_check(unsigned long data)
+{
+	struct rchan *rchan = (struct rchan *)data;
+	u32 shrink_to_nbufs, cur_idx;
+	
+	/* Re-arm ourselves for the next periodic check. */
+	del_timer(&rchan->shrink_timer);
+	rchan->shrink_timer.expires = jiffies + SHRINK_TIMER_SECS * HZ;
+	add_timer(&rchan->shrink_timer);
+
+	if (rchan->init_buf)
+		return;
+
+	if (rchan->resizing || rchan->replace_buffer)
+		return;
+
+	if (using_lockless(rchan))
+		cur_idx = idx(rchan);
+	else
+		cur_idx = relay_get_offset(rchan, NULL);
+
+	/* Shrink only if the channel index could be atomically reset;
+	   a failed reset means a writer got in first. */
+	shrink_to_nbufs = can_shrink(rchan, cur_idx);
+	if (shrink_to_nbufs != 0 && reset_index(rchan, cur_idx) == 0) {
+		update_readers_consumed(rchan, rchan->bufs_consumed, 0);
+		rchan->callbacks->needs_resize(rchan->id,
+					       RELAY_RESIZE_SHRINK,
+					       rchan->buf_size, 
+					       shrink_to_nbufs);
+	}
+}
+
+/**
+ *	init_shrink_timer: - Start timer used to check shrinkability.
+ *	@rchan: the channel
+ *
+ *	No-op for channels that have auto-shrink disabled
+ *	(resize_min == 0).
+ */
+void
+init_shrink_timer(struct rchan *rchan)
+{
+	if (!rchan->resize_min)
+		return;
+
+	init_timer(&rchan->shrink_timer);
+	rchan->shrink_timer.function = shrink_check;
+	rchan->shrink_timer.data = (unsigned long)rchan;
+	rchan->shrink_timer.expires = jiffies + SHRINK_TIMER_SECS * HZ;
+	add_timer(&rchan->shrink_timer);
+}
+
+
+/**
+ *	alloc_new_pages - allocate new pages for expanding buffer
+ *	@rchan: the channel
+ *
+ *	Allocates and populates the expand_page_array holding the extra
+ *	pages needed to grow the buffer to resize_alloc_size.  Must not
+ *	be called with a pending expand array.
+ *
+ *	Returns 0 on success, negative otherwise (resize_err is also set).
+ */
+static int
+alloc_new_pages(struct rchan *rchan)
+{
+	int new_pages_size, err;
+
+	/* A leftover expand array would be leaked; this is a bug. */
+	BUG_ON(rchan->expand_page_array);
+
+	new_pages_size = rchan->resize_alloc_size - rchan->alloc_size;
+	rchan->expand_page_array = alloc_page_array(new_pages_size,
+					    &rchan->expand_page_count, &err);
+	if (rchan->expand_page_array == NULL) {
+		rchan->resize_err = -ENOMEM;
+		return -ENOMEM;
+	}
+	
+	err = populate_page_array(rchan->expand_page_array,
+				  rchan->expand_page_count);
+	if (err) {
+		rchan->resize_err = -ENOMEM;
+		free_page_array(rchan->expand_page_array);
+		rchan->expand_page_array = NULL;
+	}
+
+	return err;
+}
+
+/**
+ *	clear_resize_offset - helper function for buffer resizing
+ *	@rchan: the channel
+ *
+ *	Reset the saved offset change to "no pending adjustment".
+ */
+static inline void
+clear_resize_offset(struct rchan *rchan)
+{
+	rchan->resize_offset.delta = 0;
+	rchan->resize_offset.ge = 0UL;
+	rchan->resize_offset.le = 0UL;
+}
+
+/**
+ *	save_resize_offset - helper function for buffer resizing
+ *	@rchan: the channel
+ *	@ge: affected region ge this
+ *	@le: affected region le this
+ *	@delta: apply this delta
+ *
+ *	Record a pending offset adjustment for readers whose position
+ *	falls inside [ge, le].
+ */
+static inline void
+save_resize_offset(struct rchan *rchan, u32 ge, u32 le, int delta)
+{
+	rchan->resize_offset.delta = delta;
+	rchan->resize_offset.ge = ge;
+	rchan->resize_offset.le = le;
+}
+
+/**
+ *	update_file_offset - apply offset change to reader
+ *	@reader: the channel reader
+ *
+ *	Returns non-zero if the offset was applied.
+ *
+ *	Apply the channel's saved resize offset delta to the reader's
+ *	current read position, if that position falls inside the
+ *	affected region.
+ */
+static inline int
+update_file_offset(struct rchan_reader *reader)
+{
+	int applied = 0;
+	struct rchan *rchan = reader->rchan;
+	u32 f_pos;
+	int delta = reader->rchan->resize_offset.delta;
+
+	/* VFS readers keep their position in the struct file. */
+	if (reader->vfs_reader)
+		f_pos = (u32)reader->pos.file->f_pos;
+	else
+		f_pos = reader->pos.f_pos;
+
+	/* A reader sitting exactly at the write offset is left alone. */
+	if (f_pos == relay_get_offset(rchan, NULL))
+		return 0;
+
+	/* NOTE(review): the lower bound uses ge - 1, i.e. it is slightly
+	   wider than the saved region - confirm this off-by-one is
+	   intentional. */
+	if ((f_pos >= rchan->resize_offset.ge - 1) &&
+	    (f_pos <= rchan->resize_offset.le)) {
+		if (reader->vfs_reader)
+			reader->pos.file->f_pos += delta;
+		else
+			reader->pos.f_pos += delta;
+		applied = 1;
+	}
+
+	return applied;
+}
+
+/**
+ *	update_file_offsets - apply offset change to readers
+ *	@rchan: the channel
+ *
+ *	Apply the saved offset delta to every file open on the channel,
+ *	flagging each reader whose position was adjusted.
+ */
+static inline void
+update_file_offsets(struct rchan *rchan)
+{
+	struct rchan_reader *reader;
+
+	read_lock(&rchan->open_readers_lock);
+	list_for_each_entry(reader, &rchan->open_readers, list) {
+		if (update_file_offset(reader))
+			reader->offset_changed = 1;
+	}
+	read_unlock(&rchan->open_readers_lock);
+}
+
+/**
+ *	setup_expand_buf - setup expand buffer for replacement
+ *	@rchan: the channel
+ *	@newsize: the size of the new buffer
+ *	@oldsize: the size of the old buffer
+ *	@old_n_bufs: the number of sub-buffers in the old buffer
+ *	             (currently unused by the implementation)
+ *
+ *	Inserts new pages into the old buffer to create a larger
+ *	new channel buffer, splitting them at old_cur_idx, the bottom
+ *	half of the old buffer going to the bottom of the new, likewise
+ *	for the top half.
+ */
+static void
+setup_expand_buf(struct rchan *rchan, int newsize, int oldsize, u32 old_n_bufs)
+{
+	u32 cur_idx;
+	int cur_bufno, delta, i, j;
+	u32 ge, le;
+	int cur_pageno;
+	u32 free_bufs, free_pages;
+	u32 free_pages_in_cur_buf;
+	u32 free_bufs_to_end;
+	u32 cur_pages = rchan->alloc_size >> PAGE_SHIFT;
+	u32 pages_per_buf = cur_pages / rchan->n_bufs;
+	u32 bufs_ready = rchan->bufs_produced - rchan->bufs_consumed;
+
+	/* All three page arrays must already exist. */
+	if (!rchan->resize_page_array || !rchan->expand_page_array ||
+	    !rchan->buf_page_array)
+		return;
+
+	if (bufs_ready >= rchan->n_bufs) {
+		bufs_ready = rchan->n_bufs;
+		free_bufs = 0;
+	} else
+		free_bufs = rchan->n_bufs - bufs_ready - 1;
+
+	cur_idx = relay_get_offset(rchan, NULL);
+	cur_pageno = cur_idx / PAGE_SIZE;
+	cur_bufno = cur_idx / rchan->buf_size;
+
+	/* Determine how many pages past the write position are free and
+	   can stay in front of the newly inserted pages; never split
+	   past the end of the old buffer. */
+	free_pages_in_cur_buf = (pages_per_buf - 1) - (cur_pageno % pages_per_buf);
+	free_pages = free_bufs * pages_per_buf + free_pages_in_cur_buf;
+	free_bufs_to_end = (rchan->n_bufs - 1) - cur_bufno;
+	if (free_bufs >= free_bufs_to_end) {
+		free_pages = free_bufs_to_end * pages_per_buf + free_pages_in_cur_buf;
+		free_bufs = free_bufs_to_end;
+	}
+		
+	/* Splice: old pages up to the split point, then all new pages,
+	   then the remaining old pages. */
+	for (i = 0, j = 0; i <= cur_pageno + free_pages; i++, j++)
+		rchan->resize_page_array[j] = rchan->buf_page_array[i];
+	for (i = 0; i < rchan->expand_page_count; i++, j++)
+		rchan->resize_page_array[j] = rchan->expand_page_array[i];
+	for (i = cur_pageno + free_pages + 1; i < rchan->buf_page_count; i++, j++)
+		rchan->resize_page_array[j] = rchan->buf_page_array[i];
+
+	/* Remember which reader offsets must shift by the inserted size. */
+	delta = newsize - oldsize;
+	ge = (cur_pageno + 1 + free_pages) * PAGE_SIZE;
+	le = oldsize;
+	save_resize_offset(rchan, ge, le, delta);
+
+	rchan->expand_buf_id = rchan->buf_id + 1 + free_bufs;
+}
+
+/**
+ *	setup_shrink_buf - setup shrink buffer for replacement
+ *	@rchan: the channel
+ *
+ *	Builds a smaller new channel buffer by keeping only the leading
+ *	pages of the old buffer; the tail pages were already moved to
+ *	the shrink_page_array for later release.
+ */
+static void
+setup_shrink_buf(struct rchan *rchan)
+{
+	int n_keep_pages;
+	int i;
+
+	if (!rchan->resize_page_array || !rchan->shrink_page_array || 
+	    !rchan->buf_page_array)
+		return;
+
+	n_keep_pages = rchan->resize_alloc_size / PAGE_SIZE;
+	for (i = 0; i < n_keep_pages; i++)
+		rchan->resize_page_array[i] = rchan->buf_page_array[i];
+}
+
+/**
+ *	cleanup_failed_alloc - relaybuf_alloc helper
+ *	@rchan: the channel
+ *
+ *	Releases whichever of the expand/shrink/resize page arrays were
+ *	set up before the allocation failed.  Only the expand array owns
+ *	populated pages; the others hold borrowed page pointers.
+ */
+static void
+cleanup_failed_alloc(struct rchan *rchan)
+{
+	struct page **expand = rchan->expand_page_array;
+	struct page **shrink = rchan->shrink_page_array;
+	struct page **resize = rchan->resize_page_array;
+
+	if (expand) {
+		depopulate_page_array(expand, rchan->expand_page_count);
+		free_page_array(expand);
+		rchan->expand_page_array = NULL;
+		rchan->expand_page_count = 0;
+	} else if (shrink) {
+		free_page_array(shrink);
+		rchan->shrink_page_array = NULL;
+		rchan->shrink_page_count = 0;
+	}
+
+	if (resize) {
+		free_page_array(resize);
+		rchan->resize_page_array = NULL;
+		rchan->resize_page_count = 0;
+	}
+}
+
+/**
+ *	relaybuf_alloc - allocate a new resized channel buffer
+ *	@private: pointer to the channel struct
+ *
+ *	Internal - manages the allocation and remapping of new channel
+ *	buffers.  On success sets replace_buffer and notifies the client
+ *	via needs_resize(RELAY_RESIZE_REPLACE); on failure sets
+ *	resize_err = -ENOMEM and releases any partial allocations.
+ */
+static void 
+relaybuf_alloc(void *private)
+{
+	struct rchan *rchan = (struct rchan *)private;
+	int i, j, err;
+	int free_size;
+	int free_start_page, free_end_page;
+	u32 newsize, oldsize;
+
+	if (rchan->resize_alloc_size > rchan->alloc_size) {
+		/* Expanding: allocate the extra pages. */
+		err = alloc_new_pages(rchan);
+		if (err) goto cleanup;
+	} else {
+		/* Shrinking: stash the tail pages for later release. */
+		free_size = rchan->alloc_size - rchan->resize_alloc_size;
+		BUG_ON(free_size <= 0);
+		rchan->shrink_page_array = alloc_page_array(free_size,
+					    &rchan->shrink_page_count, &err);
+		if (rchan->shrink_page_array == NULL)
+			goto cleanup;
+		free_start_page = rchan->resize_alloc_size / PAGE_SIZE;
+		free_end_page = rchan->alloc_size / PAGE_SIZE;
+		for (i = 0, j = free_start_page; j < free_end_page; i++, j++)
+			rchan->shrink_page_array[i] = rchan->buf_page_array[j];
+	}
+
+	rchan->resize_page_array = alloc_page_array(rchan->resize_alloc_size,
+					    &rchan->resize_page_count, &err);
+	if (rchan->resize_page_array == NULL)
+		goto cleanup;
+
+	clear_resize_offset(rchan);
+	newsize = rchan->resize_alloc_size;
+	oldsize = rchan->alloc_size;
+	if (newsize > oldsize)
+		setup_expand_buf(rchan, newsize, oldsize, rchan->n_bufs);
+	else
+		setup_shrink_buf(rchan);
+
+	rchan->resize_buf = vmap(rchan->resize_page_array, rchan->resize_page_count, GFP_KERNEL, PAGE_KERNEL);
+
+	if (rchan->resize_buf == NULL)
+		goto cleanup;
+
+	rchan->replace_buffer = 1;
+	rchan->resizing = 0;
+
+	rchan->callbacks->needs_resize(rchan->id, RELAY_RESIZE_REPLACE, 0, 0);
+	return;
+
+cleanup:
+	cleanup_failed_alloc(rchan);
+	rchan->resize_err = -ENOMEM;
+	return;
+}
+
+/**
+ *	relaybuf_free - free a resized channel buffer
+ *	@private: pointer to the free_rchan_buf descriptor
+ *
+ *	Internal - manages the de-allocation and unmapping of old channel
+ *	buffers.  Frees the descriptor itself when done.
+ */
+static void
+relaybuf_free(void *private)
+{
+	struct free_rchan_buf *free_buf = private;
+	int i;
+
+	if (free_buf->unmap_buf)
+		vunmap(free_buf->unmap_buf);
+
+	for (i = 0; i < 3; i++) {
+		struct page **array = free_buf->page_array[i].array;
+		int count = free_buf->page_array[i].count;
+
+		if (!array)
+			continue;
+		/* A zero count means the pages are owned elsewhere. */
+		if (count)
+			depopulate_page_array(array, count);
+		free_page_array(array);
+	}
+
+	kfree(free_buf);
+}
+
+/**
+ *	calc_order - determine the power-of-2 order of a resize
+ *	@high: the larger size
+ *	@low: the smaller size
+ *
+ *	Returns the order (number of doublings from low to high),
+ *	0 if either argument is 0 or high <= low.
+ */
+static inline int
+calc_order(u32 high, u32 low)
+{
+	int order;
+
+	if (!high || !low || high <= low)
+		return 0;
+
+	for (order = 0; high > low; order++)
+		high >>= 1;
+
+	return order;
+}
+
+/**
+ *	check_size - check the sanity of the requested channel size
+ *	@rchan: the channel
+ *	@nbufs: the new number of sub-buffers
+ *	@err: return code
+ *
+ *	Returns the non-zero total buffer size if ok, otherwise 0 and
+ *	sets errcode if not.
+ *
+ *	Side effect: rchan->resize_order is updated (positive when
+ *	expanding, negative when shrinking), even on some error paths.
+ */
+static inline u32
+check_size(struct rchan *rchan, u32 nbufs, int *err)
+{
+	u32 new_channel_size = 0;
+
+	*err = 0;
+	
+	if (nbufs > rchan->n_bufs) {
+		/* Expanding: must be a power-of-2 multiple of n_bufs
+		   and stay within resize_max. */
+		rchan->resize_order = calc_order(nbufs, rchan->n_bufs);
+		if (!rchan->resize_order) {
+			*err = -EINVAL;
+			goto out;
+		}
+
+		new_channel_size = rchan->buf_size * nbufs;
+		if (new_channel_size > rchan->resize_max) {
+			*err = -EINVAL;
+			goto out;
+		}
+	} else if (nbufs < rchan->n_bufs) {
+		/* Shrinking: must stay above resize_min, and a
+		   single-buffer channel can't shrink. */
+		if (rchan->n_bufs < 2) {
+			*err = -EINVAL;
+			goto out;
+		}
+		rchan->resize_order = -calc_order(rchan->n_bufs, nbufs);
+		if (!rchan->resize_order) {
+			*err = -EINVAL;
+			goto out;
+		}
+		
+		new_channel_size = rchan->buf_size * nbufs;
+		if (new_channel_size < rchan->resize_min) {
+			*err = -EINVAL;
+			goto out;
+		}
+	} else
+		*err = -EINVAL;
+out:
+	return new_channel_size;
+}
+
+/**
+ *     __relay_realloc_buffer - allocate a new channel buffer
+ *     @rchan: the channel
+ *     @new_nbufs: the new number of sub-buffers
+ *     @async: do the allocation using a work queue
+ *
+ *     Internal - see relay_realloc_buffer() for details.
+ *
+ *     Returns 0 on success, negative errcode otherwise.
+ */
+static int
+__relay_realloc_buffer(struct rchan *rchan, u32 new_nbufs, int async)
+{
+       u32 new_channel_size;
+       int err = 0;
+       
+       if (new_nbufs == rchan->n_bufs)
+               return -EINVAL;
+               
+       if (down_trylock(&rchan->resize_sem))
+               return -EBUSY;
+
+       if (rchan->init_buf) {
+               err = -EPERM;
+               goto out;
+       }
+
+       if (rchan->replace_buffer) {
+               err = -EBUSY;
+               goto out;
+       }
+
+       if (rchan->resizing) {
+               err = -EBUSY;
+               goto out;
+       }
+
+       if (rchan->resize_failures > MAX_RESIZE_FAILURES) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       new_channel_size = check_size(rchan, new_nbufs, &err);
+       if (err)
+               goto out;
+       
+       /* Mark the channel as resizing only after all validation has
+        * passed.  (Setting it before the failure/size checks, as the
+        * code previously did, left the flag stuck at 1 on any error,
+        * making every later resize attempt fail with -EBUSY.)  The
+        * flag is cleared by relaybuf_alloc() when the allocation
+        * completes; the resize_sem is held throughout, so no window
+        * is opened by doing this last. */
+       rchan->resizing = 1;
+
+       rchan->resize_n_bufs = new_nbufs;
+       rchan->resize_buf_size = rchan->buf_size;
+       rchan->resize_alloc_size = FIX_SIZE(new_channel_size);
+       
+       if (async) {
+               INIT_WORK(&rchan->work, relaybuf_alloc, rchan);
+               schedule_delayed_work(&rchan->work, 1);
+       } else
+               relaybuf_alloc((void *)rchan);
+out:
+       up(&rchan->resize_sem);
+       
+       return err;
+}
+
+/**
+ *     relay_realloc_buffer - allocate a new channel buffer
+ *     @rchan_id: the channel id
+ *     @new_nbufs: the new number of sub-buffers
+ *     @async: if non-zero, do the allocation in the background
+ *
+ *     Allocates a new channel buffer using the current sub-buffer size
+ *     and the specified sub-buffer count.  If @async is non-zero, the
+ *     allocation is done in the background using a work queue.  When the
+ *     allocation has completed, the needs_resize() callback is called
+ *     with a resize_type of RELAY_RESIZE_REPLACE.  This function doesn't
+ *     replace the old buffer with the new - see relay_replace_buffer().
+ *     See Documentation/filesystems/relayfs.txt for more details.
+ *
+ *     Returns 0 on success, or errcode if the channel is busy or if
+ *     the allocation couldn't happen for some reason.
+ */
+int
+relay_realloc_buffer(int rchan_id, u32 new_nbufs, int async)
+{
+       int err;
+       
+       struct rchan *rchan;
+
+       rchan = rchan_get(rchan_id);
+       if (rchan == NULL)
+               return -EBADF;
+
+       err = __relay_realloc_buffer(rchan, new_nbufs, async);
+       
+       rchan_put(rchan);
+
+       return err;
+}
+
+/**
+ *     expand_cancel_check - check whether the current expand needs canceling
+ *     @rchan: the channel
+ *
+ *     Returns 1 if the expand should be canceled, 0 otherwise.
+ */
+static int
+expand_cancel_check(struct rchan *rchan)
+{
+       return rchan->buf_id >= rchan->expand_buf_id ? 1 : 0;
+}
+
+/**
+ *     shrink_cancel_check - check whether the current shrink needs canceling
+ *     @rchan: the channel
+ *     @newsize: the new (smaller) channel buffer size
+ *
+ *     Returns 1 if the shrink should be canceled, 0 otherwise.
+ */
+static int
+shrink_cancel_check(struct rchan *rchan, u32 newsize)
+{
+       u32 in_flight_bufs = rchan->bufs_produced - rchan->bufs_consumed + 1;
+       u32 write_offset = relay_get_offset(rchan, NULL);
+
+       /* Cancel if the write position no longer fits in the new size,
+        * or if more than one sub-buffer is still awaiting consumption. */
+       if (write_offset >= newsize || in_flight_bufs > 1)
+               return 1;
+
+       return 0;
+}
+
+/**
+ *     switch_rchan_buf - do_replace_buffer helper
+ *     @rchan: the channel
+ *     @newsize: the new channel buffer size
+ *     @oldsize: the old channel buffer size
+ *     @old_nbufs: the old number of sub-buffers
+ *     @cur_idx: the current write offset within the channel buffer
+ *
+ *     Points the channel at its already-allocated resize buffer and
+ *     fixes up the per-sub-buffer bookkeeping and write state to match.
+ */
+static void
+switch_rchan_buf(struct rchan *rchan,
+                int newsize,
+                int oldsize,
+                u32 old_nbufs,
+                u32 cur_idx)
+{
+       u32 newbufs, cur_bufno;
+       int i;
+
+       cur_bufno = cur_idx / rchan->buf_size;
+
+       /* Switch to the buffer set up by the preceding resize. */
+       rchan->buf = rchan->resize_buf;
+       rchan->alloc_size = rchan->resize_alloc_size;
+       rchan->n_bufs = rchan->resize_n_bufs;
+
+       if (newsize > oldsize) {
+               /* Expand: per-sub-buffer state from sub-buffer
+                * (resize_offset.ge / buf_size) onward shifts up by the
+                * number of newly inserted sub-buffers. */
+               u32 ge = rchan->resize_offset.ge;
+               u32 moved_buf = ge / rchan->buf_size;
+
+               newbufs = (newsize - oldsize) / rchan->buf_size;
+               for (i = moved_buf; i < old_nbufs; i++) {
+                       if (using_lockless(rchan))
+                               atomic_set(&fill_count(rchan, i + newbufs), 
+                                          atomic_read(&fill_count(rchan, i)));
+                       rchan->unused_bytes[i + newbufs] = rchan->unused_bytes[i];
+               }
+               /* Initialize bookkeeping for the inserted sub-buffers. */
+               for (i = moved_buf; i < moved_buf + newbufs; i++) {
+                       if (using_lockless(rchan))
+                               atomic_set(&fill_count(rchan, i),
+                                          (int)RELAY_BUF_SIZE(offset_bits(rchan)));
+                       rchan->unused_bytes[i] = 0;
+               }
+       }
+
+       rchan->buf_idx = cur_bufno;
+
+       if (!using_lockless(rchan)) {
+               /* Locking scheme: recompute the raw write pointers
+                * against the new buffer base. */
+               cur_write_pos(rchan) = rchan->buf + cur_idx;
+               write_buf(rchan) = rchan->buf + cur_bufno * rchan->buf_size;
+               write_buf_end(rchan) = write_buf(rchan) + rchan->buf_size;
+               write_limit(rchan) = write_buf_end(rchan) - rchan->end_reserve;
+       } else {
+               /* Lockless scheme: adjust the sub-buffer-number bit
+                * width and index mask (resize_order is negative for a
+                * shrink, so bufno_bits may decrease). */
+               idx(rchan) &= idx_mask(rchan);
+               bufno_bits(rchan) += rchan->resize_order;
+               idx_mask(rchan) =
+                       (1UL << (bufno_bits(rchan) + offset_bits(rchan))) - 1;
+       }
+}
+
+/**
+ *     do_replace_buffer - does the work of channel buffer replacement
+ *     @rchan: the channel
+ *     @newsize: new channel buffer size
+ *     @oldsize: old channel buffer size
+ *     @old_nbufs: old channel sub-buffer count
+ *
+ *     Returns 0 if replacement happened, -EAGAIN if canceled
+ *
+ *     Does the work of switching buffers and fixing everything up
+ *     so the channel can continue with a new size.
+ *
+ *     Note: the needs_resize() callback fires with RELAY_RESIZE_REPLACED
+ *     in both the replaced and the canceled case.
+ */
+static int
+do_replace_buffer(struct rchan *rchan,
+                 int newsize,
+                 int oldsize,
+                 u32 old_nbufs)
+{
+       u32 cur_idx;
+       int err = 0;
+       int canceled;
+
+       cur_idx = relay_get_offset(rchan, NULL);
+
+       /* The channel may have advanced past the point where the
+        * prepared resize is still applicable; if so, bail out. */
+       if (newsize > oldsize)
+               canceled = expand_cancel_check(rchan);
+       else
+               canceled = shrink_cancel_check(rchan, newsize);
+
+       if (canceled) {
+               err = -EAGAIN;
+               goto out;
+       }
+
+       switch_rchan_buf(rchan, newsize, oldsize, old_nbufs, cur_idx);
+
+       if (rchan->resize_offset.delta)
+               update_file_offsets(rchan);
+
+       atomic_set(&rchan->suspended, 0);
+
+       /* Hand the old page array off for deferred freeing and clear
+        * all resize state - the resize is now fully applied. */
+       rchan->old_buf_page_array = rchan->buf_page_array;
+       rchan->buf_page_array = rchan->resize_page_array;
+       rchan->buf_page_count = rchan->resize_page_count;
+       rchan->resize_page_array = NULL;
+       rchan->resize_page_count = 0;
+       rchan->resize_buf = NULL;
+       rchan->resize_buf_size = 0;
+       rchan->resize_alloc_size = 0;
+       rchan->resize_n_bufs = 0;
+       rchan->resize_err = 0;
+       rchan->resize_order = 0;
+out:
+       rchan->callbacks->needs_resize(rchan->id,
+                                      RELAY_RESIZE_REPLACED,
+                                      rchan->buf_size,
+                                      rchan->n_bufs);
+       return err;
+}
+
+/**
+ *     add_free_page_array - add a page_array to be freed
+ *     @free_rchan_buf: the free_rchan_buf struct
+ *     @page_array: the page array to free
+ *     @page_count: the number of pages to free, 0 to free the array only
+ *
+ *     Internal - queues a page array in the next free slot of
+ *     @free_rchan_buf, to be released later by relaybuf_free().
+ */
+static inline void
+add_free_page_array(struct free_rchan_buf *free_rchan_buf,
+                   struct page **page_array, int page_count)
+{
+       free_rchan_buf->page_array[free_rchan_buf->cur].array = page_array;
+       free_rchan_buf->page_array[free_rchan_buf->cur].count = page_count;
+       free_rchan_buf->cur++;
+}
+
+/**
+ *     free_replaced_buffer - free a channel's old buffer
+ *     @rchan: the channel
+ *     @oldbuf: the old buffer
+ *     @oldsize: old buffer size (currently unused)
+ *
+ *     Frees a channel buffer via work queue.  Returns 0 on success,
+ *     -ENOMEM if the deferred-free descriptor couldn't be allocated.
+ */
+static int
+free_replaced_buffer(struct rchan *rchan, char *oldbuf, int oldsize)
+{
+       struct free_rchan_buf *free_buf;
+
+       /* GFP_ATOMIC: NOTE(review) - presumably callable from atomic
+        * context; confirm against callers. */
+       free_buf = kmalloc(sizeof(struct free_rchan_buf), GFP_ATOMIC);
+       if (!free_buf)
+               return -ENOMEM;
+       memset(free_buf, 0, sizeof(struct free_rchan_buf));
+
+       /* Queue the old mapping, its page array, and any leftover
+        * expand/shrink arrays; only the shrink pages themselves are
+        * depopulated (non-zero count). */
+       free_buf->unmap_buf = oldbuf;
+       add_free_page_array(free_buf, rchan->old_buf_page_array, 0);
+       rchan->old_buf_page_array = NULL;
+       add_free_page_array(free_buf, rchan->expand_page_array, 0);
+       add_free_page_array(free_buf, rchan->shrink_page_array, rchan->shrink_page_count);
+
+       rchan->expand_page_array = NULL;
+       rchan->expand_page_count = 0;
+       rchan->shrink_page_array = NULL;
+       rchan->shrink_page_count = 0;
+
+       INIT_WORK(&free_buf->work, relaybuf_free, free_buf);
+       schedule_delayed_work(&free_buf->work, 1);
+
+       return 0;
+}
+
+/**
+ *     free_canceled_resize - free buffers allocated for a canceled resize
+ *     @rchan: the channel
+ *
+ *     Frees canceled buffers via work queue.  Returns 0 on success,
+ *     -ENOMEM if the deferred-free descriptor couldn't be allocated.
+ */
+static int
+free_canceled_resize(struct rchan *rchan)
+{
+       struct free_rchan_buf *free_buf;
+
+       free_buf = kmalloc(sizeof(struct free_rchan_buf), GFP_ATOMIC);
+       if (!free_buf)
+               return -ENOMEM;
+       memset(free_buf, 0, sizeof(struct free_rchan_buf));
+
+       /* Canceled expand: the extra pages were newly allocated, so
+        * both the pages and their array go.  Canceled shrink: the
+        * pages still belong to the live buffer, so free the array only. */
+       if (rchan->resize_alloc_size > rchan->alloc_size)
+               add_free_page_array(free_buf, rchan->expand_page_array, rchan->expand_page_count);
+       else
+               add_free_page_array(free_buf, rchan->shrink_page_array, 0);
+       
+       add_free_page_array(free_buf, rchan->resize_page_array, 0);
+       free_buf->unmap_buf = rchan->resize_buf;
+
+       rchan->expand_page_array = NULL;
+       rchan->expand_page_count = 0;
+       rchan->shrink_page_array = NULL;
+       rchan->shrink_page_count = 0;
+       rchan->resize_page_array = NULL;
+       rchan->resize_page_count = 0;
+       rchan->resize_buf = NULL;
+
+       INIT_WORK(&free_buf->work, relaybuf_free, free_buf);
+       schedule_delayed_work(&free_buf->work, 1);
+
+       return 0;
+}
+
+/**
+ *     __relay_replace_buffer - replace channel buffer with new buffer
+ *     @rchan: the channel
+ *
+ *     Internal - see relay_replace_buffer() for details.
+ *
+ *     Returns 0 if successful, negative otherwise.
+ *
+ *     Note: if no replacement is pending (replace_buffer == 0) this
+ *     silently returns 0.  The replace_buffer flag is cleared on every
+ *     exit path, including errors.
+ */
+static int
+__relay_replace_buffer(struct rchan *rchan)
+{
+       int oldsize;
+       int err = 0;
+       char *oldbuf;
+       
+       if (down_trylock(&rchan->resize_sem))
+               return -EBUSY;
+
+       /* Channels on a caller-supplied init buffer can't be resized. */
+       if (rchan->init_buf) {
+               err = -EPERM;
+               goto out;
+       }
+
+       if (!rchan->replace_buffer)
+               goto out;
+
+       if (rchan->resizing) {
+               err = -EBUSY;
+               goto out;
+       }
+
+       if (rchan->resize_buf == NULL) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       oldbuf = rchan->buf;
+       oldsize = rchan->alloc_size;
+
+       /* Swap in the prepared buffer; free whichever set of pages is
+        * now unused (the old buffer, or the canceled resize buffer). */
+       err = do_replace_buffer(rchan, rchan->resize_alloc_size,
+                               oldsize, rchan->n_bufs);
+       if (err == 0)
+               err = free_replaced_buffer(rchan, oldbuf, oldsize);
+       else
+               err = free_canceled_resize(rchan);
+out:
+       rchan->replace_buffer = 0;
+       up(&rchan->resize_sem);
+       
+       return err;
+}
+
+/**
+ *     relay_replace_buffer - replace channel buffer with new buffer
+ *     @rchan_id: the channel id
+ *
+ *     Replaces the current channel buffer with the new buffer allocated
+ *     by relay_alloc_buffer and contained in the channel struct.  When the
+ *     replacement is complete, the needs_resize() callback is called with
+ *     RELAY_RESIZE_REPLACED.
+ *
+ *     Returns 0 on success, or errcode if the channel is busy or if
+ *     the replacement or previous allocation didn't happen for some reason.
+ */
+int
+relay_replace_buffer(int rchan_id)
+{
+       struct rchan *rchan = rchan_get(rchan_id);
+       int err;
+
+       if (!rchan)
+               return -EBADF;
+
+       err = __relay_replace_buffer(rchan);
+       rchan_put(rchan);
+
+       return err;
+}
+
+EXPORT_SYMBOL(relay_realloc_buffer);
+EXPORT_SYMBOL(relay_replace_buffer);
+
diff --git a/fs/relayfs/resize.h b/fs/relayfs/resize.h
new file mode 100644 (file)
index 0000000..6f06d22
--- /dev/null
@@ -0,0 +1,51 @@
+#ifndef _RELAY_RESIZE_H
+#define _RELAY_RESIZE_H
+
+/* 
+ * If the channel usage has been below the low water mark for more than
+ * this amount of time, we can shrink the buffer if necessary.
+ */
+#define SHRINK_TIMER_SECS      60
+
+/* Round a byte count up to a whole number of pages.  (Inspired by
+ * rtai/shmem.)  Fully parenthesized so the result associates
+ * correctly inside any enclosing expression. */
+#define FIX_SIZE(x)    ((((x) - 1) & PAGE_MASK) + PAGE_SIZE)
+
+/* Don't attempt resizing again after this many failures */
+#define MAX_RESIZE_FAILURES    1
+
+/* Trigger resizing if a resizable channel is this full.  Deliberately
+ * NOT parenthesized: it must be used as a trailing factor, e.g.
+ * "size * RESIZE_THRESHOLD", so the multiply precedes the integer
+ * divide ((3 / 4) alone would truncate to 0). */
+#define RESIZE_THRESHOLD       3 / 4
+
+/*
+ * Used for deferring resized channel free
+ */
+struct free_rchan_buf
+{
+       char *unmap_buf;                /* vmap'ed buffer to vunmap(), if any */
+       struct 
+       {
+               struct page **array;    /* page-pointer array to free */
+               int count;              /* #pages to free; 0 = array only */
+       } page_array[3];
+       
+       int cur;                        /* next free slot in page_array */
+       struct work_struct work;        /* resize de-allocation work struct */
+};
+
+extern void *
+alloc_rchan_buf(unsigned long size,
+               struct page ***page_array,
+               int *page_count);
+
+extern void
+free_rchan_buf(void *buf,
+              struct page **page_array,
+              int page_count);
+
+extern void
+expand_check(struct rchan *rchan);
+
+extern void
+init_shrink_timer(struct rchan *rchan);
+
+#endif /* _RELAY_RESIZE_H */
diff --git a/include/asm-alpha/relay.h b/include/asm-alpha/relay.h
new file mode 100644 (file)
index 0000000..104091f
--- /dev/null
@@ -0,0 +1,5 @@
+#ifndef _ASM_ALPHA_RELAY_H
+#define _ASM_ALPHA_RELAY_H
+
+#include <asm-generic/relay.h>
+#endif
diff --git a/include/asm-arm/relay.h b/include/asm-arm/relay.h
new file mode 100644 (file)
index 0000000..f9913f1
--- /dev/null
@@ -0,0 +1,5 @@
+#ifndef _ASM_ARM_RELAY_H
+#define _ASM_ARM_RELAY_H
+
+#include <asm-generic/relay.h>
+#endif
diff --git a/include/asm-arm26/relay.h b/include/asm-arm26/relay.h
new file mode 100644 (file)
index 0000000..f9913f1
--- /dev/null
@@ -0,0 +1,5 @@
+/* Guard renamed from _ASM_ARM_RELAY_H (copied from asm-arm) so the
+ * arm26 header has a unique include guard. */
+#ifndef _ASM_ARM26_RELAY_H
+#define _ASM_ARM26_RELAY_H
+
+#include <asm-generic/relay.h>
+#endif
diff --git a/include/asm-cris/relay.h b/include/asm-cris/relay.h
new file mode 100644 (file)
index 0000000..30ee42c
--- /dev/null
@@ -0,0 +1,5 @@
+#ifndef _ASM_CRIS_RELAY_H
+#define _ASM_CRIS_RELAY_H
+
+#include <asm-generic/relay.h>
+#endif
diff --git a/include/asm-generic/relay.h b/include/asm-generic/relay.h
new file mode 100644 (file)
index 0000000..c6d8dea
--- /dev/null
@@ -0,0 +1,76 @@
+#ifndef _ASM_GENERIC_RELAY_H
+#define _ASM_GENERIC_RELAY_H
+/*
+ * linux/include/asm-generic/relay.h
+ *
+ * Copyright (C) 2002, 2003 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
+ * Copyright (C) 2002 - Karim Yaghmour (karim@opersys.com)
+ *
+ * Architecture-independent definitions for relayfs
+ */
+
+#include <linux/relayfs_fs.h>
+
+/**
+ *     get_time_delta - utility function for getting time delta
+ *     @now: pointer to a timeval struct that may be given current time
+ *     @rchan: the channel
+ *
+ *     Returns the time difference between the current time and the buffer
+ *     start time.  In this generic fallback @now is always filled in via
+ *     do_gettimeofday().
+ */
+static inline u32
+get_time_delta(struct timeval *now, struct rchan *rchan)
+{
+       u32 time_delta;
+
+       do_gettimeofday(now);
+       time_delta = calc_time_delta(now, &rchan->buf_start_time);
+
+       return time_delta;
+}
+
+/**
+ *     get_timestamp - utility function for getting a time and TSC pair
+ *     @now: current time
+ *     @tsc: the TSC associated with now
+ *     @rchan: the channel
+ *
+ *     Sets the value pointed to by now to the current time. Value pointed to
+ *     by tsc is not set since there is no generic TSC support.
+ */
+static inline void 
+get_timestamp(struct timeval *now, 
+             u32 *tsc,
+             struct rchan *rchan)
+{
+       /* Generic fallback: only the wall-clock time is captured. */
+       do_gettimeofday(now);
+}
+
+/**
+ *     get_time_or_tsc: - Utility function for getting a time or a TSC.
+ *     @now: current time
+ *     @tsc: current TSC
+ *     @rchan: the channel
+ *
+ *     Sets the value pointed to by now to the current time; @tsc is
+ *     left untouched (no generic TSC support).
+ */
+static inline void 
+get_time_or_tsc(struct timeval *now, 
+               u32 *tsc,
+               struct rchan *rchan)
+{
+       do_gettimeofday(now);
+}
+
+/**
+ *     have_tsc - does this platform have a useable TSC?
+ *
+ *     Returns 0.
+ */
+static inline int 
+have_tsc(void)
+{
+       /* Generic fallback: no architecture-specific TSC available. */
+       return 0;
+}
+#endif
diff --git a/include/asm-h8300/relay.h b/include/asm-h8300/relay.h
new file mode 100644 (file)
index 0000000..34ebfdd
--- /dev/null
@@ -0,0 +1,5 @@
+#ifndef _ASM_H8300_RELAY_H
+#define _ASM_H8300_RELAY_H
+
+#include <asm-generic/relay.h>
+#endif
diff --git a/include/asm-i386/relay.h b/include/asm-i386/relay.h
new file mode 100644 (file)
index 0000000..98e5b72
--- /dev/null
@@ -0,0 +1,101 @@
+#ifndef _ASM_I386_RELAY_H
+#define _ASM_I386_RELAY_H
+/*
+ * linux/include/asm-i386/relay.h
+ *
+ * Copyright (C) 2002, 2003 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
+ * Copyright (C) 2002 - Karim Yaghmour (karim@opersys.com)
+ *
+ * i386 definitions for relayfs
+ */
+
+#include <linux/relayfs_fs.h>
+
+#ifdef CONFIG_X86_TSC
+#include <asm/msr.h>
+
+/**
+ *     get_time_delta - utility function for getting time delta
+ *     @now: pointer to a timeval struct that may be given current time
+ *     @rchan: the channel
+ *
+ *     Returns either the TSC if TSCs are being used, or the time and the
+ *     time difference between the current time and the buffer start time 
+ *     if TSCs are not being used.
+ *
+ *     NOTE(review): when the TSC branch is taken, @now is NOT filled in;
+ *     callers must not rely on it in that case.
+ */
+static inline u32
+get_time_delta(struct timeval *now, struct rchan *rchan)
+{
+       u32 time_delta;
+
+       if ((using_tsc(rchan) == 1) && cpu_has_tsc)
+               rdtscl(time_delta);     /* low 32 bits of the TSC */
+       else {
+               do_gettimeofday(now);
+               time_delta = calc_time_delta(now, &rchan->buf_start_time);
+       }
+
+       return time_delta;
+}
+
+/**
+ *     get_timestamp - utility function for getting a time and TSC pair
+ *     @now: current time
+ *     @tsc: the TSC associated with now
+ *     @rchan: the channel
+ *
+ *     Sets the value pointed to by now to the current time and the value
+ *     pointed to by tsc to the tsc associated with that time, if the 
+ *     platform supports TSC.  @tsc is left untouched when the channel
+ *     isn't using TSCs or the CPU lacks one.
+ */
+static inline void 
+get_timestamp(struct timeval *now,
+             u32 *tsc,
+             struct rchan *rchan)
+{
+       do_gettimeofday(now);
+
+       if ((using_tsc(rchan) == 1) && cpu_has_tsc)
+               rdtscl(*tsc);
+}
+
+/**
+ *     get_time_or_tsc - utility function for getting a time or a TSC
+ *     @now: current time
+ *     @tsc: current TSC
+ *     @rchan: the channel
+ *
+ *     Sets the value pointed to by now to the current time or the value
+ *     pointed to by tsc to the current tsc, depending on whether we're
+ *     using TSCs or not.  Exactly one of the two outputs is written.
+ */
+static inline void 
+get_time_or_tsc(struct timeval *now,
+               u32 *tsc,
+               struct rchan *rchan)
+{
+       if ((using_tsc(rchan) == 1) && cpu_has_tsc)
+               rdtscl(*tsc);
+       else
+               do_gettimeofday(now);
+}
+
+/**
+ *     have_tsc - does this platform have a useable TSC?
+ *
+ *     Returns 1 if this platform has a useable TSC counter for
+ *     timestamping purposes, 0 otherwise.
+ */
+static inline int
+have_tsc(void)
+{
+       return cpu_has_tsc ? 1 : 0;
+}
+
+#else /* No TSC support (#ifdef CONFIG_X86_TSC) */
+#include <asm-generic/relay.h>
+#endif /* #ifdef CONFIG_X86_TSC */
+#endif
diff --git a/include/asm-ia64/relay.h b/include/asm-ia64/relay.h
new file mode 100644 (file)
index 0000000..1d7628e
--- /dev/null
@@ -0,0 +1,5 @@
+#ifndef _ASM_IA64_RELAY_H
+#define _ASM_IA64_RELAY_H
+
+#include <asm-generic/relay.h>
+#endif
diff --git a/include/asm-m68k/relay.h b/include/asm-m68k/relay.h
new file mode 100644 (file)
index 0000000..ec637ff
--- /dev/null
@@ -0,0 +1,5 @@
+#ifndef _ASM_M68K_RELAY_H
+#define _ASM_M68K_RELAY_H
+
+#include <asm-generic/relay.h>
+#endif
diff --git a/include/asm-m68knommu/relay.h b/include/asm-m68knommu/relay.h
new file mode 100644 (file)
index 0000000..ef1afa6
--- /dev/null
@@ -0,0 +1,5 @@
+#ifndef _ASM_M68KNOMMU_RELAY_H
+#define _ASM_M68KNOMMU_RELAY_H
+
+#include <asm-generic/relay.h>
+#endif
diff --git a/include/asm-mips/relay.h b/include/asm-mips/relay.h
new file mode 100644 (file)
index 0000000..37304bd
--- /dev/null
@@ -0,0 +1,5 @@
+#ifndef _ASM_RELAY_H
+#define _ASM_RELAY_H
+
+#include <asm-generic/relay.h>
+#endif
diff --git a/include/asm-mips64/relay.h b/include/asm-mips64/relay.h
new file mode 100644 (file)
index 0000000..37304bd
--- /dev/null
@@ -0,0 +1,5 @@
+#ifndef _ASM_RELAY_H
+#define _ASM_RELAY_H
+
+#include <asm-generic/relay.h>
+#endif
diff --git a/include/asm-parisc/relay.h b/include/asm-parisc/relay.h
new file mode 100644 (file)
index 0000000..cea0c77
--- /dev/null
@@ -0,0 +1,5 @@
+#ifndef _ASM_PARISC_RELAY_H
+#define _ASM_PARISC_RELAY_H
+
+#include <asm-generic/relay.h>
+#endif
diff --git a/include/asm-ppc/relay.h b/include/asm-ppc/relay.h
new file mode 100644 (file)
index 0000000..c5b8526
--- /dev/null
@@ -0,0 +1,5 @@
+#ifndef _ASM_PPC_RELAY_H
+#define _ASM_PPC_RELAY_H
+
+#include <asm-generic/relay.h>
+#endif
diff --git a/include/asm-ppc64/relay.h b/include/asm-ppc64/relay.h
new file mode 100644 (file)
index 0000000..3c428ef
--- /dev/null
@@ -0,0 +1,5 @@
+#ifndef _ASM_PPC64_RELAY_H
+#define _ASM_PPC64_RELAY_H
+
+#include <asm-generic/relay.h>
+#endif
diff --git a/include/asm-s390/relay.h b/include/asm-s390/relay.h
new file mode 100644 (file)
index 0000000..502eb3b
--- /dev/null
@@ -0,0 +1,5 @@
+#ifndef _ASM_S390_RELAY_H
+#define _ASM_S390_RELAY_H
+
+#include <asm-generic/relay.h>
+#endif
diff --git a/include/asm-sh/relay.h b/include/asm-sh/relay.h
new file mode 100644 (file)
index 0000000..fd8b764
--- /dev/null
@@ -0,0 +1,5 @@
+#ifndef _ASM_SH_RELAY_H
+#define _ASM_SH_RELAY_H
+
+#include <asm-generic/relay.h>
+#endif
diff --git a/include/asm-sparc/relay.h b/include/asm-sparc/relay.h
new file mode 100644 (file)
index 0000000..2141eac
--- /dev/null
@@ -0,0 +1,5 @@
+#ifndef _ASM_SPARC_RELAY_H
+#define _ASM_SPARC_RELAY_H
+
+#include <asm-generic/relay.h>
+#endif
diff --git a/include/asm-sparc64/relay.h b/include/asm-sparc64/relay.h
new file mode 100644 (file)
index 0000000..72ea164
--- /dev/null
@@ -0,0 +1,5 @@
+#ifndef _ASM_SPARC64_RELAY_H
+#define _ASM_SPARC64_RELAY_H
+
+#include <asm-generic/relay.h>
+#endif
diff --git a/include/asm-v850/relay.h b/include/asm-v850/relay.h
new file mode 100644 (file)
index 0000000..869a538
--- /dev/null
@@ -0,0 +1,5 @@
+#ifndef __V850_RELAY_H
+#define __V850_RELAY_H
+
+#include <asm-generic/relay.h>
+#endif
diff --git a/include/asm-x86_64/relay.h b/include/asm-x86_64/relay.h
new file mode 100644 (file)
index 0000000..d8b1b88
--- /dev/null
@@ -0,0 +1,5 @@
+#ifndef _ASM_X86_64_RELAY_H
+#define _ASM_X86_64_RELAY_H
+
+#include <asm-generic/relay.h>
+#endif
diff --git a/include/linux/ckrm.h b/include/linux/ckrm.h
new file mode 100644 (file)
index 0000000..99ab97e
--- /dev/null
@@ -0,0 +1,156 @@
+/* ckrm.h - Class-based Kernel Resource Management (CKRM)
+ *
+ * Copyright (C) Hubertus Franke, IBM Corp. 2003,2004
+ *           (C) Shailabh Nagar,  IBM Corp. 2003
+ *           (C) Chandra Seetharaman, IBM Corp. 2003
+ * 
+ * 
+ * Provides a base header file including macros and basic data structures.
+ *
+ * Latest version, more details at http://ckrm.sf.net
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+/* Changes
+ *
+ * 28 Aug 2003
+ *        Created.
+ * 06 Nov 2003
+ *        Made modifications to suit the new RBCE module.
+ * 10 Nov 2003
+ *        Added callbacks_active and surrounding logic. Added task parameter
+ *        for all CE callbacks.
+ * 19 Nov 2004
+ *        New Event callback structure
+ */
+
+#ifndef _LINUX_CKRM_H
+#define _LINUX_CKRM_H
+
+#ifdef CONFIG_CKRM
+
+// Data structure and function to get the list of registered 
+// resource controllers.
+
+// #include <linux/sched.h>
+
+/* CKRM defines a set of events at particular points in the kernel
+ * at which callbacks registered by various class types are called
+ */
+
+enum ckrm_event {
+       /* we distinguish various event types
+        *
+        * (a) CKRM_LATCHABLE_EVENTS
+        *     events can be latched for event callbacks by classtypes
+        *
+        * (b) CKRM_NONLATCHABLE_EVENTS
+        *     events can not be latched but can be used to call classification
+        *
+        * (c) events that are used for notification purposes
+        *     range: [ CKRM_EVENT_CANNOT_CLASSIFY .. )
+        */
+
+       /* events (a) */
+
+       CKRM_LATCHABLE_EVENTS,
+
+       CKRM_EVENT_NEWTASK = CKRM_LATCHABLE_EVENTS,
+       CKRM_EVENT_FORK,
+       CKRM_EVENT_EXIT,
+       CKRM_EVENT_EXEC,
+       CKRM_EVENT_UID,
+       CKRM_EVENT_GID,
+       CKRM_EVENT_LOGIN,
+       CKRM_EVENT_USERADD,
+       CKRM_EVENT_USERDEL,
+       CKRM_EVENT_LISTEN_START,
+       CKRM_EVENT_LISTEN_STOP,
+       CKRM_EVENT_APPTAG,
+
+       /* events (b) */
+
+       CKRM_NONLATCHABLE_EVENTS,
+
+       CKRM_EVENT_RECLASSIFY = CKRM_NONLATCHABLE_EVENTS,
+
+       /* events (c) */
+       CKRM_NOTCLASSIFY_EVENTS,
+
+       CKRM_EVENT_MANUAL = CKRM_NOTCLASSIFY_EVENTS,
+       
+       CKRM_NUM_EVENTS
+};
+#endif
+
+#ifdef __KERNEL__
+#ifdef CONFIG_CKRM
+
+extern void ckrm_invoke_event_cb_chain(enum ckrm_event ev, void *arg);
+
+typedef void (*ckrm_event_cb)(void *arg);
+
+struct ckrm_hook_cb {
+       ckrm_event_cb fct;
+       struct ckrm_hook_cb *next;
+};
+
+/* Define an event-hook stub ckrm_cb_<fct>() (optionally taking one
+ * argument) that dispatches CKRM_EVENT_<EV> through the registered
+ * event-callback chain. */
+#define CKRM_DEF_CB(EV,fct)                                    \
+static inline void ckrm_cb_##fct(void)                         \
+{                                                              \
+         ckrm_invoke_event_cb_chain(CKRM_EVENT_##EV,NULL);      \
+}
+
+#define CKRM_DEF_CB_ARG(EV,fct,argtp)                                  \
+static inline void ckrm_cb_##fct(argtp arg)                            \
+{                                                                      \
+         ckrm_invoke_event_cb_chain(CKRM_EVENT_##EV,(void*)arg);       \
+}
+
+#else // !CONFIG_CKRM
+
+#define CKRM_DEF_CB(EV,fct)                    \
+static inline void ckrm_cb_##fct(void)  { }
+
+#define CKRM_DEF_CB_ARG(EV,fct,argtp)          \
+static inline void ckrm_cb_##fct(argtp arg) { }
+
+#endif // CONFIG_CKRM
+
+/*-----------------------------------------------------------------
+ *   define the CKRM event functions 
+ *               EVENT          FCT           ARG         
+ *-----------------------------------------------------------------*/
+
+// types we refer at 
+struct task_struct;
+struct sock;
+struct user_struct;
+
+CKRM_DEF_CB_ARG( FORK         , fork,         struct task_struct *);
+CKRM_DEF_CB_ARG( EXEC         , exec,         const char*         );
+CKRM_DEF_CB    ( UID          , uid                               );
+CKRM_DEF_CB    ( GID          , gid                               );
+CKRM_DEF_CB    ( APPTAG       , apptag                            );
+CKRM_DEF_CB    ( LOGIN        , login                             );
+CKRM_DEF_CB_ARG( USERADD      , useradd,      struct user_struct *);
+CKRM_DEF_CB_ARG( USERDEL      , userdel,      struct user_struct *);
+CKRM_DEF_CB_ARG( LISTEN_START , listen_start, struct sock *       );
+CKRM_DEF_CB_ARG( LISTEN_STOP  , listen_stop,  struct sock *       );
+
+// and a few special one's
+void ckrm_cb_newtask(struct task_struct *);
+void ckrm_cb_exit(struct task_struct *);
+
+// some other functions required
+extern void ckrm_init(void);
+extern int get_exe_path_name(struct task_struct *, char *, int);
+
+#endif // __KERNEL__
+
+#endif // _LINUX_CKRM_H
diff --git a/include/linux/ckrm_ce.h b/include/linux/ckrm_ce.h
new file mode 100644 (file)
index 0000000..0bde15d
--- /dev/null
@@ -0,0 +1,91 @@
+/* ckrm_ce.h - Header file to be used by Classification Engine of CKRM
+ *
+ * Copyright (C) Hubertus Franke, IBM Corp. 2003
+ *           (C) Shailabh Nagar,  IBM Corp. 2003
+ *           (C) Chandra Seetharaman, IBM Corp. 2003
+ * 
+ * Provides data structures, macros and kernel API of CKRM for 
+ * classification engine.
+ *
+ * Latest version, more details at http://ckrm.sf.net
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+/* Changes
+ *
+ * 12 Nov 2003
+ *        Created.
+ * 22 Apr 2004
+ *        Adopted to classtypes
+ */
+
+#ifndef _LINUX_CKRM_CE_H
+#define _LINUX_CKRM_CE_H
+
+#ifdef CONFIG_CKRM
+
+#include "ckrm.h"  // getting the event names
+
+/* Action parameters identifying the cause of a task<->class notify callback.
+ * These can percolate up to the user daemon consuming records sent by the
+ * classification engine.
+ */
+
+#ifdef __KERNEL__
+
+typedef void* (*ce_classify_fct_t)(enum ckrm_event event, void *obj, ... );   
+typedef void  (*ce_notify_fct_t)  (enum ckrm_event event, void *classobj, void *obj);
+
+typedef struct ckrm_eng_callback {
+       /* general state information */
+       int  always_callback;  /* set if CE should always be called back regardless of numclasses */
+
+       /* callbacks which are called without holding locks */
+
+       unsigned long c_interest;         /* set of classification events CE is interested in */
+       ce_classify_fct_t   classify;     /* generic classify */
+
+       void   (*class_add)   (const char *name, void *core); /* class added */
+       void   (*class_delete)(const char *name, void *core); /* class deleted */
+
+       /* callback which are called while holding task_lock(tsk) */
+       unsigned long n_interest;         /* set of notification events CE is interested in */
+       ce_notify_fct_t     notify;       /* notify on class switch */
+
+} ckrm_eng_callback_t;
+
+struct inode;
+struct dentry; 
+
+typedef struct rbce_eng_callback {
+       int (*mkdir)(struct inode *, struct dentry *, int); // mkdir
+       int (*rmdir)(struct inode *, struct dentry *); // rmdir
+} rbce_eng_callback_t;
+
+extern int ckrm_register_engine  (const char *name, ckrm_eng_callback_t *);
+extern int ckrm_unregister_engine(const char *name);
+
+extern void *ckrm_classobj(char *, int *classtype);
+extern int get_exe_path_name(struct task_struct *t, char *filename, int max_size);
+
+extern int rcfs_register_engine(rbce_eng_callback_t *);
+extern int rcfs_unregister_engine(rbce_eng_callback_t *);
+
+extern int ckrm_reclassify(int pid);
+
+#ifndef _LINUX_CKRM_RC_H
+// ckrm kernel has inlined functions for this which are exported
+extern void ckrm_core_grab(void *);
+extern void ckrm_core_drop(void *);
+#endif
+
+#endif // __KERNEL__
+
+#endif // CONFIG_CKRM
+
+#endif // _LINUX_CKRM_CE_H
diff --git a/include/linux/ckrm_net.h b/include/linux/ckrm_net.h
new file mode 100644 (file)
index 0000000..0cbf784
--- /dev/null
@@ -0,0 +1,41 @@
+/* ckrm_net.h - Header file to be used by Resource controllers of CKRM
+ *
+ * Copyright (C) Vivek Kashyap , IBM Corp. 2004
+ * 
+ * Provides data structures, macros and kernel API of CKRM for 
+ * resource controllers.
+ *
+ * Latest version, more details at http://ckrm.sf.net
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#ifndef _LINUX_CKRM_NET_H
+#define _LINUX_CKRM_NET_H
+
+struct ckrm_sock_class;
+
+struct ckrm_net_struct {
+       int              ns_type;                    // type of net class
+       struct sock     *ns_sk;         // pointer to socket
+       pid_t            ns_tgid;       // real process id
+       pid_t            ns_pid;        // calling thread's pid
+       int              ns_family;     // IPPROTO_IPV4 || IPPROTO_IPV6
+                                       // Currently only IPV4 is supported
+       union {
+               __u32   ns_dipv4;       // V4 listener's address
+       } ns_daddr;
+       __u16           ns_dport;       // listener's port
+       __u16 ns_sport;                 // sender's port
+       atomic_t ns_refcnt;
+       struct ckrm_sock_class  *core;          
+       struct list_head       ckrm_link;
+};
+
+#define ns_daddrv4     ns_daddr.ns_dipv4
+
+#endif
diff --git a/include/linux/ckrm_rc.h b/include/linux/ckrm_rc.h
new file mode 100644 (file)
index 0000000..e514f1c
--- /dev/null
@@ -0,0 +1,367 @@
+/* ckrm_rc.h - Header file to be used by Resource controllers of CKRM
+ *
+ * Copyright (C) Hubertus Franke, IBM Corp. 2003
+ *           (C) Shailabh Nagar,  IBM Corp. 2003
+ *           (C) Chandra Seetharaman, IBM Corp. 2003
+ *          (C) Vivek Kashyap , IBM Corp. 2004
+ * 
+ * Provides data structures, macros and kernel API of CKRM for 
+ * resource controllers.
+ *
+ * Latest version, more details at http://ckrm.sf.net
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+/* Changes
+ *
+ * 12 Nov 2003
+ *        Created.
+ */
+
+#ifndef _LINUX_CKRM_RC_H
+#define _LINUX_CKRM_RC_H
+
+#ifdef __KERNEL__
+
+#ifdef CONFIG_CKRM
+
+#include <linux/list.h>
+#include <linux/ckrm.h>
+#include <linux/ckrm_ce.h>    
+#include <linux/seq_file.h>
+
+
+/* maximum number of class types */
+#define CKRM_MAX_CLASSTYPES         32       
+/* maximum classtype name length */
+#define CKRM_MAX_CLASSTYPE_NAME     32       
+
+/* maximum resource controllers per classtype */
+#define CKRM_MAX_RES_CTLRS           8     
+/* maximum resource controller name length */
+#define CKRM_MAX_RES_NAME          128       
+
+
+struct ckrm_core_class;
+struct ckrm_classtype;
+
+/********************************************************************************
+ * Share specifications
+ *******************************************************************************/
+
+typedef struct ckrm_shares {
+       int my_guarantee;
+       int my_limit;
+       int total_guarantee;
+       int max_limit;
+       int unused_guarantee;  // not used as parameters
+       int cur_max_limit;     // not used as parameters
+} ckrm_shares_t;
+
+#define CKRM_SHARE_UNCHANGED     (-1)  // value to indicate no change
+#define CKRM_SHARE_DONTCARE      (-2)  // value to indicate don't care.
+#define CKRM_SHARE_DFLT_TOTAL_GUARANTEE (100) // Start off with these values
+#define CKRM_SHARE_DFLT_MAX_LIMIT     (100) // to simplify set_res_shares logic
+
+
+/********************************************************************************
+ * RESOURCE CONTROLLERS
+ *******************************************************************************/
+
+/* resource controller callback structure */
+
+typedef struct ckrm_res_ctlr {
+       char res_name[CKRM_MAX_RES_NAME];
+       int  res_hdepth;                  // maximum hierarchy
+       int  resid;                       // (for now) same as the enum resid
+       struct ckrm_classtype *classtype; // classtype owning this resource controller
+
+       /* allocate/free new resource class object for resource controller */
+       void *(*res_alloc)  (struct ckrm_core_class *this, struct ckrm_core_class *parent);
+       void  (*res_free)   (void *);
+
+       /* set/get limits/guarantees for a resource controller class */
+       int  (*set_share_values) (void* , struct ckrm_shares *shares);
+       int  (*get_share_values) (void* , struct ckrm_shares *shares);
+
+       /* statistics and configuration access */
+       int  (*get_stats)    (void* , struct seq_file *);
+       int  (*reset_stats)  (void *);
+       int  (*show_config)  (void* , struct seq_file *);
+       int  (*set_config)   (void* , const char *cfgstr);
+
+       void (*change_resclass)(void *, void *, void *);
+
+} ckrm_res_ctlr_t;
+
+/***************************************************************************************
+ * CKRM_CLASSTYPE
+ *
+ *   A <struct ckrm_classtype> object describes a dimension for CKRM to classify 
+ *   along. I needs to provide methods to create and manipulate class objects in
+ *   this dimension
+ ***************************************************************************************/
+
+/* list of predefined class types, we always recognize */
+#define CKRM_CLASSTYPE_TASK_CLASS    0
+#define CKRM_CLASSTYPE_SOCKET_CLASS 1
+#define CKRM_RESV_CLASSTYPES         2  /* always +1 of last known type */
+
+#define CKRM_MAX_TYPENAME_LEN       32
+
+
+typedef struct ckrm_classtype {
+       /* Hubertus:   Rearrange slots so that they are more cache friendly during access */
+
+       /* resource controllers */
+       spinlock_t        res_ctlrs_lock;        /* protect data below (other than atomics) */
+       int               max_res_ctlrs;         /* maximum number of resource controller allowed */
+       int               max_resid;             /* maximum resid used                      */
+       int               resid_reserved;        /* maximum number of reserved controllers  */
+       long              bit_res_ctlrs;         /* bitmap of resource ID used              */
+       atomic_t          nr_resusers[CKRM_MAX_RES_CTLRS];
+       ckrm_res_ctlr_t*  res_ctlrs[CKRM_MAX_RES_CTLRS];
+
+       /* state about my classes */
+
+       struct ckrm_core_class   *default_class; // pointer to default class
+       struct list_head          classes;       // listhead to link up all classes of this classtype
+       int                       num_classes;    // how many classes do exist
+
+       /* state about my ce interaction */
+       int                       ce_regd;       // Has a CE been registered for this classtype
+       int                       ce_cb_active;  // are callbacks active
+       atomic_t                  ce_nr_users;   // how many transient calls active
+       struct ckrm_eng_callback  ce_callbacks;  // callback engine
+
+       // Begin classtype-rcfs private data. No rcfs/fs specific types used. 
+       int               mfidx;             // Index into genmfdesc array used to initialize
+                                            // mfdesc and mfcount 
+       void              *mfdesc;           // Array of descriptors of root and magic files
+       int               mfcount;           // length of above array 
+       void              *rootde;           // root dentry created by rcfs
+       // End rcfs private data 
+
+       char name[CKRM_MAX_TYPENAME_LEN];    // currently same as mfdesc[0]->name but could be different
+       int  typeID;                           /* unique TypeID                         */
+       int  maxdepth;                         /* maximum depth supported               */
+
+       /* functions to be called on any class type by external API's */
+       struct ckrm_core_class*  (*alloc)(struct ckrm_core_class *parent, const char *name);   /* alloc class instance */
+       int                      (*free) (struct ckrm_core_class *cls);                        /* free  class instance */
+       
+       int                      (*show_members)(struct ckrm_core_class *, struct seq_file *);
+       int                      (*show_stats)  (struct ckrm_core_class *, struct seq_file *);
+       int                      (*show_config) (struct ckrm_core_class *, struct seq_file *);
+       int                      (*show_shares) (struct ckrm_core_class *, struct seq_file *);
+
+       int                      (*reset_stats) (struct ckrm_core_class *, const char *resname, 
+                                                const char *);
+       int                      (*set_config)  (struct ckrm_core_class *, const char *resname,
+                                                const char *cfgstr);
+       int                      (*set_shares)  (struct ckrm_core_class *, const char *resname,
+                                                struct ckrm_shares *shares);
+       int                      (*forced_reclassify)(struct ckrm_core_class *, const char *);
+
+  
+       /* functions to be called on a class type by ckrm internals */
+       void                     (*add_resctrl)(struct ckrm_core_class *, int resid);     // class initialization for new RC
+} ckrm_classtype_t;
+
+/******************************************************************************************
+ * CKRM CORE CLASS
+ *      common part to any class structure (i.e. instance of a classtype)
+ ******************************************************************************************/
+
+/* basic definition of a hierarchy that is to be used by the CORE classes
+ * and can be used by the resource class objects
+ */
+
+#define CKRM_CORE_MAGIC                0xBADCAFFE
+
+typedef struct ckrm_hnode {
+        struct ckrm_core_class *parent;
+       struct list_head   siblings; /* linked list of siblings */
+       struct list_head   children; /* anchor for children     */
+} ckrm_hnode_t;
+
+typedef struct ckrm_core_class {
+       struct ckrm_classtype *classtype; // what type does this core class belong to
+        void* res_class[CKRM_MAX_RES_CTLRS];                 // pointer to array of resource classes
+       spinlock_t class_lock;             // to protect the list and the array above
+       struct list_head objlist;         // generic list for any object list to be maintained by class
+       struct list_head clslist;         // to link up all classes in a single list type wrt to type
+       struct dentry  *dentry;           // dentry of inode in the RCFS
+       int magic;
+       struct ckrm_hnode  hnode;    // hierarchy
+       rwlock_t hnode_rwlock; // rwlock protecting the hnode above.
+       atomic_t refcnt;
+       const char *name;
+       int delayed;                      // core deletion delayed because of race conditions
+} ckrm_core_class_t;
+
+/* type coerce between derived class types and ckrm core class type */
+#define class_type(type,coreptr)   container_of(coreptr,type,core)
+#define class_core(clsptr)         (&(clsptr)->core)
+/* locking classes */
+#define class_lock(coreptr)        spin_lock(&(coreptr)->class_lock)
+#define class_unlock(coreptr)      spin_unlock(&(coreptr)->class_lock)
+/* what type is a class of ISA */
+#define class_isa(clsptr)          (class_core(clsptr)->classtype)
+
+
+/******************************************************************************************
+ * OTHER
+ ******************************************************************************************/
+
+#define ckrm_get_res_class(rescls,resid,type)   ((type*)((rescls)->res_class[resid]))
+
+extern int ckrm_register_res_ctlr   (struct ckrm_classtype *, ckrm_res_ctlr_t *);
+extern int ckrm_unregister_res_ctlr (ckrm_res_ctlr_t *);
+
+extern int ckrm_validate_and_grab_core(struct ckrm_core_class *core);
+extern int ckrm_init_core_class(struct ckrm_classtype  *clstype,struct ckrm_core_class *dcore,
+                               struct ckrm_core_class *parent, const char *name);
+extern int ckrm_release_core_class(struct ckrm_core_class *);   // Hubertus .. can disappear after cls del debugging
+extern struct ckrm_res_ctlr *ckrm_resctlr_lookup(struct ckrm_classtype *type, const char *resname);
+
+#if 0
+
+// Hubertus ... need to straighten out all these; I don't think we will even call this, or are we?
+
+/* interface to the RCFS filesystem */
+extern struct ckrm_core_class *ckrm_alloc_core_class(struct ckrm_core_class *, const char *, int);
+
+// Reclassify the given pid to the given core class by force
+extern void ckrm_forced_reclassify_pid(int, struct ckrm_core_class *);
+
+// Reclassify the given net_struct  to the given core class by force
+extern void ckrm_forced_reclassify_laq(struct ckrm_net_struct *, 
+               struct ckrm_core_class *);
+
+#endif
+
+extern void ckrm_lock_hier(struct ckrm_core_class *);
+extern void ckrm_unlock_hier(struct ckrm_core_class *);
+extern struct ckrm_core_class * ckrm_get_next_child(struct ckrm_core_class *,
+                           struct ckrm_core_class *);
+
+extern void child_guarantee_changed(struct ckrm_shares *, int, int);
+extern void child_maxlimit_changed(struct ckrm_shares *, int);
+extern int  set_shares(struct ckrm_shares *, struct ckrm_shares *, struct ckrm_shares *);
+
+/* classtype registration and lookup */
+extern int ckrm_register_classtype  (struct ckrm_classtype *clstype);
+extern int ckrm_unregister_classtype(struct ckrm_classtype *clstype);
+extern struct ckrm_classtype* ckrm_find_classtype_by_name(const char *name);
+
+/* default functions that can be used in classtypes's function table */
+extern int ckrm_class_show_shares(struct ckrm_core_class *core, struct seq_file *seq);
+extern int ckrm_class_show_stats(struct ckrm_core_class *core, struct seq_file *seq);
+extern int ckrm_class_show_config(struct ckrm_core_class *core, struct seq_file *seq);
+extern int ckrm_class_set_config(struct ckrm_core_class *core, const char *resname, const char *cfgstr);
+extern int ckrm_class_set_shares(struct ckrm_core_class *core, const char *resname, struct ckrm_shares *shares);
+extern int ckrm_class_reset_stats(struct ckrm_core_class *core, const char *resname, const char *unused);
+
+#if 0
+extern void ckrm_ns_hold(struct ckrm_net_struct *);
+extern void ckrm_ns_put(struct ckrm_net_struct *);
+extern void *ckrm_set_rootcore_byname(char *, void *);
+#endif
+
+static inline void ckrm_core_grab(struct ckrm_core_class *core)  
+{ 
+       if (core) atomic_inc(&core->refcnt);
+}
+
+static inline void ckrm_core_drop(struct ckrm_core_class *core) 
+{ 
+       // only make definition available in this context
+       extern void ckrm_free_core_class(struct ckrm_core_class *core);   
+       if (core && (atomic_dec_and_test(&core->refcnt)))
+           ckrm_free_core_class(core);
+}
+
+static inline unsigned int
+ckrm_is_core_valid(ckrm_core_class_t *core)
+{
+       return (core && (core->magic == CKRM_CORE_MAGIC));
+}
+
+// iterate through all associated resource controllers:
+// requires following arguments (ckrm_core_class *cls, 
+//                               ckrm_res_ctrl   *ctlr,
+//                               void            *robj,
+//                               int              bmap)
+#define forall_class_resobjs(cls,rcbs,robj,bmap)                                                                       \
+       for ( bmap=((cls->classtype)->bit_res_ctlrs) ;                                                                  \
+            ({ int rid; ((rid=ffs(bmap)-1) >= 0) &&                                                                    \
+                        (bmap&=~(1<<rid),((rcbs=cls->classtype->res_ctlrs[rid]) && (robj=cls->res_class[rid]))); }) ;  \
+           )
+
+extern struct ckrm_classtype* ckrm_classtypes[]; /* should provide a different interface */
+
+
+/*-----------------------------------------------------------------------------
+ * CKRM event callback specification for the classtypes or resource controllers 
+ *   typically an array is specified using CKRM_EVENT_SPEC terminated with 
+ *   CKRM_EVENT_SPEC_LAST and then that array is registered using
+ *   ckrm_register_event_set.
+ *   Individual registration of event_cb is also possible
+ *-----------------------------------------------------------------------------*/
+
+struct ckrm_event_spec {
+       enum ckrm_event     ev;
+       struct ckrm_hook_cb cb;
+};
+#define CKRM_EVENT_SPEC(EV,FCT) { CKRM_EVENT_##EV, { (ckrm_event_cb)FCT, NULL } }
+
+int ckrm_register_event_set(struct ckrm_event_spec especs[]);
+int ckrm_unregister_event_set(struct ckrm_event_spec especs[]);
+int ckrm_register_event_cb(enum ckrm_event ev, struct ckrm_hook_cb *cb);
+int ckrm_unregister_event_cb(enum ckrm_event ev, struct ckrm_hook_cb *cb);
+
+/******************************************************************************************
+ * CE Invocation interface
+ ******************************************************************************************/
+
+#define ce_protect(ctype)      (atomic_inc(&((ctype)->ce_nr_users)))
+#define ce_release(ctype)      (atomic_dec(&((ctype)->ce_nr_users)))
+
+// CE Classification callbacks with 
+
+#define CE_CLASSIFY_NORET(ctype, event, objs_to_classify...)                                   \
+do {                                                                                           \
+       if ((ctype)->ce_cb_active && (test_bit(event,&(ctype)->ce_callbacks.c_interest)))       \
+               (*(ctype)->ce_callbacks.classify)(event, objs_to_classify);                     \
+} while (0)
+
+#define CE_CLASSIFY_RET(ret, ctype, event, objs_to_classify...)                                        \
+do {                                                                                           \
+       if ((ctype)->ce_cb_active && (test_bit(event,&(ctype)->ce_callbacks.c_interest)))       \
+               ret = (*(ctype)->ce_callbacks.classify)(event, objs_to_classify);               \
+} while (0)
+
+#define CE_NOTIFY(ctype, event, cls, objs_to_classify)                                         \
+do {                                                                                           \
+       if ((ctype)->ce_cb_active && (test_bit(event,&(ctype)->ce_callbacks.n_interest)))       \
+               (*(ctype)->ce_callbacks.notify)(event,cls,objs_to_classify);                    \
+} while (0)
+
+
+#endif // CONFIG_CKRM
+
+#endif // __KERNEL__
+
+#endif // _LINUX_CKRM_RC_H
+
+
+
+
+
diff --git a/include/linux/ckrm_tc.h b/include/linux/ckrm_tc.h
new file mode 100644 (file)
index 0000000..6a57025
--- /dev/null
@@ -0,0 +1,18 @@
+#include <linux/ckrm_rc.h>
+
+
+
+#define TASK_CLASS_TYPE_NAME "taskclass"
+
+typedef struct ckrm_task_class {
+       struct ckrm_core_class core;   
+} ckrm_task_class_t;
+
+
+// Index into genmfdesc array, defined in rcfs/dir_modules.c,
+// which has the mfdesc entry that taskclass wants to use
+#define TC_MF_IDX  0
+
+
+extern int ckrm_forced_reclassify_pid(int pid, struct ckrm_task_class *cls);
+
diff --git a/include/linux/ckrm_tsk.h b/include/linux/ckrm_tsk.h
new file mode 100644 (file)
index 0000000..64d20dd
--- /dev/null
@@ -0,0 +1,41 @@
+/* ckrm_tsk.h - No. of tasks resource controller for CKRM
+ *
+ * Copyright (C) Chandra Seetharaman, IBM Corp. 2003
+ * 
+ * Provides No. of tasks resource controller for CKRM
+ *
+ * Latest version, more details at http://ckrm.sf.net
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+/* Changes
+ *
+ * 31 Mar 2004
+ *    Created.
+ */
+
+#ifndef _LINUX_CKRM_TSK_H
+#define _LINUX_CKRM_TSK_H
+
+#include <linux/ckrm_rc.h>
+
+#ifdef CONFIG_CKRM_RES_NUMTASKS
+
+extern int numtasks_get_ref(void *, int);
+extern int numtasks_get_ref_resid(void *, int, int);
+extern void numtasks_put_ref(void *);
+
+#else
+
+#define numtasks_get_ref(a, b)         1
+#define numtasks_get_ref_resid(a, b, c)                1
+#define numtasks_put_ref(a)
+
+#endif
+
+#endif // _LINUX_CKRM_TSK_H
diff --git a/include/linux/klog.h b/include/linux/klog.h
new file mode 100644 (file)
index 0000000..cb79bea
--- /dev/null
@@ -0,0 +1,24 @@
+/*
+ * KLOG                Generic Logging facility built upon the relayfs infrastructure
+ *
+ * Authors:    Hubertus Franke  (frankeh@us.ibm.com)
+ *             Tom Zanussi  (zanussi@us.ibm.com)
+ *
+ *             Please direct all questions/comments to zanussi@us.ibm.com
+ *
+ *             Copyright (C) 2003, IBM Corp
+ *
+ *
+ *             This program is free software; you can redistribute it and/or
+ *             modify it under the terms of the GNU General Public License
+ *             as published by the Free Software Foundation; either version
+ *             2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _LINUX_KLOG_H
+#define _LINUX_KLOG_H
+
+extern int klog(const char *fmt, ...);
+extern int klog_raw(const char *buf,int len); 
+
+#endif /* _LINUX_KLOG_H */
diff --git a/include/linux/rcfs.h b/include/linux/rcfs.h
new file mode 100644 (file)
index 0000000..a2a65e8
--- /dev/null
@@ -0,0 +1,98 @@
+#ifndef _LINUX_RCFS_H
+#define _LINUX_RCFS_H
+
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/ckrm.h>
+#include <linux/ckrm_rc.h>
+#include <linux/ckrm_ce.h>
+
+
+
+/* The following declarations cannot be included in any of ckrm*.h files without 
+   jumping hoops. Remove later when rearrangements done */
+
+// Hubertus .. taken out 
+//extern ckrm_res_callback_t ckrm_res_ctlrs[CKRM_MAX_RES_CTLRS];
+
+#define RCFS_MAGIC     0x4feedbac
+#define RCFS_MAGF_NAMELEN 20
+extern int RCFS_IS_MAGIC;
+
+#define rcfs_is_magic(dentry)  ((dentry)->d_fsdata == &RCFS_IS_MAGIC)
+
+typedef struct rcfs_inode_info {
+       ckrm_core_class_t *core;
+       char *name;
+       struct inode vfs_inode;
+} rcfs_inode_info_t;
+
+#define RCFS_DEFAULT_DIR_MODE  (S_IFDIR | S_IRUGO | S_IXUGO)
+#define RCFS_DEFAULT_FILE_MODE (S_IFREG | S_IRUSR | S_IWUSR | S_IRGRP |S_IROTH)
+
+
+struct rcfs_magf {
+       char name[RCFS_MAGF_NAMELEN];
+       int mode;
+       struct inode_operations *i_op;
+       struct file_operations *i_fop;
+};
+
+struct rcfs_mfdesc {
+       struct rcfs_magf *rootmf;     // Root directory and its magic files
+       int              rootmflen;   // length of above array
+       // Can have a different magf describing magic files for non-root entries too
+};
+
+extern struct rcfs_mfdesc *genmfdesc[];
+
+inline struct rcfs_inode_info *RCFS_I(struct inode *inode);
+
+int rcfs_empty(struct dentry *);
+struct inode *rcfs_get_inode(struct super_block *, int, dev_t);
+int rcfs_mknod(struct inode *, struct dentry *, int, dev_t);
+int _rcfs_mknod(struct inode *, struct dentry *, int , dev_t);
+int rcfs_mkdir(struct inode *, struct dentry *, int);
+ckrm_core_class_t *rcfs_make_core(struct dentry *, struct ckrm_core_class *);
+struct dentry *rcfs_set_magf_byname(char *, void *);
+
+struct dentry * rcfs_create_internal(struct dentry *, struct rcfs_magf *, int);
+int rcfs_delete_internal(struct dentry *);
+int rcfs_create_magic(struct dentry *, struct rcfs_magf *, int);
+int rcfs_clear_magic(struct dentry *);
+
+
+extern struct super_operations rcfs_super_ops;
+extern struct address_space_operations rcfs_aops;
+
+extern struct inode_operations rcfs_dir_inode_operations;
+extern struct inode_operations rcfs_rootdir_inode_operations;
+extern struct inode_operations rcfs_file_inode_operations;
+
+
+extern struct file_operations target_fileops;
+extern struct file_operations shares_fileops;
+extern struct file_operations stats_fileops;
+extern struct file_operations config_fileops;
+extern struct file_operations members_fileops;
+extern struct file_operations rcfs_file_operations;
+
+// Callbacks into rcfs from ckrm 
+
+typedef struct rcfs_functions {
+       int  (* mkroot)(struct rcfs_magf *,int, struct dentry **);
+       int  (* rmroot)(struct dentry *);
+       int  (* register_classtype)(ckrm_classtype_t *);
+       int  (* deregister_classtype)(ckrm_classtype_t *);
+} rcfs_fn_t;
+
+int rcfs_register_classtype(ckrm_classtype_t *);
+int rcfs_deregister_classtype(ckrm_classtype_t *);
+int rcfs_mkroot(struct rcfs_magf *, int , struct dentry **);
+int rcfs_rmroot(struct dentry *);
+
+#define RCFS_ROOT "/rcfs"         // Hubertus .. we should use the mount point instead of hardcoded
+extern struct dentry *rcfs_rootde;
+
+
+#endif /* _LINUX_RCFS_H */ 
diff --git a/include/linux/relayfs_fs.h b/include/linux/relayfs_fs.h
new file mode 100644 (file)
index 0000000..2c52874
--- /dev/null
@@ -0,0 +1,686 @@
+/*
+ * linux/include/linux/relayfs_fs.h
+ *
+ * Copyright (C) 2002, 2003 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
+ * Copyright (C) 1999, 2000, 2001, 2002 - Karim Yaghmour (karim@opersys.com)
+ *
+ * RelayFS definitions and declarations
+ *
+ * Please see Documentation/filesystems/relayfs.txt for more info.
+ */
+
+#ifndef _LINUX_RELAYFS_FS_H
+#define _LINUX_RELAYFS_FS_H
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/list.h>
+#include <linux/fs.h>
+
+/*
+ * Tracks changes to rchan struct
+ */
+#define RELAYFS_CHANNEL_VERSION                1
+
+/*
+ * Maximum number of simultaneously open channels
+ */
+#define RELAY_MAX_CHANNELS             256
+
+/*
+ * Relay properties
+ */
+#define RELAY_MIN_BUFS                 2
+#define RELAY_MIN_BUFSIZE              4096
+#define RELAY_MAX_BUFS                 256
+#define RELAY_MAX_BUF_SIZE             0x1000000
+#define RELAY_MAX_TOTAL_BUF_SIZE       0x8000000
+
+/*
+ * Lockless scheme utility macros
+ */
+#define RELAY_MAX_BUFNO(bufno_bits) (1UL << (bufno_bits))
+#define RELAY_BUF_SIZE(offset_bits) (1UL << (offset_bits))
+#define RELAY_BUF_OFFSET_MASK(offset_bits) (RELAY_BUF_SIZE(offset_bits) - 1)
+#define RELAY_BUFNO_GET(index, offset_bits) ((index) >> (offset_bits))
+#define RELAY_BUF_OFFSET_GET(index, mask) ((index) & (mask))
+#define RELAY_BUF_OFFSET_CLEAR(index, mask) ((index) & ~(mask))
+
+/*
+ * Flags returned by relay_reserve()
+ */
+#define RELAY_BUFFER_SWITCH_NONE       0x0
+#define RELAY_WRITE_DISCARD_NONE       0x0
+#define RELAY_BUFFER_SWITCH            0x1
+#define RELAY_WRITE_DISCARD            0x2
+#define RELAY_WRITE_TOO_LONG           0x4
+
+/*
+ * Relay attribute flags
+ */
+#define RELAY_DELIVERY_BULK            0x1
+#define RELAY_DELIVERY_PACKET          0x2
+#define RELAY_SCHEME_LOCKLESS          0x4
+#define RELAY_SCHEME_LOCKING           0x8
+#define RELAY_SCHEME_ANY               0xC
+#define RELAY_TIMESTAMP_TSC            0x10
+#define RELAY_TIMESTAMP_GETTIMEOFDAY   0x20
+#define RELAY_TIMESTAMP_ANY            0x30
+#define RELAY_USAGE_SMP                        0x40
+#define RELAY_USAGE_GLOBAL             0x80
+#define RELAY_MODE_CONTINUOUS          0x100
+#define RELAY_MODE_NO_OVERWRITE                0x200
+
+/*
+ * Flags for needs_resize() callback
+ */
+#define RELAY_RESIZE_NONE      0x0
+#define RELAY_RESIZE_EXPAND    0x1
+#define RELAY_RESIZE_SHRINK    0x2
+#define RELAY_RESIZE_REPLACE   0x4
+#define RELAY_RESIZE_REPLACED  0x8
+
+/*
+ * Values for fileop_notify() callback
+ */
+enum relay_fileop
+{
+       RELAY_FILE_OPEN,
+       RELAY_FILE_CLOSE,
+       RELAY_FILE_MAP,
+       RELAY_FILE_UNMAP
+};
+
+/*
+ * Data structure returned by relay_info()
+ */
+struct rchan_info
+{
+       u32 flags;              /* relay attribute flags for channel */
+       u32 buf_size;           /* channel's sub-buffer size */
+       char *buf_addr;         /* address of channel start */
+       u32 alloc_size;         /* total buffer size actually allocated */
+       u32 n_bufs;             /* number of sub-buffers in channel */
+       u32 cur_idx;            /* current write index into channel */
+       u32 bufs_produced;      /* current count of sub-buffers produced */
+       u32 bufs_consumed;      /* current count of sub-buffers consumed */
+       u32 buf_id;             /* buf_id of current sub-buffer */
+       int buffer_complete[RELAY_MAX_BUFS];    /* boolean per sub-buffer */
+       int unused_bytes[RELAY_MAX_BUFS];       /* count per sub-buffer */
+};
+
+/*
+ * Relay channel client callbacks
+ */
+struct rchan_callbacks
+{
+       /*
+        * buffer_start - called at the beginning of a new sub-buffer
+        * @rchan_id: the channel id
+        * @current_write_pos: position in sub-buffer client should write to
+        * @buffer_id: the id of the new sub-buffer
+        * @start_time: the timestamp associated with the start of sub-buffer
+        * @start_tsc: the TSC associated with the timestamp, if using_tsc
+        * @using_tsc: boolean, indicates whether start_tsc is valid
+        *
+        * Return value should be the number of bytes written by the client.
+        *
+        * See Documentation/filesystems/relayfs.txt for details.
+        */
+       int (*buffer_start) (int rchan_id,
+                            char *current_write_pos,
+                            u32 buffer_id,
+                            struct timeval start_time,
+                            u32 start_tsc,
+                            int using_tsc);
+
+       /*
+        * buffer_end - called at the end of a sub-buffer
+        * @rchan_id: the channel id
+        * @current_write_pos: position in sub-buffer of end of data
+        * @end_of_buffer: the position of the end of the sub-buffer
+        * @end_time: the timestamp associated with the end of the sub-buffer
+        * @end_tsc: the TSC associated with the end_time, if using_tsc
+        * @using_tsc: boolean, indicates whether end_tsc is valid
+        *
+        * Return value should be the number of bytes written by the client.
+        *
+        * See Documentation/filesystems/relayfs.txt for details.
+        */
+       int (*buffer_end) (int rchan_id,
+                          char *current_write_pos,
+                          char *end_of_buffer,
+                          struct timeval end_time,
+                          u32 end_tsc,
+                          int using_tsc);
+
+       /*
+        * deliver - called when data is ready for the client
+        * @rchan_id: the channel id
+        * @from: the start of the delivered data
+        * @len: the length of the delivered data
+        *
+        * See Documentation/filesystems/relayfs.txt for details.
+        */
+       void (*deliver) (int rchan_id, char *from, u32 len);
+
+       /*
+        * user_deliver - called when data has been written from userspace
+        * @rchan_id: the channel id
+        * @from: the start of the delivered data
+        * @len: the length of the delivered data
+        *
+        * See Documentation/filesystems/relayfs.txt for details.
+        */
+       void (*user_deliver) (int rchan_id, char *from, u32 len);
+
+       /*
+        * needs_resize - called when a resizing event occurs
+        * @rchan_id: the channel id
+        * @resize_type: the type of resizing event
+        * @suggested_buf_size: the suggested new sub-buffer size
+        * @suggested_n_bufs: the suggested new number of sub-buffers
+        *
+        * See Documentation/filesystems/relayfs.txt for details.
+        */
+       void (*needs_resize)(int rchan_id,
+                            int resize_type,
+                            u32 suggested_buf_size,
+                            u32 suggested_n_bufs);
+
+       /*
+        * fileop_notify - called on open/close/mmap/munmap of a relayfs file
+        * @rchan_id: the channel id
+        * @filp: relayfs file pointer
+        * @fileop: which file operation is in progress
+        *
+        * The return value can direct the outcome of the operation.
+        *
+        * See Documentation/filesystems/relayfs.txt for details.
+        */
+        int (*fileop_notify)(int rchan_id,
+                            struct file *filp,
+                            enum relay_fileop fileop);
+
+       /*
+        * ioctl - called in ioctl context from userspace
+        * @rchan_id: the channel id
+        * @cmd: ioctl cmd
+        * @arg: ioctl cmd arg
+        *
+        * The return value is returned as the value from the ioctl call.
+        *
+        * See Documentation/filesystems/relayfs.txt for details.
+        */
+       int (*ioctl) (int rchan_id, unsigned int cmd, unsigned long arg);
+};
+
+/*
+ * Lockless scheme-specific data
+ */
+struct lockless_rchan
+{
+       u8 bufno_bits;          /* # bits used for sub-buffer id */
+       u8 offset_bits;         /* # bits used for offset within sub-buffer */
+       u32 index;              /* current index = sub-buffer id and offset */
+       u32 offset_mask;        /* used to obtain offset portion of index */
+       u32 index_mask;         /* used to mask off unused bits index */
+       atomic_t fill_count[RELAY_MAX_BUFS];    /* fill count per sub-buffer */
+};
+
+/*
+ * Locking scheme-specific data
+ */
+struct locking_rchan
+{
+       char *write_buf;                /* start of write sub-buffer */
+       char *write_buf_end;            /* end of write sub-buffer */
+       char *current_write_pos;        /* current write pointer */
+       char *write_limit;              /* takes reserves into account */
+       char *in_progress_event_pos;    /* used for interrupted writes */
+       u16 in_progress_event_size;     /* used for interrupted writes */
+       char *interrupted_pos;          /* used for interrupted writes */
+       u16 interrupting_size;          /* used for interrupted writes */
+       spinlock_t lock;                /* channel lock for locking scheme */
+};
+
+struct relay_ops;
+
+/*
+ * Offset resizing data structure
+ */
+struct resize_offset
+{
+       u32 ge;
+       u32 le;
+       int delta;
+};
+
+/*
+ * Relay channel data structure
+ */
+struct rchan
+{
+       u32 version;                    /* the version of this struct */
+       char *buf;                      /* the channel buffer */
+       union
+       {
+               struct lockless_rchan lockless;
+               struct locking_rchan locking;
+       } scheme;                       /* scheme-specific channel data */
+
+       int id;                         /* the channel id */
+       struct rchan_callbacks *callbacks;      /* client callbacks */
+       u32 flags;                      /* relay channel attributes */
+       u32 buf_id;                     /* current sub-buffer id */
+       u32 buf_idx;                    /* current sub-buffer index */
+
+       atomic_t mapped;                /* map count */
+
+       atomic_t suspended;             /* channel suspended i.e full? */
+       int half_switch;                /* used internally for suspend */
+
+       struct timeval  buf_start_time; /* current sub-buffer start time */
+       u32 buf_start_tsc;              /* current sub-buffer start TSC */
+       
+       u32 buf_size;                   /* sub-buffer size */
+       u32 alloc_size;                 /* total buffer size allocated */
+       u32 n_bufs;                     /* number of sub-buffers */
+
+       u32 bufs_produced;              /* count of sub-buffers produced */
+       u32 bufs_consumed;              /* count of sub-buffers consumed */
+       u32 bytes_consumed;             /* bytes consumed in cur sub-buffer */
+
+       int initialized;                /* first buffer initialized? */
+       int finalized;                  /* channel finalized? */
+
+       u32 start_reserve;              /* reserve at start of sub-buffers */
+       u32 end_reserve;                /* reserve at end of sub-buffers */
+       u32 rchan_start_reserve;        /* additional reserve sub-buffer 0 */
+       
+       struct dentry *dentry;          /* channel file dentry */
+
+       wait_queue_head_t read_wait;    /* VFS read wait queue */
+       wait_queue_head_t write_wait;   /* VFS write wait queue */
+       struct work_struct wake_readers; /* reader wake-up work struct */
+       struct work_struct wake_writers; /* writer wake-up work struct */
+       atomic_t refcount;              /* channel refcount */
+
+       struct relay_ops *relay_ops;    /* scheme-specific channel ops */
+
+       int unused_bytes[RELAY_MAX_BUFS]; /* unused count per sub-buffer */
+
+       struct semaphore resize_sem;    /* serializes alloc/replace */
+       struct work_struct work;        /* resize allocation work struct */
+
+       struct list_head open_readers;  /* open readers for this channel */
+       rwlock_t open_readers_lock;     /* protection for open_readers list */
+
+       char *init_buf;                 /* init channel buffer, if non-NULL */
+       
+       u32 resize_min;                 /* minimum resized total buffer size */
+       u32 resize_max;                 /* maximum resized total buffer size */
+       char *resize_buf;               /* for autosize alloc/free */
+       u32 resize_buf_size;            /* resized sub-buffer size */
+       u32 resize_n_bufs;              /* resized number of sub-buffers */
+       u32 resize_alloc_size;          /* resized actual total size */
+       int resizing;                   /* is resizing in progress? */
+       int resize_err;                 /* resizing err code */
+       int resize_failures;            /* number of resize failures */
+       int replace_buffer;             /* is the alloced buffer ready?  */
+       struct resize_offset resize_offset; /* offset change */
+       struct timer_list shrink_timer; /* timer used for shrinking */
+       int resize_order;               /* size of last resize */
+       u32 expand_buf_id;              /* subbuf id expand will occur at */
+
+       struct page **buf_page_array;   /* array of current buffer pages */
+       int buf_page_count;             /* number of current buffer pages */
+       struct page **expand_page_array;/* new pages to be inserted */
+       int expand_page_count;          /* number of new pages */
+       struct page **shrink_page_array;/* old pages to be freed */
+       int shrink_page_count;          /* number of old pages */
+       struct page **resize_page_array;/* will become current pages */
+       int resize_page_count;          /* number of resize pages */
+       struct page **old_buf_page_array; /* hold for freeing */
+} ____cacheline_aligned;
+
+/*
+ * Relay channel reader struct
+ */
+struct rchan_reader
+{
+       struct list_head list;          /* for list inclusion */
+       struct rchan *rchan;            /* the channel we're reading from */
+       int auto_consume;               /* does this reader auto-consume? */
+       u32 bufs_consumed;              /* buffers this reader has consumed */
+       u32 bytes_consumed;             /* bytes consumed in cur sub-buffer */
+       int offset_changed;             /* have channel offsets changed? */
+       int vfs_reader;                 /* are we a VFS reader? */
+       int map_reader;                 /* are we an mmap reader? */
+
+       union
+       {
+               struct file *file;
+               u32 f_pos;
+       } pos;                          /* current read offset */
+};
+
+/*
+ * These help make union member access less tedious
+ */
+#define channel_buffer(rchan) ((rchan)->buf)
+#define idx(rchan) ((rchan)->scheme.lockless.index)
+#define bufno_bits(rchan) ((rchan)->scheme.lockless.bufno_bits)
+#define offset_bits(rchan) ((rchan)->scheme.lockless.offset_bits)
+#define offset_mask(rchan) ((rchan)->scheme.lockless.offset_mask)
+#define idx_mask(rchan) ((rchan)->scheme.lockless.index_mask)
+#define bulk_delivery(rchan) (((rchan)->flags & RELAY_DELIVERY_BULK) ? 1 : 0)
+#define packet_delivery(rchan) (((rchan)->flags & RELAY_DELIVERY_PACKET) ? 1 : 0)
+#define using_lockless(rchan) (((rchan)->flags & RELAY_SCHEME_LOCKLESS) ? 1 : 0)
+#define using_locking(rchan) (((rchan)->flags & RELAY_SCHEME_LOCKING) ? 1 : 0)
+#define using_tsc(rchan) (((rchan)->flags & RELAY_TIMESTAMP_TSC) ? 1 : 0)
+#define using_gettimeofday(rchan) (((rchan)->flags & RELAY_TIMESTAMP_GETTIMEOFDAY) ? 1 : 0)
+#define usage_smp(rchan) (((rchan)->flags & RELAY_USAGE_SMP) ? 1 : 0)
+#define usage_global(rchan) (((rchan)->flags & RELAY_USAGE_GLOBAL) ? 1 : 0)
+#define mode_continuous(rchan) (((rchan)->flags & RELAY_MODE_CONTINUOUS) ? 1 : 0)
+#define fill_count(rchan, i) ((rchan)->scheme.lockless.fill_count[(i)])
+#define write_buf(rchan) ((rchan)->scheme.locking.write_buf)
+#define read_buf(rchan) ((rchan)->scheme.locking.read_buf)
+#define write_buf_end(rchan) ((rchan)->scheme.locking.write_buf_end)
+#define read_buf_end(rchan) ((rchan)->scheme.locking.read_buf_end)
+#define cur_write_pos(rchan) ((rchan)->scheme.locking.current_write_pos)
+#define read_limit(rchan) ((rchan)->scheme.locking.read_limit)
+#define write_limit(rchan) ((rchan)->scheme.locking.write_limit)
+#define in_progress_event_pos(rchan) ((rchan)->scheme.locking.in_progress_event_pos)
+#define in_progress_event_size(rchan) ((rchan)->scheme.locking.in_progress_event_size)
+#define interrupted_pos(rchan) ((rchan)->scheme.locking.interrupted_pos)
+#define interrupting_size(rchan) ((rchan)->scheme.locking.interrupting_size)
+#define channel_lock(rchan) ((rchan)->scheme.locking.lock)
+
+
+/**
+ *     calc_time_delta - utility function for time delta calculation
+ *     @now: current time
+ *     @start: start time
+ *
+ *     Returns the time delta produced by subtracting start time from now.
+ */
+static inline u32
+calc_time_delta(struct timeval *now, 
+               struct timeval *start)
+{
+       return (now->tv_sec - start->tv_sec) * 1000000
+               + (now->tv_usec - start->tv_usec);
+}
+
+/**
+ *     recalc_time_delta - utility function for time delta recalculation
+ *     @now: current time
+ *     @new_delta: the new time delta calculated
+ *     @rchan: the associated channel
+ */
+static inline void 
+recalc_time_delta(struct timeval *now,
+                 u32 *new_delta,
+                 struct rchan *rchan)
+{
+       if (using_tsc(rchan) == 0)
+               *new_delta = calc_time_delta(now, &rchan->buf_start_time);
+}
+
+/**
+ *     have_cmpxchg - does this architecture have a cmpxchg?
+ *
+ *     Returns 1 if this architecture has a cmpxchg useable by 
+ *     the lockless scheme, 0 otherwise.
+ */
+static inline int 
+have_cmpxchg(void)
+{
+#if defined(__HAVE_ARCH_CMPXCHG)
+       return 1;
+#else
+       return 0;
+#endif
+}
+
+/**
+ *     relay_write_direct - write data directly into destination buffer
+ */
+#define relay_write_direct(DEST, SRC, SIZE) \
+do\
+{\
+   memcpy(DEST, SRC, SIZE);\
+   DEST += SIZE;\
+} while (0);
+
+/**
+ *     relay_lock_channel - lock the relay channel if applicable
+ *
+ *     This macro only affects the locking scheme.  If the locking scheme
+ *     is in use and the channel usage is SMP, does a local_irq_save.  If the 
+ *     locking scheme is in use and the channel usage is GLOBAL, uses 
+ *     spin_lock_irqsave.  FLAGS is initialized to 0 since we know that
+ *     it is being initialized prior to use and we avoid the compiler warning.
+ */
+#define relay_lock_channel(RCHAN, FLAGS) \
+do\
+{\
+   FLAGS = 0;\
+   if (using_locking(RCHAN)) {\
+      if (usage_smp(RCHAN)) {\
+         local_irq_save(FLAGS); \
+      } else {\
+         spin_lock_irqsave(&(RCHAN)->scheme.locking.lock, FLAGS); \
+      }\
+   }\
+} while (0);
+
+/**
+ *     relay_unlock_channel - unlock the relay channel if applicable
+ *
+ *     This macro only affects the locking scheme.  See relay_lock_channel.
+ */
+#define relay_unlock_channel(RCHAN, FLAGS) \
+do\
+{\
+   if (using_locking(RCHAN)) {\
+      if (usage_smp(RCHAN)) {\
+         local_irq_restore(FLAGS); \
+      } else {\
+         spin_unlock_irqrestore(&(RCHAN)->scheme.locking.lock, FLAGS); \
+      }\
+   }\
+} while (0);
+
+/*
+ * Define cmpxchg if we don't have it
+ */
+#ifndef __HAVE_ARCH_CMPXCHG
+#define cmpxchg(p,o,n) 0
+#endif
+
+/*
+ * High-level relayfs kernel API, fs/relayfs/relay.c
+ */
+extern int
+relay_open(const char *chanpath,
+          int bufsize,
+          int nbufs,
+          u32 flags,
+          struct rchan_callbacks *channel_callbacks,
+          u32 start_reserve,
+          u32 end_reserve,
+          u32 rchan_start_reserve,
+          u32 resize_min,
+          u32 resize_max,
+          int mode,
+          char *init_buf,
+          u32 init_buf_size);
+
+extern int
+relay_close(int rchan_id);
+
+extern int
+relay_write(int rchan_id,
+           const void *data_ptr, 
+           size_t count,
+           int td_offset,
+           void **wrote_pos);
+
+extern ssize_t
+relay_read(struct rchan_reader *reader,
+          char *buf,
+          size_t count,
+          int wait,
+          u32 *actual_read_offset);
+
+extern int
+relay_discard_init_buf(int rchan_id);
+
+extern struct rchan_reader *
+add_rchan_reader(int rchan_id, int autoconsume);
+
+extern int
+remove_rchan_reader(struct rchan_reader *reader);
+
+extern struct rchan_reader *
+add_map_reader(int rchan_id);
+
+extern int
+remove_map_reader(struct rchan_reader *reader);
+
+extern int 
+relay_info(int rchan_id, struct rchan_info *rchan_info);
+
+extern void 
+relay_buffers_consumed(struct rchan_reader *reader, u32 buffers_consumed);
+
+extern void
+relay_bytes_consumed(struct rchan_reader *reader, u32 bytes_consumed, u32 read_offset);
+
+extern ssize_t
+relay_bytes_avail(struct rchan_reader *reader);
+
+extern int
+relay_realloc_buffer(int rchan_id, u32 new_nbufs, int in_background);
+
+extern int
+relay_replace_buffer(int rchan_id);
+
+extern int
+rchan_empty(struct rchan_reader *reader);
+
+extern int
+rchan_full(struct rchan_reader *reader);
+
+extern void
+update_readers_consumed(struct rchan *rchan, u32 bufs_consumed, u32 bytes_consumed);
+
+extern int 
+__relay_mmap_buffer(struct rchan *rchan, struct vm_area_struct *vma);
+
+extern struct rchan_reader *
+__add_rchan_reader(struct rchan *rchan, struct file *filp, int auto_consume, int map_reader);
+
+extern void
+__remove_rchan_reader(struct rchan_reader *reader);
+
+/*
+ * Low-level relayfs kernel API, fs/relayfs/relay.c
+ */
+extern struct rchan *
+rchan_get(int rchan_id);
+
+extern void
+rchan_put(struct rchan *rchan);
+
+extern char *
+relay_reserve(struct rchan *rchan,
+             u32 data_len,
+             struct timeval *time_stamp,
+             u32 *time_delta,
+             int *errcode,
+             int *interrupting);
+
+extern void 
+relay_commit(struct rchan *rchan,
+            char *from, 
+            u32 len, 
+            int reserve_code,
+            int interrupting);
+
+extern u32 
+relay_get_offset(struct rchan *rchan, u32 *max_offset);
+
+extern int
+relay_reset(int rchan_id);
+
+/*
+ * VFS functions, fs/relayfs/inode.c
+ */
+extern int 
+relayfs_create_dir(const char *name, 
+                  struct dentry *parent, 
+                  struct dentry **dentry);
+
+extern int
+relayfs_create_file(const char * name,
+                   struct dentry *parent, 
+                   struct dentry **dentry,
+                   void * data,
+                   int mode);
+
+extern int 
+relayfs_remove_file(struct dentry *dentry);
+
+extern int
+reset_index(struct rchan *rchan, u32 old_index);
+
+
+/*
+ * klog functions, fs/relayfs/klog.c
+ */
+extern int
+create_klog_channel(void);
+
+extern int
+remove_klog_channel(void);
+
+/*
+ * Scheme-specific channel ops
+ */
+struct relay_ops
+{
+       char * (*reserve) (struct rchan *rchan,
+                          u32 slot_len,
+                          struct timeval *time_stamp,
+                          u32 *tsc,
+                          int * errcode,
+                          int * interrupting);
+       
+       void (*commit) (struct rchan *rchan,
+                       char *from,
+                       u32 len, 
+                       int deliver, 
+                       int interrupting);
+
+       u32 (*get_offset) (struct rchan *rchan,
+                          u32 *max_offset);
+       
+       void (*resume) (struct rchan *rchan);
+       void (*finalize) (struct rchan *rchan);
+       void (*reset) (struct rchan *rchan,
+                      int init);
+       int (*reset_index) (struct rchan *rchan,
+                           u32 old_index);
+};
+
+#endif /* _LINUX_RELAYFS_FS_H */
+
+
+
+
+
diff --git a/include/linux/taskdelays.h b/include/linux/taskdelays.h
new file mode 100644 (file)
index 0000000..698b23b
--- /dev/null
@@ -0,0 +1,20 @@
+#ifndef _LINUX_TASKDELAYS_H
+#define _LINUX_TASKDELAYS_H
+
+#include <linux/config.h>
+
+struct task_delay_info {
+#ifdef CONFIG_DELAY_ACCT
+        /* delay statistics in usecs */
+       unsigned long runs;
+       unsigned long waitcpu_total;
+       unsigned long runcpu_total;
+       unsigned long iowait_total;
+       unsigned long mem_iowait_total;
+       unsigned long num_iowaits;
+       unsigned long num_memwaits;
+#endif
+};
+
+#endif // _LINUX_TASKDELAYS_H
+
diff --git a/kernel/ckrm/Makefile b/kernel/ckrm/Makefile
new file mode 100644 (file)
index 0000000..58b9aad
--- /dev/null
@@ -0,0 +1,14 @@
+#
+# Makefile for CKRM 
+#
+
+ifeq ($(CONFIG_CKRM),y)
+       obj-y = ckrm.o ckrmutils.o 
+endif
+
+obj-$(CONFIG_CKRM_TYPE_TASKCLASS) += ckrm_tc.o 
+obj-$(CONFIG_CKRM_RES_NUMTASKS) += ckrm_tasks.o
+
+obj-$(CONFIG_CKRM_TYPE_SOCKETCLASS) += ckrm_sockc.o        
+obj-$(CONFIG_CKRM_RES_LISTENAQ) += ckrm_listenaq.o  
+
diff --git a/kernel/ckrm/ckrm.c b/kernel/ckrm/ckrm.c
new file mode 100644 (file)
index 0000000..43d14a8
--- /dev/null
@@ -0,0 +1,1009 @@
+/* ckrm.c - Class-based Kernel Resource Management (CKRM)
+ *
+ * Copyright (C) Hubertus Franke, IBM Corp. 2003, 2004
+ *           (C) Shailabh Nagar,  IBM Corp. 2003, 2004
+ *           (C) Chandra Seetharaman,  IBM Corp. 2003
+ *          (C) Vivek Kashyap, IBM Corp. 2004
+ * 
+ * 
+ * Provides kernel API of CKRM for in-kernel,per-resource controllers 
+ * (one each for cpu, memory, io, network) and callbacks for 
+ * classification modules.
+ *
+ * Latest version, more details at http://ckrm.sf.net
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+/* Changes
+ *
+ * 28 Aug 2003
+ *        Created.
+ * 06 Nov 2003
+ *        Made modifications to suit the new RBCE module.
+ * 10 Nov 2003
+ *        Fixed a bug in fork and exit callbacks. Added callbacks_active and
+ *        surrounding logic. Added task parameter for all CE callbacks.
+ * 23 Mar 2004
+ *        moved to referenced counted class objects and correct locking
+ * 19 Apr 2004
+ *        Integrated ckrm hooks, classtypes, ...
+ *  
+ */
+
+#include <linux/config.h>
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <asm/uaccess.h>
+#include <linux/mm.h>
+#include <asm/errno.h>
+#include <linux/string.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/ckrm_rc.h>
+#include <linux/rcfs.h>
+#include <net/sock.h>
+#include <linux/ip.h>
+
+
+rwlock_t ckrm_class_lock = RW_LOCK_UNLOCKED;  // protect classlists 
+
+struct rcfs_functions rcfs_fn ;
+EXPORT_SYMBOL(rcfs_fn);
+
+
+/**************************************************************************
+ *                   Helper Functions                                     *
+ **************************************************************************/
+
+/*
+ * Return TRUE if the given core class pointer is valid.
+ */
+
+/*
+ * Return TRUE if the given resource is registered.
+ */
+inline unsigned int
+is_res_regd(struct ckrm_classtype *clstype, int resid)
+{
+       return ( (resid>=0) && (resid < clstype->max_resid) &&
+                test_bit(resid, &clstype->bit_res_ctlrs)
+               );
+}
+
+struct ckrm_res_ctlr*
+ckrm_resctlr_lookup(struct ckrm_classtype *clstype, const char *resname)
+{
+       int resid = -1;
+       
+       for (resid=0; resid < clstype->max_resid; resid++) { 
+               if (test_bit(resid, &clstype->bit_res_ctlrs)) {
+                       struct ckrm_res_ctlr *rctrl = clstype->res_ctlrs[resid];
+                       if (!strncmp(resname, rctrl->res_name,CKRM_MAX_RES_NAME))
+                               return rctrl;
+               }
+       }
+       return NULL;
+}
+EXPORT_SYMBOL(ckrm_resctlr_lookup);
+
+/* given a classname return the class handle and its classtype*/
+void *
+ckrm_classobj(char *classname, int *classTypeID)
+{
+       int i;
+
+       *classTypeID = -1;
+       if (!classname || !*classname) {
+               return NULL;
+       }
+
+       read_lock(&ckrm_class_lock);
+       for ( i=0 ; i<CKRM_MAX_CLASSTYPES; i++) {
+               struct ckrm_classtype *ctype = ckrm_classtypes[i];
+               struct ckrm_core_class *core;
+
+               if (ctype == NULL) 
+                       continue;
+               list_for_each_entry(core, &ctype->classes, clslist) {
+                       if (core->name && !strcmp(core->name, classname)) {
+                               // FIXME: should grab a reference here..
+                               read_unlock(&ckrm_class_lock);
+                               *classTypeID = ctype->typeID;
+                               return core;
+                       }
+               }
+       }
+       read_unlock(&ckrm_class_lock);
+       return NULL;
+}
+
+EXPORT_SYMBOL(is_res_regd);
+EXPORT_SYMBOL(ckrm_classobj);
+
+/**************************************************************************
+ *                   Internal Functions/macros                            *
+ **************************************************************************/
+
+static inline void 
+set_callbacks_active(struct ckrm_classtype *ctype)
+{
+       ctype->ce_cb_active = ((atomic_read(&ctype->ce_nr_users) > 0) &&
+                              (ctype->ce_callbacks.always_callback || (ctype->num_classes > 1)));
+}
+
+int
+ckrm_validate_and_grab_core(struct ckrm_core_class *core)
+{
+       int rc = 0;
+       read_lock(&ckrm_class_lock);
+       if (likely(ckrm_is_core_valid(core))) {
+               ckrm_core_grab(core);
+               rc = 1;
+       }
+       read_unlock(&ckrm_class_lock);
+       return rc;
+}
+
+/****************************************************************************
+ *           Interfaces for classification engine                           *
+ ****************************************************************************/
+
+/*
+ * Registering a callback structure by the classification engine.
+ *
+ * Returns the typeID of the class on success, -errno on failure.
+ */
+int
+ckrm_register_engine(const char *typename, ckrm_eng_callback_t *ecbs)
+{
+       struct ckrm_classtype *ctype;
+
+       ctype = ckrm_find_classtype_by_name(typename);
+       if (ctype == NULL) 
+               return (-ENOENT);
+
+       ce_protect(ctype);
+       if (atomic_read(&ctype->ce_nr_users) != 1) {
+               // Some engine is active, deregister it first.
+               ce_release(ctype);
+               return (-EBUSY);
+       }
+       
+       /* we require that either classify and class_delete are set (due to object reference)
+        * or that notify is set (in case no real classification is supported, only notification).
+        * We also require that the function pointer be set the moment the mask is non-null.
+        */
+       if ( ! (((ecbs->classify) && (ecbs->class_delete)) || (ecbs->notify)) ||
+            (ecbs->c_interest && ecbs->classify == NULL) ||
+            (ecbs->n_interest && ecbs->notify == NULL) )
+       {
+               ce_release(ctype);
+               return (-EINVAL);
+       }
+       
+
+       /* Is any other engine registered for this classtype ? */
+       if (ctype->ce_regd) {
+               ce_release(ctype);
+               return (-EINVAL);
+       }
+       
+       ctype->ce_regd = 1;
+       ctype->ce_callbacks = *ecbs;
+       set_callbacks_active(ctype);
+       if (ctype->ce_callbacks.class_add) 
+               (*ctype->ce_callbacks.class_add)(ctype->default_class->name,ctype->default_class);
+       return ctype->typeID;
+}
+
+/*
+ * Unregistering a callback structure by the classification engine.
+ *
+ * Returns 0 on success, -errno on failure.
+ */
+int
+ckrm_unregister_engine(const char *typename)
+{
+       struct ckrm_classtype *ctype;
+
+       ctype = ckrm_find_classtype_by_name(typename);
+       if (ctype == NULL) 
+               return (-ENOENT);
+
+       ctype->ce_cb_active = 0; 
+
+       if (atomic_dec_and_test(&ctype->ce_nr_users) != 1) {
+               // Somebody is currently using the engine, cannot deregister.
+               atomic_inc(&ctype->ce_nr_users);
+               return (-EBUSY);
+       }
+
+       ctype->ce_regd = 0;
+       memset(&ctype->ce_callbacks, 0, sizeof(ckrm_eng_callback_t));
+       return 0;
+}
+
+/****************************************************************************
+ *           Interfaces to manipulate class (core or resource) hierarchies 
+ ****************************************************************************/
+
+/* 
+ */
+static void
+ckrm_add_child(struct ckrm_core_class *parent, struct ckrm_core_class *child)
+{
+       struct ckrm_hnode *cnode = &child->hnode;
+
+       if (!ckrm_is_core_valid(child)) {
+               printk(KERN_ERR "Invalid child %p given in ckrm_add_child\n", child);
+               return;
+       }
+       
+       class_lock(child);
+       INIT_LIST_HEAD(&cnode->children);
+       INIT_LIST_HEAD(&cnode->siblings);
+
+       if (parent) {
+               struct ckrm_hnode *pnode;
+
+               if (!ckrm_is_core_valid(parent)) {
+                       printk(KERN_ERR "Invalid parent %p given in ckrm_add_child\n",
+                                       parent);
+                       parent = NULL;
+               } else {
+                       pnode = &parent->hnode;
+                       write_lock(&parent->hnode_rwlock);
+                       list_add(&cnode->siblings, &pnode->children);
+                       write_unlock(&parent->hnode_rwlock);
+               }
+       }
+       cnode->parent = parent;
+       class_unlock(child);
+       return;
+}
+
+/* 
+ */
+static int
+ckrm_remove_child(struct ckrm_core_class *child)
+{
+       struct ckrm_hnode *cnode, *pnode;
+       struct ckrm_core_class *parent;
+
+       if (!ckrm_is_core_valid(child)) {
+               printk(KERN_ERR "Invalid child %p given in ckrm_remove_child\n", child);
+               return 0;
+       }
+
+       cnode = &child->hnode;
+       parent = cnode->parent;
+       if (!ckrm_is_core_valid(parent)) {
+               printk(KERN_ERR "Invalid parent %p in ckrm_remove_child\n", parent);
+               return 0;
+       }
+
+       pnode = &parent->hnode;
+
+       class_lock(child);
+       /* ensure that the node does not have children */
+       if (!list_empty(&cnode->children)) {
+               class_unlock(child);
+               return 0;
+       }
+       write_lock(&parent->hnode_rwlock);
+       list_del(&cnode->siblings);
+       write_unlock(&parent->hnode_rwlock);
+       cnode->parent = NULL;
+       class_unlock(child);
+       return 1;
+}
+
+void
+ckrm_lock_hier(struct ckrm_core_class *parent)
+{
+       if (ckrm_is_core_valid(parent)) {
+               read_lock(&parent->hnode_rwlock);
+       }
+}
+
+void 
+ckrm_unlock_hier(struct ckrm_core_class *parent)
+{
+       if (ckrm_is_core_valid(parent)) {
+               read_unlock(&parent->hnode_rwlock);
+       }
+}
+
+/*
+ * hnode_rwlock of the parent core class must be held in read mode.
+ * external callers should 've called ckrm_lock_hier before calling this
+ * function.
+ */
+#define hnode_2_core(ptr) ((ptr) ? container_of(ptr, struct ckrm_core_class, hnode) : NULL)
+
+struct ckrm_core_class *
+ckrm_get_next_child(struct ckrm_core_class *parent,
+                       struct ckrm_core_class *child)
+{
+       struct list_head *cnode;
+       struct ckrm_hnode *next_cnode;
+       struct ckrm_core_class *next_childcore;
+
+       if (!ckrm_is_core_valid(parent)) {
+               printk(KERN_ERR "Invalid parent %p in ckrm_get_next_child\n", parent);
+               return NULL;
+       }
+       if (list_empty(&parent->hnode.children)) {
+               return NULL;
+       }
+
+       if (child) {
+               if (!ckrm_is_core_valid(child)) {
+                       printk(KERN_ERR "Invalid child %p in ckrm_get_next_child\n", child);
+                       return NULL;
+               }
+               cnode = child->hnode.siblings.next;
+       } else {
+               cnode = parent->hnode.children.next;
+       }
+
+       if (cnode == &parent->hnode.children) { // back at the anchor
+               return NULL;
+       }
+
+       next_cnode = container_of(cnode, struct ckrm_hnode, siblings);
+       next_childcore = hnode_2_core(next_cnode);
+
+       if (!ckrm_is_core_valid(next_childcore)) {
+               printk(KERN_ERR "Invalid next child %p in ckrm_get_next_child\n",
+                               next_childcore);
+               return NULL;
+       }
+       return next_childcore;
+}
+
+EXPORT_SYMBOL(ckrm_lock_hier);
+EXPORT_SYMBOL(ckrm_unlock_hier);
+EXPORT_SYMBOL(ckrm_get_next_child);
+
+static void 
+ckrm_alloc_res_class(struct ckrm_core_class *core,
+                    struct ckrm_core_class *parent,
+                    int resid)
+{
+
+       struct ckrm_classtype *clstype;
+
+       /* 
+        * Allocate a resource class only if the resource controller has
+        * registered with core and the engine requests for the class.
+        */
+
+       if (!ckrm_is_core_valid(core))
+               return ; 
+
+       clstype = core->classtype;
+       core->res_class[resid] = NULL;
+
+       if (test_bit(resid, &clstype->bit_res_ctlrs)) {
+               ckrm_res_ctlr_t *rcbs;
+
+               atomic_inc(&clstype->nr_resusers[resid]);
+               rcbs = clstype->res_ctlrs[resid];
+               
+               if (rcbs && rcbs->res_alloc) {
+                       core->res_class[resid] =(*rcbs->res_alloc)(core,parent);
+                       if (core->res_class[resid])
+                               return;
+                       printk(KERN_ERR "Error creating res class\n");
+               }
+               atomic_dec(&clstype->nr_resusers[resid]);
+       }
+}
+
+/*
+ * Initialize a core class
+ *
+ */
+
+#define CLS_DEBUG(fmt, args...) do { /* printk("%s: " fmt, __FUNCTION__ , ## args); */ } while (0)
+
+
+/*
+ * Initialize core class @dcore of @clstype: set up magic/locks, link it
+ * into the classtype's class list under ckrm_class_lock, attach it under
+ * @parent in the hierarchy, and allocate one resource class per
+ * registered controller.  Returns 0 on success, -EINVAL if @parent is
+ * not a valid core (unless @dcore is the classtype's default class).
+ */
+int
+ckrm_init_core_class(struct ckrm_classtype  *clstype,
+                    struct ckrm_core_class *dcore,
+                    struct ckrm_core_class *parent,
+                    const char *name)
+{
+       // Hubertus   ... should replace name with dentry or add dentry ?
+       int i;
+
+       // Hubertus .. how is this used in initialization 
+
+       CLS_DEBUG("name %s => %p\n", name?name:"default",dcore);
+       
+       if ((dcore != clstype->default_class) && ( !ckrm_is_core_valid(parent))) {
+               printk("error not a valid parent %p\n", parent);
+               return -EINVAL;
+       }
+#if 0  // Hubertus .. dynamic allocation still breaks when RCs registers. See def in ckrm_rc.h
+       dcore->res_class = NULL;
+       if (clstype->max_resid > 0) {
+               dcore->res_class = (void**)kmalloc(clstype->max_resid * sizeof(void*) , GFP_KERNEL);
+               if (dcore->res_class == NULL) {
+                       printk("error no mem\n");
+                       return -ENOMEM;
+               }
+       }
+#endif
+
+       dcore->classtype    = clstype;
+       dcore->magic        = CKRM_CORE_MAGIC;
+       dcore->name         = name;
+       dcore->class_lock   = SPIN_LOCK_UNLOCKED;
+       dcore->hnode_rwlock = RW_LOCK_UNLOCKED;
+       dcore->delayed      = 0;
+
+       atomic_set(&dcore->refcnt, 0);
+       write_lock(&ckrm_class_lock);
+
+       INIT_LIST_HEAD(&dcore->objlist);
+       list_add(&dcore->clslist,&clstype->classes);
+
+       clstype->num_classes++;
+       set_callbacks_active(clstype);
+
+       write_unlock(&ckrm_class_lock);
+       ckrm_add_child(parent, dcore); 
+
+       // one resource class per controller known to the classtype
+       for (i = 0; i < clstype->max_resid; i++) 
+               ckrm_alloc_res_class(dcore,parent,i);
+
+       // fix for race condition seen in stress with numtasks
+       if (parent) 
+               ckrm_core_grab(parent);
+
+       ckrm_core_grab( dcore );
+       return 0;
+}
+
+
+/*
+ * Free the per-resource class object attached to @core for @resid.
+ * Only acts when a resource class was actually allocated, and guards
+ * against the controller having been unregistered in the meantime
+ * (res_ctlrs[resid] may be NULL) -- mirroring the check in the
+ * allocation path, which the old code omitted here.
+ */
+static void 
+ckrm_free_res_class(struct ckrm_core_class *core, int resid)
+{
+       if (core->res_class[resid]) {
+               ckrm_res_ctlr_t *rcbs;
+               struct ckrm_classtype *clstype = core->classtype;
+
+               atomic_inc(&clstype->nr_resusers[resid]);
+               rcbs = clstype->res_ctlrs[resid];
+
+               /* rcbs is NULL if the controller already unregistered */
+               if (rcbs && rcbs->res_free) {
+                       (*rcbs->res_free)(core->res_class[resid]);
+                       atomic_dec(&clstype->nr_resusers[resid]); // for inc in alloc
+                       core->res_class[resid] = NULL;  
+               }
+               atomic_dec(&clstype->nr_resusers[resid]);
+       }
+}
+
+
+/*
+ * Free a core class.
+ *   Requires that all tasks were previously reassigned to another class.
+ *
+ * Does not return a value (the function is void).
+ */
+
+void
+ckrm_free_core_class(struct ckrm_core_class *core)
+{
+       int i;
+       struct ckrm_classtype *clstype = core->classtype;
+       struct ckrm_core_class *parent = core->hnode.parent;
+       
+       CLS_DEBUG("core=%p:%s parent=%p:%s\n",core,core->name,parent,parent->name);
+       if (core->delayed) {
+               /* this core was marked as late */
+               printk("class <%s> finally deleted %lu\n",core->name,jiffies);
+       }
+       if (ckrm_remove_child(core) == 0) {
+               printk("Core class removal failed. Chilren present\n");
+       }
+
+       for (i = 0; i < clstype->max_resid; i++) {
+               ckrm_free_res_class(core,i);
+       }
+
+       write_lock(&ckrm_class_lock);
+
+       // Clear the magic, so we would know if this core is reused.
+       core->magic = 0;
+#if 0 // Dynamic not yet enabled
+       core->res_class = NULL;
+#endif
+       // Remove this core class from its linked list.
+       list_del(&core->clslist);
+       clstype->num_classes--;
+       set_callbacks_active(clstype);
+       write_unlock(&ckrm_class_lock);
+
+       // fix for race condition seen in stress with numtasks
+       if (parent) 
+               ckrm_core_drop(parent);
+       kfree(core);
+}
+
+/*
+ * Drop the caller's reference on @core; the class is actually freed
+ * once its refcount reaches zero.  The classtype's default class is
+ * never released.  Returns 0 on success, -EINVAL for an invalid core.
+ */
+int
+ckrm_release_core_class(struct ckrm_core_class *core)
+{
+       if (!ckrm_is_core_valid(core)) {
+               // Invalid core
+               return (-EINVAL);
+       }
+
+       if (core == core->classtype->default_class)
+               return 0;
+
+       /* need to make sure that the class got really dropped */
+       if (atomic_read(&core->refcnt) != 1) {
+               CLS_DEBUG("class <%s> deletion delayed refcnt=%d jif=%ld\n",
+                         core->name,atomic_read(&core->refcnt),jiffies);
+               core->delayed = 1;  /* just so we have a ref point */
+       }
+       ckrm_core_drop(core);
+       return 0;
+}
+
+/****************************************************************************
+ *           Interfaces for the resource controller                         *
+ ****************************************************************************/
+/*
+ * Registering a callback structure by the resource controller.
+ *
+ * Returns the resource id (0 or +ve) on success, -errno for failure.
+ */
+static int
+ckrm_register_res_ctlr_intern(struct ckrm_classtype *clstype, ckrm_res_ctlr_t *rcbs)
+{
+       int  resid, ret, i;
+       
+       if (!rcbs)
+               return -EINVAL;
+
+       resid = rcbs->resid;
+       
+       spin_lock(&clstype->res_ctlrs_lock);
+       
+       /* Print only what is known to be safe: resid may be -1 (dynamic
+        * assignment) and res_ctlrs[resid] may be NULL, so the old code
+        * dereferenced an invalid slot here. */
+       printk(KERN_WARNING "resid is %d name is %s\n",
+              resid, rcbs->res_name);
+
+       if (resid >= 0) {
+               /* controller asked for a fixed slot */
+               if ((resid < CKRM_MAX_RES_CTLRS) && (clstype->res_ctlrs[resid] == NULL)) {
+                       clstype->res_ctlrs[resid] = rcbs;
+                       atomic_set(&clstype->nr_resusers[resid], 0);
+                       set_bit(resid, &clstype->bit_res_ctlrs);        
+                       ret = resid;
+                       if (resid >= clstype->max_resid) {
+                               clstype->max_resid = resid + 1;
+                       }
+               } else {
+                       ret = -EBUSY;
+               }
+               spin_unlock(&clstype->res_ctlrs_lock);
+               return ret;
+       }
+
+       /* dynamic assignment: first free slot above the reserved range */
+       for (i = clstype->resid_reserved; i < clstype->max_res_ctlrs; i++) {
+               if (clstype->res_ctlrs[i] == NULL) {
+                       clstype->res_ctlrs[i] = rcbs;
+                       rcbs->resid = i;
+                       atomic_set(&clstype->nr_resusers[i], 0);
+                       set_bit(i, &clstype->bit_res_ctlrs);    
+                       if (i >= clstype->max_resid) {
+                               clstype->max_resid = i + 1;
+                       }
+                       spin_unlock(&clstype->res_ctlrs_lock);
+                       return i;
+               }
+       }
+       
+       spin_unlock(&clstype->res_ctlrs_lock);
+       return (-ENOMEM);
+}
+
+/*
+ * Public registration entry point: register @rcbs with @clstype and, on
+ * success, allocate the per-class resource object for every class that
+ * already exists in the classtype.  Returns the resource id (>= 0) or
+ * a negative errno from the internal registration.
+ */
+int
+ckrm_register_res_ctlr(struct ckrm_classtype *clstype, ckrm_res_ctlr_t *rcbs)
+{
+       struct ckrm_core_class *core;
+       int resid;
+       
+       resid = ckrm_register_res_ctlr_intern(clstype,rcbs);
+       
+       if (resid >= 0) {
+               /* run through all classes and create the resource class object and
+                * if necessary "initialize" class in context of this resource 
+                */
+               read_lock(&ckrm_class_lock);
+               list_for_each_entry(core, &clstype->classes, clslist) {
+                       printk("CKRM .. create res clsobj for resouce <%s> class <%s> par=%p\n", 
+                              rcbs->res_name, core->name, core->hnode.parent);
+                       ckrm_alloc_res_class(core, core->hnode.parent, resid);
+                       if (clstype->add_resctrl)  // FIXME: this should be mandatory
+                               (*clstype->add_resctrl)(core,resid);
+               }
+               read_unlock(&ckrm_class_lock);
+       }
+       return resid;
+}
+
+/*
+ * Unregistering a callback structure by the resource controller.
+ *
+ * Returns 0 on success -errno for failure.
+ */
+/*
+ * Remove @rcbs from its classtype.  Fails with -EBUSY while any user
+ * still holds the controller (nr_resusers non-zero).
+ * NOTE(review): the nr_resusers check happens before res_ctlrs_lock is
+ * taken, so a new user could appear in between -- confirm callers cope.
+ */
+int
+ckrm_unregister_res_ctlr(struct ckrm_res_ctlr *rcbs)
+{      
+       struct ckrm_classtype *clstype = rcbs->classtype;
+       int resid = rcbs->resid;
+
+       if ((clstype == NULL) || (resid < 0))
+               return -EINVAL;
+       
+       if (atomic_read(&clstype->nr_resusers[resid]))
+               return -EBUSY;
+       
+       // FIXME: probably need to also call deregistration function
+
+       spin_lock(&clstype->res_ctlrs_lock);
+       clstype->res_ctlrs[resid] = NULL;
+       clear_bit(resid, &clstype->bit_res_ctlrs);      
+       // recompute highest id still in use from the controller bitmap
+       clstype->max_resid = fls(clstype->bit_res_ctlrs);
+       rcbs->resid = -1;
+       spin_unlock(&clstype->res_ctlrs_lock);
+       
+       return 0;
+}
+
+/*******************************************************************
+ *   Class Type Registration
+ *******************************************************************/
+
+/* Hubertus ... we got to do some locking here */
+
+struct ckrm_classtype* ckrm_classtypes[CKRM_MAX_CLASSTYPES];
+EXPORT_SYMBOL(ckrm_classtypes);     // really should build a better interface for this
+
+/*
+ * Register @clstype, either at its requested slot (typeID != -1) or at
+ * the first free dynamic slot above the reserved range.  Returns the
+ * assigned type id, -EINVAL for a bad or occupied fixed slot, or
+ * -EBUSY when no dynamic slot is free.
+ */
+int
+ckrm_register_classtype(struct ckrm_classtype *clstype)
+{
+       int tid = clstype->typeID;
+
+       if (tid != -1) {
+               /* use >= : tid == CKRM_MAX_CLASSTYPES would index one
+                * past the end of ckrm_classtypes[] (old code used >) */
+               if ((tid < 0) || (tid >= CKRM_MAX_CLASSTYPES) || (ckrm_classtypes[tid]))
+                       return -EINVAL;
+       } else {
+               int i;
+               for ( i=CKRM_RESV_CLASSTYPES ; i<CKRM_MAX_CLASSTYPES; i++) {
+                       if (ckrm_classtypes[i] == NULL) {
+                               tid = i;
+                               break;
+                       }
+               }
+       }
+       if (tid == -1) 
+               return -EBUSY;
+       clstype->typeID = tid;
+       ckrm_classtypes[tid] = clstype;
+       
+       /* Hubertus .. we need to call the callbacks of the RCFS client */
+       if (rcfs_fn.register_classtype) {
+               (* rcfs_fn.register_classtype)(clstype);
+               // No error return for now ;
+       }
+
+       return tid;
+}
+
+/*
+ * Unregister @clstype: clear its slot in ckrm_classtypes[] and notify
+ * the RCFS client.  Returns 0 on success, -EINVAL when the type id is
+ * out of range or the slot does not hold @clstype.
+ */
+int
+ckrm_unregister_classtype(struct ckrm_classtype *clstype)
+{
+       int tid = clstype->typeID;
+
+       /* use >= : tid == CKRM_MAX_CLASSTYPES is out of bounds (old code
+        * used > and would read past the end of the array) */
+       if ((tid < 0) || (tid >= CKRM_MAX_CLASSTYPES) || (ckrm_classtypes[tid] != clstype))
+               return -EINVAL;
+
+       if (rcfs_fn.deregister_classtype) {
+               (* rcfs_fn.deregister_classtype)(clstype);
+               // No error return for now
+       }
+
+       ckrm_classtypes[tid] = NULL;
+       clstype->typeID = -1;
+       return 0;
+}
+
+/* Look up a registered classtype by name; returns NULL if no match. */
+struct ckrm_classtype*
+ckrm_find_classtype_by_name(const char *name)
+{
+       int idx;
+
+       for (idx = 0; idx < CKRM_MAX_CLASSTYPES; idx++) {
+               struct ckrm_classtype *candidate = ckrm_classtypes[idx];
+
+               if (candidate == NULL)
+                       continue;
+               if (strncmp(candidate->name, name, CKRM_MAX_TYPENAME_LEN) == 0)
+                       return candidate;
+       }
+       return NULL;
+}
+
+
+/*******************************************************************
+ *   Event callback invocation
+ *******************************************************************/
+
+struct ckrm_hook_cb* ckrm_event_callbacks[CKRM_NONLATCHABLE_EVENTS];
+
+/* Registration / Deregistration / Invocation functions */
+
+/*
+ * Append @cb to the callback chain of event @ev.
+ * Returns 0 on success, 1 when @ev is outside the registrable range
+ * (note: the unregister counterpart returns -1 for the same condition).
+ * NOTE(review): the chain is updated without locking; see the locking
+ * TODO earlier in this file -- confirm callers serialize registration.
+ */
+int
+ckrm_register_event_cb(enum ckrm_event ev, struct ckrm_hook_cb *cb)
+{
+       struct ckrm_hook_cb **cbptr;
+
+       if ((ev < CKRM_LATCHABLE_EVENTS) || (ev >= CKRM_NONLATCHABLE_EVENTS))
+               return 1;
+       cbptr = &ckrm_event_callbacks[ev];
+       // walk to the tail of the chain and link the new callback there
+       while (*cbptr != NULL) 
+               cbptr = &((*cbptr)->next);
+       *cbptr = cb;
+       return 0;
+}
+
+/*
+ * Remove @cb from the callback chain of event @ev.
+ * Returns 0 when @cb was found and unlinked, 1 when it was not on the
+ * chain, -1 for an event outside the registrable range.
+ */
+int
+ckrm_unregister_event_cb(enum ckrm_event ev, struct ckrm_hook_cb *cb)
+{
+       struct ckrm_hook_cb **cbptr;
+
+       if ((ev < CKRM_LATCHABLE_EVENTS) || (ev >= CKRM_NONLATCHABLE_EVENTS))
+               return -1;
+       cbptr = &ckrm_event_callbacks[ev];
+       while ((*cbptr != NULL) && (*cbptr != cb))
+               cbptr = &((*cbptr)->next);
+       if (*cbptr == NULL)
+               return 1;       /* not found */
+       /* unlink cb: rewrite the predecessor's link.  The old code did
+        * (*cbptr)->next = cb->next, i.e. cb->next = cb->next -- a no-op
+        * that left cb on the chain. */
+       *cbptr = cb->next;
+       return 0;
+}
+
+/* Register the callback of every entry in a -1 terminated spec array.
+ * Always returns 0. */
+int
+ckrm_register_event_set(struct ckrm_event_spec especs[])
+{
+       struct ckrm_event_spec *cur;
+
+       for (cur = especs; cur->ev != -1; ++cur)
+               ckrm_register_event_cb(cur->ev, &cur->cb);
+       return 0;
+}
+
+/* Unregister the callback of every entry in a -1 terminated spec array.
+ * Always returns 0. */
+int
+ckrm_unregister_event_set(struct ckrm_event_spec especs[])
+{
+       struct ckrm_event_spec *cur;
+
+       for (cur = especs; cur->ev != -1; ++cur)
+               ckrm_unregister_event_cb(cur->ev, &cur->cb);
+       return 0;
+}
+
+#define ECC_PRINTK(fmt, args...) // printk("%s: " fmt, __FUNCTION__ , ## args)
+
+/*
+ * Invoke every callback registered for event @ev, passing @arg through.
+ */
+void
+ckrm_invoke_event_cb_chain(enum ckrm_event ev, void *arg)
+{
+       struct ckrm_hook_cb *cb, *anchor;
+
+       /* format matches the arguments now: the old call passed three
+        * arguments for two specifiers and printed a pointer with %d */
+       ECC_PRINTK("%p %d %p\n", current, ev, arg);
+       if ((anchor = ckrm_event_callbacks[ev]) != NULL) {
+               for ( cb = anchor ; cb ; cb = cb->next ) 
+                       (*cb->fct)(arg);
+       }
+}
+
+/*******************************************************************
+ *   Generic Functions that can be used as default functions 
+ *   in almost all classtypes
+ *     (a) function iterator over all resource classes of a class
+ *     (b) function invoker on a named resource
+ *******************************************************************/
+
+/*
+ * seq_file helper: print the guarantee/limit share values of every
+ * registered controller of @core.  nr_resusers pins each controller
+ * while its callback runs; `shares` is only read after the controller's
+ * get_share_values() filled it in.  Always returns 0.
+ */
+int                      
+ckrm_class_show_shares(struct ckrm_core_class *core, struct seq_file *seq)
+{
+       int i;
+       struct ckrm_res_ctlr *rcbs;
+       struct ckrm_classtype *clstype = core->classtype;
+       struct ckrm_shares shares;
+
+       for (i = 0; i < clstype->max_resid; i++) {
+               atomic_inc(&clstype->nr_resusers[i]);
+               rcbs = clstype->res_ctlrs[i];
+               if (rcbs && rcbs->get_share_values) {
+                       (*rcbs->get_share_values)(core->res_class[i], &shares);
+                       seq_printf(seq,"res=%s,guarantee=%d,limit=%d,total_guarantee=%d,max_limit=%d\n",
+                                  rcbs->res_name,
+                                  shares.my_guarantee,
+                                  shares.my_limit,
+                                  shares.total_guarantee,
+                                  shares.max_limit);
+               }
+               atomic_dec(&clstype->nr_resusers[i]);
+       }
+       return 0;
+}
+
+/* Dump the statistics of every registered controller of @core into
+ * @seq; nr_resusers pins each controller while its callback runs.
+ * Always returns 0. */
+int                      
+ckrm_class_show_stats(struct ckrm_core_class *core, struct seq_file *seq)
+{
+       struct ckrm_classtype *type = core->classtype;
+       int idx;
+
+       for (idx = 0; idx < type->max_resid; idx++) {
+               struct ckrm_res_ctlr *ctlr;
+
+               atomic_inc(&type->nr_resusers[idx]);
+               ctlr = type->res_ctlrs[idx];
+               if (ctlr != NULL && ctlr->get_stats != NULL)
+                       ctlr->get_stats(core->res_class[idx], seq);
+               atomic_dec(&type->nr_resusers[idx]);
+       }
+       return 0;
+}
+
+/* Dump the configuration of every registered controller of @core into
+ * @seq; nr_resusers pins each controller while its callback runs.
+ * Always returns 0. */
+int                      
+ckrm_class_show_config(struct ckrm_core_class *core, struct seq_file *seq)
+{
+       struct ckrm_classtype *type = core->classtype;
+       int idx;
+
+       for (idx = 0; idx < type->max_resid; idx++) {
+               struct ckrm_res_ctlr *ctlr;
+
+               atomic_inc(&type->nr_resusers[idx]);
+               ctlr = type->res_ctlrs[idx];
+               if (ctlr != NULL && ctlr->show_config != NULL)
+                       ctlr->show_config(core->res_class[idx], seq);
+               atomic_dec(&type->nr_resusers[idx]);
+       }
+       return 0;
+}
+
+/*
+ * Forward configuration string @cfgstr to the controller named
+ * @resname.  Returns the controller's result, or -EINVAL when the
+ * controller is unknown or has no set_config callback.
+ */
+int
+ckrm_class_set_config(struct ckrm_core_class *core, const char *resname, const char *cfgstr)
+{
+       struct ckrm_classtype *clstype = core->classtype;
+       struct ckrm_res_ctlr *rcbs = ckrm_resctlr_lookup(clstype,resname);
+       int rc;
+
+       if (rcbs == NULL || rcbs->set_config == NULL)
+               return -EINVAL; 
+       rc = (*rcbs->set_config)(core->res_class[rcbs->resid],cfgstr);
+       return rc;
+}
+
+/*
+ * Forward new share values to the controller named @resname.  Returns
+ * the controller's result, or -EINVAL when the controller is unknown
+ * or has no set_share_values callback.
+ */
+int
+ckrm_class_set_shares(struct ckrm_core_class *core, const char *resname,
+                     struct ckrm_shares *shares)
+{
+       struct ckrm_classtype *clstype = core->classtype;
+       struct ckrm_res_ctlr *rcbs;
+       int rc;
+
+       printk("ckrm_class_set_shares(%s,%s)\n",core->name,resname);
+       rcbs = ckrm_resctlr_lookup(clstype,resname);
+       if (rcbs == NULL || rcbs->set_share_values == NULL)
+               return -EINVAL; 
+       rc = (*rcbs->set_share_values)(core->res_class[rcbs->resid],shares);
+       return rc;
+}
+
+/*
+ * Ask the controller named @resname to reset its statistics for @core.
+ * @unused is ignored.  Returns the controller's result, or -EINVAL when
+ * the controller is unknown or has no reset_stats callback.
+ */
+int 
+ckrm_class_reset_stats(struct ckrm_core_class *core, const char *resname, const char *unused)
+{
+       struct ckrm_classtype *clstype = core->classtype;
+       struct ckrm_res_ctlr *rcbs = ckrm_resctlr_lookup(clstype,resname);
+       int rc;
+
+       if (rcbs == NULL || rcbs->reset_stats == NULL)
+               return -EINVAL; 
+       rc = (*rcbs->reset_stats)(core->res_class[rcbs->resid]);
+       return rc;
+}      
+
+/*******************************************************************
+ *   Initialization 
+ *******************************************************************/
+
+/*
+ * Fork-time hook: reset the task's CE data and per-task lock, then let
+ * registered engines classify the new task via the NEWTASK event chain.
+ */
+void
+ckrm_cb_newtask(struct task_struct *tsk)
+{
+       tsk->ce_data   = NULL;
+       spin_lock_init(&tsk->ckrm_tsklock);
+       ckrm_invoke_event_cb_chain(CKRM_EVENT_NEWTASK,tsk);
+}
+
+/* Exit-time hook: notify the EXIT event chain, then clear CE data. */
+void 
+ckrm_cb_exit(struct task_struct *tsk)
+{
+       ckrm_invoke_event_cb_chain(CKRM_EVENT_EXIT,tsk);
+       tsk->ce_data = NULL;
+}
+
+/*
+ * Boot-time CKRM initialization: register the configured classtype
+ * metatypes, then prime init_task so that all subsequently forked
+ * tasks inherit valid CKRM state.
+ */
+void __init
+ckrm_init(void) 
+{
+       printk("CKRM Initialization\n");
+       
+       // register/initialize the Metatypes
+       
+#ifdef CONFIG_CKRM_TYPE_TASKCLASS
+       { 
+               extern void ckrm_meta_init_taskclass(void);
+               ckrm_meta_init_taskclass();
+       }
+#endif
+#ifdef CONFIG_CKRM_TYPE_SOCKETCLASS
+       { 
+               extern void ckrm_meta_init_sockclass(void);
+               ckrm_meta_init_sockclass();
+       }
+#endif
+       // prepare init_task and then rely on inheritance of properties
+       ckrm_cb_newtask(&init_task);
+       printk("CKRM Initialization done\n");
+}
+
+EXPORT_SYMBOL(ckrm_register_engine);
+EXPORT_SYMBOL(ckrm_unregister_engine);
+
+EXPORT_SYMBOL(ckrm_register_res_ctlr);
+EXPORT_SYMBOL(ckrm_unregister_res_ctlr);
+
+EXPORT_SYMBOL(ckrm_init_core_class);
+EXPORT_SYMBOL(ckrm_free_core_class);
+EXPORT_SYMBOL(ckrm_release_core_class);
+
+EXPORT_SYMBOL(ckrm_register_classtype);
+EXPORT_SYMBOL(ckrm_unregister_classtype);
+EXPORT_SYMBOL(ckrm_find_classtype_by_name);
+
+EXPORT_SYMBOL(ckrm_core_grab);
+EXPORT_SYMBOL(ckrm_core_drop);
+EXPORT_SYMBOL(ckrm_is_core_valid);
+EXPORT_SYMBOL(ckrm_validate_and_grab_core);
+
+EXPORT_SYMBOL(ckrm_register_event_set);
+EXPORT_SYMBOL(ckrm_unregister_event_set);
+EXPORT_SYMBOL(ckrm_register_event_cb);
+EXPORT_SYMBOL(ckrm_unregister_event_cb);
+
+EXPORT_SYMBOL(ckrm_class_show_stats);
+EXPORT_SYMBOL(ckrm_class_show_config);
+EXPORT_SYMBOL(ckrm_class_show_shares);
+
+EXPORT_SYMBOL(ckrm_class_set_config);
+EXPORT_SYMBOL(ckrm_class_set_shares);
+
+EXPORT_SYMBOL(ckrm_class_reset_stats);
+
+
diff --git a/kernel/ckrm/ckrm_listenaq.c b/kernel/ckrm/ckrm_listenaq.c
new file mode 100644 (file)
index 0000000..235ac06
--- /dev/null
@@ -0,0 +1,503 @@
+/* ckrm_listenaq.c - accept queue resource controller
+ *
+ * Copyright (C) Vivek Kashyap,      IBM Corp. 2004
+ * 
+ * Latest version, more details at http://ckrm.sf.net
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+/* Changes
+ * Initial version
+ */
+
+/* Code Description: TBD
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <asm/errno.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/ckrm.h>
+#include <linux/ckrm_rc.h>
+#include <net/tcp.h>
+
+#include <linux/ckrm_net.h>
+
+#define hnode_2_core(ptr) \
+                ((ptr) ? container_of(ptr, struct ckrm_core_class, hnode) : NULL)
+
+
+#define CKRM_SAQ_MAX_DEPTH     3 // 0 => /rcfs
+                                 // 1 => socket_aq
+                                 // 2 => socket_aq/listen_class
+                                 // 3 => socket_aq/listen_class/accept_queues
+                                 // 4 => Not allowed
+
+/* Per-core-class state of the listen accept-queue ("laq") controller. */
+typedef struct ckrm_laq_res {
+       spinlock_t              reslock;        // protects shares below
+       atomic_t                refcnt;         // see laq_res_hold/laq_res_put
+       struct ckrm_shares      shares;
+       struct ckrm_core_class *core;           // class this state belongs to
+       struct ckrm_core_class *pcore;          // parent core class
+       int                     my_depth;       // depth in the rcfs hierarchy
+       int                     my_id;          // accept queue index (depth 3)
+} ckrm_laq_res_t;
+
+static int my_resid = -1;
+
+extern struct ckrm_core_class *rcfs_create_under_netroot(char *, int, int);
+extern struct ckrm_core_class *rcfs_make_core(struct dentry *, 
+                                               struct ckrm_core_class * ) ;
+
+/* Take a reference on @res. */
+void
+laq_res_hold(struct ckrm_laq_res *res)
+{
+       atomic_inc(&res->refcnt);
+}
+
+/* Drop a reference on @res; frees it when the last reference goes. */
+void
+laq_res_put(struct ckrm_laq_res *res)
+{
+       if (!atomic_dec_and_test(&res->refcnt))
+               return;
+       kfree(res);
+}
+
+/* Initialize rescls values
+ */
+/* Set the default CKRM share values on a freshly allocated laq class. */
+static void
+laq_res_initcls(void *my_res)
+{
+       ckrm_laq_res_t *res = my_res;
+
+       res->shares.my_guarantee     = CKRM_SHARE_DONTCARE;
+       res->shares.my_limit         = CKRM_SHARE_DONTCARE;
+       res->shares.total_guarantee  = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
+       res->shares.max_limit        = CKRM_SHARE_DFLT_MAX_LIMIT;
+       res->shares.unused_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
+       res->shares.cur_max_limit    = 0;
+}
+
+/* Minimal decimal string-to-int conversion.  Stops at the first
+ * non-digit instead of folding arbitrary characters into the result
+ * (the old loop ran to the NUL and corrupted the value on any
+ * non-digit input).  No overflow or sign handling. */
+static int 
+atoi(char *s)
+{
+       int k = 0;
+       while (*s >= '0' && *s <= '9')
+               k = k * 10 + (*s++ - '0');
+       return k;
+}
+
+/* Return the last path component of the class name ("a/b/c" -> "c").
+ * For a name without '/' the whole name is returned; the old code
+ * returned name+1 in that case, silently dropping the first char. */
+static char *
+laq_get_name(struct ckrm_core_class *c)
+{
+        char *p = (char *)c->name;
+
+        while (*p)
+                p++;
+        while (p != c->name && *(p - 1) != '/')
+                p--;
+        return p;
+}
+
+/*
+ * res_alloc callback: allocate per-class laq state.  Depth is derived
+ * from the parent (the default class sits at depth 1); accept queues
+ * (depth 3) derive their queue id from the last component of the class
+ * name.  Returns NULL when a non-default class lacks parent state or
+ * on allocation failure.
+ */
+static void *
+laq_res_alloc(struct ckrm_core_class *core, struct ckrm_core_class *parent)
+{
+       ckrm_laq_res_t *res, *pres;
+       int pdepth;
+
+       if (parent)
+               pres = ckrm_get_res_class(parent, my_resid, ckrm_laq_res_t);
+       else
+               pres = NULL;
+
+       if (core == core->classtype->default_class)    
+               pdepth = 1;
+       else {
+               /* a non-default class needs parent laq state to compute
+                * its depth (old code dereferenced pres unchecked) */
+               if (!parent || !pres)
+                       return NULL;
+               pdepth = 1 + pres->my_depth;
+       }
+
+       res = kmalloc(sizeof(ckrm_laq_res_t), GFP_ATOMIC);
+       if (res) {
+               /* zero the whole structure, not just a pointer's worth:
+                * the old code used sizeof(res) instead of sizeof(*res) */
+               memset(res, 0, sizeof(*res));
+               spin_lock_init(&res->reslock);
+               laq_res_hold(res);
+               res->my_depth  = pdepth;
+               if (pdepth == 2)        // listen class
+                       res->my_id = 0;
+               else if (pdepth == 3)
+                       res->my_id = atoi(laq_get_name(core));
+               res->core = core;
+               res->pcore = parent;
+
+               // rescls in place, now initialize contents other than 
+               // hierarchy pointers
+               laq_res_initcls(res); // acts as initialising value
+       }
+
+       return res;
+}
+
+/*
+ * res_free callback: release a laq resource class.  For an accept
+ * queue (depth 3) its guarantee is first returned to the parent
+ * (listen class) under both reslocks; shallower nodes are simply freed.
+ */
+static void
+laq_res_free(void *my_res)
+{
+       ckrm_laq_res_t *res = (ckrm_laq_res_t *)my_res;
+       ckrm_laq_res_t *parent;
+
+       if (!res) 
+               return;
+
+       if (res->my_depth != 3) {
+               kfree(res);
+               return;
+       }
+
+       parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t);
+       if (!parent)    // Should never happen
+               return;
+
+       // lock order: parent first, then child
+       spin_lock(&parent->reslock);
+       spin_lock(&res->reslock);
+
+       // return child's guarantee to parent node
+       // Limits have no meaning for accept queue control
+       child_guarantee_changed(&parent->shares, res->shares.my_guarantee, 0);
+
+       spin_unlock(&res->reslock);
+       laq_res_put(res);       
+       spin_unlock(&parent->reslock);
+       return;
+}
+
+/**************************************************************************
+ *                     SHARES                                          ***
+ **************************************************************************/
+
+/*
+ * Push the configured share ratios of @my_res (and, when @updatep is
+ * set, of the parent's default queue 0) into the tcp accept queues of
+ * all sockets attached to the parent's core class.
+ * NOTE(review): assumes parent->shares.unused_guarantee and
+ * parent->shares.my_guarantee are non-zero -- a zero value here would
+ * divide by zero; confirm against the share setters.
+ */
+void
+laq_set_aq_values(ckrm_laq_res_t *my_res, ckrm_laq_res_t *parent, int updatep)
+{
+
+       struct ckrm_net_struct *ns;
+       struct ckrm_core_class *core = parent->core;
+       struct tcp_opt *tp;
+       
+       if (my_res->my_depth < 2) 
+               return;
+       
+       // XXX Instead of holding a  class_lock introduce a rw
+       // lock to be write locked by listen callbacks and read locked here.
+       // - VK
+       class_lock(core);
+       list_for_each_entry(ns, &core->objlist,ckrm_link) { 
+               tp = tcp_sk(ns->ns_sk);
+               if (updatep)
+                       tp->acceptq[0].aq_ratio =
+                              parent->shares.total_guarantee/
+                               parent->shares.unused_guarantee;               
+
+               tp->acceptq[my_res->my_id].aq_ratio =
+                      my_res->shares.total_guarantee/
+                       parent->shares.my_guarantee;           
+       }
+       class_unlock(core);
+       return;
+}
+
+/*
+ * set_share_values callback.  Behavior depends on the node's depth in
+ * the rcfs hierarchy: depth 2 (listen class) applies the shares and
+ * propagates ratios to all accept-queue children; depth 3 (accept
+ * queue) validates the shares against the parent.  Limit values are
+ * ignored for accept-queue control.  Returns 0 or a negative errno.
+ */
+static int
+laq_set_share_values(void *my_res, struct ckrm_shares *shares)
+{
+       ckrm_laq_res_t *res = my_res;
+       ckrm_laq_res_t *parent, *child;
+       struct ckrm_hnode *chnode; 
+       int rc = 0;
+
+       if (!res) 
+               return -EINVAL;
+
+       if (!res->pcore) { 
+               // something is badly wrong
+               printk(KERN_ERR "socketaq internal inconsistency\n");
+               return -EBADF;
+       }
+
+       parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t);
+       if (!parent)    // socket_class does not have a share interface
+               return -EINVAL;
+
+       // Ensure that we ignore limit values
+       shares->my_limit = shares->max_limit = CKRM_SHARE_UNCHANGED;
+
+       switch (res->my_depth) {
+
+       case 0: printk(KERN_ERR "socketaq bad entry\n");
+               rc = -EBADF;
+               break;
+
+       case 1: // can't be written to. this is internal default.
+               // return -EINVAL
+               rc = -EINVAL;
+               break;
+
+       case 2: // nothing to inherit
+               if (!shares->total_guarantee) {
+                       rc = -EINVAL;
+                       break;
+               }
+
+               ckrm_lock_hier(res->pcore);
+               spin_lock(&res->reslock);
+               rc = set_shares(shares, &res->shares, NULL);
+               if (!rc) {
+                       // push new ratios down to every accept-queue child
+                       list_for_each_entry(chnode,
+                                       &res->core->hnode.children,siblings){
+                               child=hnode_2_core(chnode)->res_class[my_resid];
+                               laq_set_aq_values(child,res,(child->my_id==1));
+                       }
+               }
+               spin_unlock(&res->reslock);
+               ckrm_unlock_hier(res->pcore);
+               break;
+
+       case 3: // accept queue itself. Check against parent.
+               ckrm_lock_hier(parent->pcore);
+               spin_lock(&parent->reslock);
+               rc = set_shares(shares, &res->shares, &parent->shares);
+               if (!rc) {
+                       laq_set_aq_values(res,parent,1);
+               }
+               spin_unlock(&parent->reslock);
+               ckrm_unlock_hier(parent->pcore);
+               break;
+       }
+
+       return rc;
+}
+
+/* get_share_values callback: copy the current share settings of
+ * @my_res into @shares.  Returns 0, or -EINVAL for a NULL resource. */
+static int
+laq_get_share_values(void *my_res, struct ckrm_shares *shares)
+{
+       ckrm_laq_res_t *laq = my_res;
+
+       if (laq == NULL)
+               return -EINVAL;
+       memcpy(shares, &laq->shares, sizeof(*shares));
+       return 0;
+}
+
+/**************************************************************************
+ *                     STATS                                           ***
+ **************************************************************************/
+
+/*
+ * Print one accept-queue stats entry into @sfile.  When called for
+ * queue 0 it additionally folds all other queues of @taq into entry 0
+ * and prints a totals line -- note this mutates taq[0] in place.
+ */
+void
+laq_print_aq_stats(struct seq_file *sfile, struct tcp_acceptq_info *taq, int i)
+{
+       seq_printf(sfile, "Class %d connections:\n\taccepted: %u\n\t"
+                         "queued: %u\n\twait_time: %lu\n\t",
+                         i, taq->acceptq_count, taq->acceptq_qcount,
+                         taq->acceptq_wait_time);
+
+       if (i)
+               return;
+
+       // queue 0: accumulate every other queue into the first entry
+       for (i = 1; i < NUM_ACCEPT_QUEUES; i++) {
+               taq[0].acceptq_wait_time += taq[i].acceptq_wait_time;
+               taq[0].acceptq_qcount += taq[i].acceptq_qcount;
+               taq[0].acceptq_count += taq[i].acceptq_count;
+       }
+
+       seq_printf(sfile, "Totals :\n\taccepted: %u\n\t"
+                         "queued: %u\n\twait_time: %lu\n",
+                          taq->acceptq_count, taq->acceptq_qcount,
+                         taq->acceptq_wait_time);
+
+       return;
+}
+
+/*
+ * Accumulate accept-queue counters for @mres into @taq, reading the
+ * sockets attached to the parent class @pres.  For the default queue
+ * (my_id == 0) all NUM_ACCEPT_QUEUES entries are gathered per socket;
+ * otherwise only the single queue my_id is read.
+ */
+void
+laq_get_aq_stats(ckrm_laq_res_t *pres, ckrm_laq_res_t *mres, 
+                                       struct tcp_acceptq_info *taq)
+{
+       struct ckrm_net_struct *ns;
+       struct ckrm_core_class *core = pres->core;
+       struct tcp_opt *tp;
+       int a = mres->my_id;
+       int z;
+
+       if (a == 0)
+               z = NUM_ACCEPT_QUEUES;
+       else
+               z = a+1;
+
+       // XXX Instead of holding a  class_lock introduce a rw
+       // lock to be write locked by listen callbacks and read locked here.
+       // - VK
+       class_lock(pres->core);
+       list_for_each_entry(ns, &core->objlist,ckrm_link) { 
+               tp = tcp_sk(ns->ns_sk);
+               for (; a< z; a++) {
+                       taq->acceptq_wait_time += tp->acceptq[a].aq_wait_time;
+                       taq->acceptq_qcount += tp->acceptq[a].aq_qcount;
+                       taq->acceptq_count += tp->acceptq[a].aq_count;
+                       taq++;
+               }
+       }
+       class_unlock(pres->core);
+}
+
+
+/*
+ * get_stats callback: print accept-queue statistics for this class
+ * into @sfile.  Depth 2 (listen class) prints the default queue plus
+ * totals; depth 3 (accept queue) prints only its own queue; depths
+ * 0/1 are not readable.  Returns 0 or a negative errno.
+ */
+static int  
+laq_get_stats(void *my_res, struct seq_file *sfile)
+{
+       ckrm_laq_res_t *res = my_res;
+       ckrm_laq_res_t *parent;
+       struct tcp_acceptq_info taq[NUM_ACCEPT_QUEUES];
+       int rc = 0;
+
+       if (!res) 
+               return -EINVAL;
+       
+       if (!res->pcore) { 
+               // something is badly wrong
+               printk(KERN_ERR "socketaq internal inconsistency\n");
+               return -EBADF;
+       }
+
+       parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t);
+       if (!parent) {  // socket_class does not have a stat interface
+               printk(KERN_ERR "socketaq internal fs inconsistency\n");
+               return -EINVAL;
+       }
+
+       memset(taq, 0, sizeof(struct tcp_acceptq_info) * NUM_ACCEPT_QUEUES);
+
+       switch (res->my_depth) {
+
+       default:
+       case 0: printk(KERN_ERR "socket class bad entry\n");
+               rc = -EBADF;
+               break;
+
+       case 1: // can't be read from. this is internal default.
+               // return -EINVAL
+               rc = -EINVAL;
+               break;
+
+       case 2: // return the default and total
+               ckrm_lock_hier(res->core);      // block any deletes
+               laq_get_aq_stats(res, res, &taq[0]);
+               laq_print_aq_stats(sfile, &taq[0], 0);
+               ckrm_unlock_hier(res->core);    // block any deletes
+               break;
+
+       case 3: 
+               ckrm_lock_hier(parent->core);   // block any deletes
+               laq_get_aq_stats(parent, res, &taq[res->my_id]);
+               laq_print_aq_stats(sfile, &taq[res->my_id], res->my_id);
+               ckrm_unlock_hier(parent->core); // block any deletes
+               break;
+       }
+
+       return rc;
+}
+
+/*
+ * The network connection is reclassified to this class. Update its shares.
+ * The socket lock is held. 
+ */
+static void
+laq_change_resclass(void *n, void *old, void *r)
+{
+       struct ckrm_net_struct *ns = (struct ckrm_net_struct *)n;
+       struct ckrm_laq_res *res = (struct ckrm_laq_res *)r;
+       struct ckrm_hnode *chnode;
+       struct ckrm_laq_res *reschild;
+       struct sock *sk;
+       struct tcp_opt *tp;
+
+       /* only the socket classes (depth 2) carry accept-queue shares;
+        * a change to my_depth == 3, i.e. the accept classes, cannot
+        * happen -- there is no target file.  (The old code re-tested
+        * my_depth == 2 after this guard, which was always true.) */
+       if (res->my_depth != 2) 
+               return; 
+
+       sk = ns->ns_sk;
+       tp = tcp_sk(sk);
+
+       // share rule: hold parent resource lock. then self.
+       // However, since my_depth == 1 is a generic class it is not
+       // needed here. Self lock is enough.
+       spin_lock(&res->reslock);
+       tp->acceptq[0].aq_ratio = res->shares.total_guarantee/
+                       res->shares.unused_guarantee;
+       list_for_each_entry(chnode,&res->core->hnode.children,siblings){
+               reschild = hnode_2_core(chnode)->res_class[my_resid];
+
+               spin_lock(&reschild->reslock);
+               tp->acceptq[reschild->my_id].aq_ratio=
+                       reschild->shares.total_guarantee/
+                               res->shares.my_guarantee;
+               spin_unlock(&reschild->reslock);
+       }
+       spin_unlock(&res->reslock);
+}
+
+/* Resource controller callbacks for the listen accept-queue ("laq")
+ * controller; the resource id is assigned dynamically at registration. */
+struct ckrm_res_ctlr laq_rcbs = {
+       .res_name          = "laq",
+       .resid             = -1 , // dynamically assigned
+       .res_alloc         = laq_res_alloc,
+       .res_free          = laq_res_free,
+       .set_share_values  = laq_set_share_values,
+       .get_share_values  = laq_get_share_values,
+       .get_stats         = laq_get_stats,
+       .change_resclass   = laq_change_resclass,
+       //      .res_initcls       = laq_res_initcls,         // LAQ_HUBERTUS: no need for this !!
+};
+
+/*
+ * Module init: register the "laq" controller with the socket_class
+ * classtype.  Returns -ENOENT when the classtype is not present,
+ * 0 otherwise (even if controller registration itself failed; the
+ * result is reported via printk and my_resid stays -1).
+ */
+int __init
+init_ckrm_laq_res(void)
+{
+       struct ckrm_classtype *clstype;
+       int resid;
+
+       clstype = ckrm_find_classtype_by_name("socket_class");
+       if (clstype == NULL) {
+               /* old message lacked the trailing newline and carried a
+                * stray leading space */
+               printk(KERN_INFO "Unknown ckrm classtype<socket_class>\n");
+               return -ENOENT;
+       }
+
+       if (my_resid == -1) {
+               resid = ckrm_register_res_ctlr(clstype,&laq_rcbs);
+               if (resid >= 0)
+                       my_resid = resid;
+               printk("........init_ckrm_listen_aq_res -> %d\n",my_resid);
+       }
+       return 0;
+
+}      
+
+/* Module exit: unregister the controller and reset the cached id. */
+void __exit
+exit_ckrm_laq_res(void)
+{
+       ckrm_unregister_res_ctlr(&laq_rcbs);
+       my_resid = -1;
+}
+
+
+module_init(init_ckrm_laq_res)
+module_exit(exit_ckrm_laq_res)
+
+MODULE_LICENSE("GPL");
+
diff --git a/kernel/ckrm/ckrm_sockc.c b/kernel/ckrm/ckrm_sockc.c
new file mode 100644 (file)
index 0000000..26731bb
--- /dev/null
@@ -0,0 +1,554 @@
+/* ckrm_sock.c - Class-based Kernel Resource Management (CKRM)
+ *
+ * Copyright (C) Hubertus Franke, IBM Corp. 2003,2004
+ *           (C) Shailabh Nagar,  IBM Corp. 2003
+ *           (C) Chandra Seetharaman,  IBM Corp. 2003
+ *          (C) Vivek Kashyap, IBM Corp. 2004
+ * 
+ * 
+ * Provides kernel API of CKRM for in-kernel,per-resource controllers 
+ * (one each for cpu, memory, io, network) and callbacks for 
+ * classification modules.
+ *
+ * Latest version, more details at http://ckrm.sf.net
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+/* Changes
+ *
+ * 28 Aug 2003
+ *        Created.
+ * 06 Nov 2003
+ *        Made modifications to suit the new RBCE module.
+ * 10 Nov 2003
+ *        Fixed a bug in fork and exit callbacks. Added callbacks_active and
+ *        surrounding logic. Added task parameter for all CE callbacks.
+ * 23 Mar 2004
+ *        moved to referenced counted class objects and correct locking
+ * 12 Apr 2004
+ *        adapted to the emerging classtype interface
+ */
+
+#include <linux/config.h>
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <asm/uaccess.h>
+#include <linux/mm.h>
+#include <asm/errno.h>
+#include <linux/string.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/ckrm_rc.h>
+#include <linux/parser.h>
+#include <net/tcp.h>
+
+#include <linux/ckrm_net.h>
+
+/* A socket class is just a core class; no extra per-class state yet. */
+struct ckrm_sock_class {
+       struct ckrm_core_class core;
+};
+
+/* Default class every unclassified listening socket falls into. */
+static struct ckrm_sock_class  sockclass_dflt_class = {
+};
+
+#define SOCKET_CLASS_TYPE_NAME  "socket_class"
+
+/* Name used to re-tag the default class in sock_free_class(). */
+const char *dflt_sockclass_name = SOCKET_CLASS_TYPE_NAME;
+
+static struct ckrm_core_class *sock_alloc_class(struct ckrm_core_class *parent, const char *name);
+static int  sock_free_class(struct ckrm_core_class *core);
+
+static int  sock_forced_reclassify(ckrm_core_class_t *target, const char *resname);
+static int  sock_show_members(struct ckrm_core_class *core, struct seq_file *seq);
+static void sock_add_resctrl(struct ckrm_core_class *core, int resid);
+static void sock_reclassify_class(struct ckrm_sock_class *cls);
+
+/*
+ * Classtype descriptor for "socket_class".  Uses private handlers for
+ * alloc/free/show_members/forced_reclassify/add_resctrl and the
+ * generic ckrm_class_* defaults for shares/stats/config.
+ */
+struct ckrm_classtype CT_sockclass = {
+       .mfidx          = 1,
+       .name           = SOCKET_CLASS_TYPE_NAME,
+       .typeID         = CKRM_CLASSTYPE_SOCKET_CLASS, 
+       .maxdepth       = 3,                           
+       .resid_reserved = 0,                           
+       .max_res_ctlrs  = CKRM_MAX_RES_CTLRS,        
+       .max_resid      = 0,
+       .bit_res_ctlrs  = 0L,
+       .res_ctlrs_lock = SPIN_LOCK_UNLOCKED,
+       .classes        = LIST_HEAD_INIT(CT_sockclass.classes),
+
+       .default_class  = &sockclass_dflt_class.core,
+       
+       // private version of functions 
+       .alloc          = &sock_alloc_class,
+       .free           = &sock_free_class,
+       .show_members   = &sock_show_members,
+       .forced_reclassify = &sock_forced_reclassify,
+
+       // use of default functions 
+       .show_shares    = &ckrm_class_show_shares,
+       .show_stats     = &ckrm_class_show_stats,
+       .show_config    = &ckrm_class_show_config,
+       .set_config     = &ckrm_class_set_config,
+       .set_shares     = &ckrm_class_set_shares,
+       .reset_stats    = &ckrm_class_reset_stats,
+
+       // mandatory private version .. no dflt available
+       .add_resctrl    = &sock_add_resctrl,    
+};
+
+/* helper functions */
+
+/* Take a reference on @ns. */
+void
+ckrm_ns_hold(struct ckrm_net_struct *ns)
+{
+       atomic_inc(&ns->ns_refcnt);
+}
+
+/* Drop a reference on @ns; frees it when the count reaches zero. */
+void
+ckrm_ns_put(struct ckrm_net_struct *ns)
+{
+       if (!atomic_dec_and_test(&ns->ns_refcnt))
+               return;
+       kfree(ns);
+}
+/*
+ * Change the class of a netstruct.
+ *
+ * Moves @ns onto "newcls" (falling back to "oldcls" when newcls is
+ * NULL or unchanged) and notifies every registered resource
+ * controller whose per-class resource object differs between the two.
+ */
+
+/* NOTE(review): @event is currently unused by this function. */
+static void
+sock_set_class(struct ckrm_net_struct *ns, struct ckrm_sock_class *newcls,
+             struct ckrm_sock_class *oldcls, enum ckrm_event event)
+{
+       int i;
+       struct ckrm_res_ctlr *rcbs;
+       struct ckrm_classtype *clstype;
+       void  *old_res_class, *new_res_class;
+
+       /* Nothing to move: keep (or restore) the old class. */
+       if ((newcls == oldcls) || (newcls == NULL)) {
+               ns->core = (void *)oldcls;
+               return;
+       }
+
+       /* Link the netstruct onto the new class under its class lock. */
+       class_lock(class_core(newcls));
+       ns->core = newcls;
+       list_add(&ns->ckrm_link, &class_core(newcls)->objlist);
+       class_unlock(class_core(newcls));
+
+       /* Notify each controller whose per-class resource object changed. */
+       clstype = class_isa(newcls);                 
+       for (i = 0; i < clstype->max_resid; i++) {
+               atomic_inc(&clstype->nr_resusers[i]);
+               old_res_class = oldcls ? class_core(oldcls)->res_class[i] : NULL;
+               new_res_class = newcls ? class_core(newcls)->res_class[i] : NULL;
+               rcbs = clstype->res_ctlrs[i];
+               if (rcbs && rcbs->change_resclass && (old_res_class != new_res_class)) 
+                       (*rcbs->change_resclass)(ns, old_res_class, new_res_class);
+               atomic_dec(&clstype->nr_resusers[i]);
+       }
+       return;
+}
+
+/*
+ * A controller was just registered for @resid: walk every member of
+ * @core and hand each one to the controller's change_resclass hook.
+ */
+static void
+sock_add_resctrl(struct ckrm_core_class *core, int resid)
+{
+       struct ckrm_net_struct *ns;
+       struct ckrm_res_ctlr *rcbs;
+
+       if (resid < 0 || resid >= CKRM_MAX_RES_CTLRS)
+               return;
+       rcbs = core->classtype->res_ctlrs[resid];
+       if (rcbs == NULL)
+               return;
+
+       class_lock(core);
+       list_for_each_entry(ns, &core->objlist, ckrm_link) {
+               if (rcbs->change_resclass)
+                       (*rcbs->change_resclass)(ns, NULL, core->res_class[resid]);
+       }
+       class_unlock(core);
+}
+
+
+/**************************************************************************
+ *                   Functions called from classification points          *
+ **************************************************************************/
+
+/*
+ * Classification hook: a socket entered the LISTEN state.
+ *
+ * Allocates a ckrm_net_struct describing the listener, classifies it
+ * (via the classification engine when loaded, otherwise the default
+ * class), links it into the class' member list and notifies every
+ * registered resource controller.
+ */
+static void
+cb_sockclass_listen_start(struct sock *sk)
+{
+       struct ckrm_net_struct *ns = NULL;
+       struct ckrm_sock_class *newcls = NULL;
+       struct ckrm_res_ctlr *rcbs;
+       struct ckrm_classtype *clstype;
+       int i = 0;
+
+       // XXX - TBD ipv6
+       /* fix: sk_family holds AF_* values; the old IPPROTO_IPV6 (41)
+        * comparison could never match, so IPv6 sockets slipped through */
+       if (sk->sk_family == AF_INET6)
+               return;
+
+       // to store the socket address
+       ns = (struct ckrm_net_struct *)
+               kmalloc(sizeof(struct ckrm_net_struct), GFP_ATOMIC);
+       if (!ns)
+               return;
+
+       /* fix: was sizeof(ns), which zeroed only pointer-size bytes */
+       memset(ns, 0, sizeof(*ns));
+       INIT_LIST_HEAD(&ns->ckrm_link);
+
+       ns->ns_family = sk->sk_family;
+       if (ns->ns_family == AF_INET6) {        // IPv6 not supported yet.
+               kfree(ns);                      /* fix: ns was leaked here */
+               return;
+       }
+
+       ns->ns_daddrv4 = inet_sk(sk)->rcv_saddr;
+       ns->ns_dport = inet_sk(sk)->num;
+               
+       ns->ns_pid = current->pid;
+       ns->ns_tgid = current->tgid;
+
+       ce_protect(&CT_sockclass);
+       CE_CLASSIFY_RET(newcls,&CT_sockclass,CKRM_EVENT_LISTEN_START,ns,current);
+       ce_release(&CT_sockclass);
+
+       if (newcls == NULL)  {
+               newcls = &sockclass_dflt_class;
+               ckrm_core_grab(class_core(newcls));
+       }
+
+       class_lock(class_core(newcls));
+       list_add(&ns->ckrm_link, &class_core(newcls)->objlist);
+       /* NOTE(review): this drops a reference that was never taken on
+        * this path -- verify the ns refcount protocol with CE_CLASSIFY */
+       ckrm_ns_put(ns);
+       ns->core = newcls;
+       class_unlock(class_core(newcls));
+       
+       // the socket is already locked
+       // take a reference on socket on our behalf
+       sock_hold(sk);
+       sk->sk_ns = (void *)ns;
+       ns->ns_sk = sk;
+
+       // modify its shares
+       clstype = class_isa(newcls);
+       for (i = 0; i < clstype->max_resid; i++) {
+               atomic_inc(&clstype->nr_resusers[i]);
+               rcbs = clstype->res_ctlrs[i];
+               if (rcbs && rcbs->change_resclass) {
+                       (*rcbs->change_resclass)((void *)ns, 
+                                        NULL,class_core(newcls)->res_class[i]);
+               }
+               atomic_dec(&clstype->nr_resusers[i]);
+       }
+       return;
+}
+
+/*
+ * Classification hook: a listening socket is shutting down.  Unlinks
+ * the ckrm_net_struct from its class, releases the socket reference
+ * taken in cb_sockclass_listen_start() and drops the final ns ref.
+ *
+ * NOTE(review): sk_family holds AF_* values, so the IPPROTO_IPV6 (41)
+ * comparison below can never match -- confirm AF_INET6 was intended.
+ */
+static void
+cb_sockclass_listen_stop(struct sock *sk)
+{
+       struct ckrm_net_struct *ns = NULL;
+       struct ckrm_sock_class *newcls = NULL;
+
+       // XXX - TBD ipv6
+       if (sk->sk_family == IPPROTO_IPV6)
+               return;
+
+       ns =  (struct ckrm_net_struct *)sk->sk_ns;
+       if (!ns) // listen_start called before socket_aq was loaded
+               return;
+
+       newcls = ns->core;
+       if (newcls) {
+               class_lock(class_core(newcls));
+               list_del(&ns->ckrm_link);
+               INIT_LIST_HEAD(&ns->ckrm_link);
+               class_unlock(class_core(newcls));
+               ckrm_core_drop(class_core(newcls));
+       }
+
+       // the socket is already locked
+       sk->sk_ns = NULL;
+       sock_put(sk);
+
+       // Should be the last count and free it
+       ckrm_ns_put(ns);
+       return;
+}
+
+/* Event hooks registered with the CKRM core; terminated by { -1 }. */
+static struct ckrm_event_spec sock_events_callbacks[] = {
+       CKRM_EVENT_SPEC( LISTEN_START, cb_sockclass_listen_start  ),
+       CKRM_EVENT_SPEC( LISTEN_STOP,  cb_sockclass_listen_stop  ),
+       { -1 }
+};
+
+/**************************************************************************
+ *                  Class Object Creation / Destruction
+ **************************************************************************/
+
+/*
+ * Allocate and initialize a new socket class under @parent, then
+ * inform a loaded classification engine about it.  Returns the
+ * embedded core class, or NULL on allocation failure.
+ */
+static struct ckrm_core_class *
+sock_alloc_class(struct ckrm_core_class *parent, const char *name)
+{
+       struct ckrm_sock_class *cls;
+
+       cls = kmalloc(sizeof(struct ckrm_sock_class), GFP_KERNEL);
+       if (!cls)
+               return NULL;
+
+       ckrm_init_core_class(&CT_sockclass, class_core(cls), parent, name);
+
+       ce_protect(&CT_sockclass);
+       if (CT_sockclass.ce_cb_active && CT_sockclass.ce_callbacks.class_add)
+               (*CT_sockclass.ce_callbacks.class_add)(name, cls);
+       ce_release(&CT_sockclass);
+
+       return class_core(cls);
+}
+
+/*
+ * Destroy a socket class.  The default class is never freed: only its
+ * name tag is reset.  Members are pushed back to the default class via
+ * sock_reclassify_class() before the core class is released.
+ * Returns 0 on success, -EINVAL for an invalid core.
+ */
+static int
+sock_free_class(struct ckrm_core_class *core)
+{
+       struct ckrm_sock_class *sockcls;
+
+       if (!ckrm_is_core_valid(core)) {
+               // Invalid core
+               return (-EINVAL);
+       }
+       if (core == core->classtype->default_class) {
+               // reset the name tag
+               core->name = dflt_sockclass_name;
+               return 0;
+       }
+
+       sockcls = class_type(struct ckrm_sock_class, core);
+
+       ce_protect(&CT_sockclass);
+
+       /* let the classification engine drop its references first */
+       if (CT_sockclass.ce_cb_active && CT_sockclass.ce_callbacks.class_delete)
+               (*CT_sockclass.ce_callbacks.class_delete)(core->name,sockcls);
+
+       sock_reclassify_class ( sockcls );
+
+       ce_release(&CT_sockclass);
+
+       ckrm_release_core_class(core);  // Hubertus .... could just drop the class .. error message
+       return 0;
+}
+
+
+/* seq_file helper: print "a.b.c.d\port" for each member socket. */
+static int
+sock_show_members(struct ckrm_core_class *core, struct seq_file *seq)
+{
+       struct ckrm_net_struct *ns;
+
+       class_lock(core);
+       list_for_each_entry(ns, &core->objlist, ckrm_link)
+               seq_printf(seq, "%d.%d.%d.%d\\%d\n", 
+                          NIPQUAD(ns->ns_daddrv4),ns->ns_dport);
+       class_unlock(core);
+
+       return 0;
+}
+
+/*
+ * Reclassify the listener described by @tns (IPv4 address/port) into
+ * class @core.  Looks the listening socket up, detaches its
+ * ckrm_net_struct from the current class and hands it to
+ * sock_set_class().  Returns 0 on success, -EINVAL on an invalid
+ * core, unknown listener, or no-op move.
+ */
+static int
+sock_forced_reclassify_ns(struct ckrm_net_struct *tns, struct ckrm_core_class *core)
+{
+       struct ckrm_net_struct *ns = NULL;
+       struct sock *sk = NULL;
+       struct ckrm_sock_class *oldcls, *newcls;
+       int rc = -EINVAL;
+
+       if (!ckrm_is_core_valid(core)) {
+               return rc;
+       }
+
+       newcls = class_type(struct ckrm_sock_class, core);
+       // lookup the listening sockets
+       // returns with a reference count set on socket
+       sk = tcp_v4_lookup_listener(tns->ns_daddrv4,tns->ns_dport,0);
+       if (!sk) {
+               printk(KERN_INFO "No such listener 0x%x:%d\n",
+                               tns->ns_daddrv4, tns->ns_dport);
+               return rc;
+       }
+       lock_sock(sk);
+       if (!sk->sk_ns) {
+               /* socket was never classified (module load ordering) */
+               goto out;
+       }
+       ns = sk->sk_ns;
+       ckrm_ns_hold(ns);
+       oldcls = ns->core;
+       if ((oldcls == NULL) || (oldcls == newcls)) {
+               ckrm_ns_put(ns);
+               goto out;
+       }
+
+       // remove the net_struct from the current class
+       class_lock(class_core(oldcls));
+       list_del(&ns->ckrm_link);
+       INIT_LIST_HEAD(&ns->ckrm_link);
+       ns->core = NULL;
+       class_unlock(class_core(oldcls));
+
+       sock_set_class(ns, newcls, oldcls, CKRM_EVENT_MANUAL);
+       ckrm_ns_put(ns);
+       rc = 0;
+out:
+       release_sock(sk);
+       sock_put(sk);
+
+       return rc;
+
+} 
+
+/* Tokens accepted in sock_forced_reclassify()'s option string. */
+enum sock_target_token_t {
+        IPV4, IPV6, SOCKC_TARGET_ERR
+};
+
+static match_table_t sock_target_tokens = {
+       {IPV4, "ipv4=%s"},
+       {IPV6, "ipv6=%s"},
+        {SOCKC_TARGET_ERR, NULL},
+};
+
+/*
+ * Parse a dotted decimal string (e.g. "10.1.2.3") up to terminator @c
+ * or NUL, packing each dot-separated component into successive low
+ * bytes of *@v (host order).  Returns a pointer to the terminator.
+ * No validation is performed on non-digit characters.
+ */
+char *
+v4toi(char *s, char c, __u32 *v)
+{
+       unsigned int byte = 0, acc = 0;
+
+       for (; *s && *s != c; s++) {
+               if (*s == '.') {
+                       acc = (acc << 8) | byte;
+                       byte = 0;
+                       continue;
+               }
+               byte = byte * 10 + (*s - '0');
+       }
+
+       *v = (acc << 8) | byte;
+       return s;
+}
+
+/*
+ * rcfs entry point for manual reclassification.  @options is a comma
+ * separated list of "ipv4=<a.b.c.d\port:pid>" targets (IPv6 is not
+ * supported yet).  Returns 1 on success or empty options, 0 on a
+ * parse failure.
+ */
+static int
+sock_forced_reclassify(struct ckrm_core_class *target,const char *options)
+{      
+       char *p,*p2;
+       struct ckrm_net_struct ns;
+       __u32 v4addr, tmp;
+
+       if (!options)
+               return 1;
+       
+       while ((p = strsep((char**)&options, ",")) != NULL) {
+               substring_t args[MAX_OPT_ARGS];
+               int token;
+               
+               if (!*p)
+                       continue;
+               token = match_token(p, sock_target_tokens, args);
+               switch (token) {
+
+               case IPV4:
+
+                       /* skip past "ipv4=" to the address text */
+                       p2 = p;
+                       while(*p2 && (*p2 != '='))
+                               ++p2;
+                       p2++;
+                       p2 = v4toi(p2, '\\',&(v4addr));
+                       ns.ns_daddrv4 = htonl(v4addr);
+                       ns.ns_family = 4; //IPPROTO_IPV4
+                       p2 = v4toi(++p2, ':',&tmp); ns.ns_dport = (__u16)tmp;
+                       /* NOTE(review): assumes ns_pid is 32 bits wide -- verify */
+                       p2 = v4toi(++p2,'\0',&ns.ns_pid);
+                       
+                       sock_forced_reclassify_ns(&ns,target);
+                       break;
+
+               case IPV6:
+                       printk(KERN_INFO "rcfs: IPV6 not supported yet\n");
+                       return 0;       
+               default:
+                       return 0;
+               }
+       }
+       return 1;
+}      
+
+/*
+ * Listen_aq reclassification: move every member of @cls back to the
+ * default socket class.  Used while a class is being freed.
+ */
+static void
+sock_reclassify_class(struct ckrm_sock_class *cls)
+{
+       struct ckrm_net_struct *ns, *tns;
+       struct ckrm_core_class *core = class_core(cls);
+       LIST_HEAD(local_list);
+
+       /* NOTE(review): this NULL check sits after class_core(cls) above;
+        * safe only if class_core is a pure address-of macro -- confirm. */
+       if (!cls)
+               return;
+
+       if (!ckrm_validate_and_grab_core(core))
+               return;
+
+       class_lock(core);
+       // we have the core refcnt
+       if (list_empty(&core->objlist)) {
+               class_unlock(core);
+               ckrm_core_drop(core);
+               return;
+       }
+
+       /* detach the whole member list so it can be walked unlocked */
+       INIT_LIST_HEAD(&local_list);
+       list_splice_init(&core->objlist, &local_list);
+       class_unlock(core);
+       ckrm_core_drop(core);
+       
+       list_for_each_entry_safe(ns, tns, &local_list, ckrm_link) {
+               ckrm_ns_hold(ns);
+               list_del(&ns->ckrm_link);
+               if (ns->ns_sk) {
+                       lock_sock(ns->ns_sk);
+                       sock_set_class(ns, &sockclass_dflt_class, NULL, CKRM_EVENT_MANUAL);
+                       release_sock(ns->ns_sk);
+               }
+               ckrm_ns_put(ns);
+       }
+       return ;
+}
+
+/*
+ * Boot-time registration of the socket_class classtype, its default
+ * class and the LISTEN start/stop event hooks.  Resource controllers
+ * register themselves later when their modules load.
+ */
+void __init
+ckrm_meta_init_sockclass(void)
+{
+       printk("...... Initializing ClassType<%s> ........\n",CT_sockclass.name);
+       // initialize the default class
+       ckrm_init_core_class(&CT_sockclass, class_core(&sockclass_dflt_class),
+                            NULL,dflt_sockclass_name);
+
+       // register classtype and initialize default task class
+       ckrm_register_classtype(&CT_sockclass);
+       ckrm_register_event_set(sock_events_callbacks);
+
+       // note registration of all resource controllers will be done later dynamically
+       // as these are specified as modules
+}
+
+
+
+#if 1
+
+/***************************************************************************************
+ * Debugging Network Classes:  Utility functions
+ **************************************************************************************/
+
+#endif
diff --git a/kernel/ckrm/ckrm_tasks.c b/kernel/ckrm/ckrm_tasks.c
new file mode 100644 (file)
index 0000000..dcc7ee3
--- /dev/null
@@ -0,0 +1,509 @@
+/* ckrm_numtasks.c - "Number of tasks" resource controller for CKRM
+ *
+ * Copyright (C) Chandra Seetharaman,  IBM Corp. 2003
+ * 
+ * Latest version, more details at http://ckrm.sf.net
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+/* Changes
+ * 
+ * 31 Mar 2004: Created
+ * 
+ */
+
+/*
+ * Code Description: TBD
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <asm/errno.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/ckrm.h>
+#include <linux/ckrm_rc.h>
+#include <linux/ckrm_tc.h>
+
+#define TOTAL_NUM_TASKS (131072) // 128 K
+#define NUMTASKS_DEBUG
+#define NUMTASKS_NAME "numtasks"
+
+/*
+ * Per-class state for the "numtasks" controller.  Guarantees/limits
+ * are kept both in share units (shares) and in task counts (cnt_*).
+ */
+typedef struct ckrm_numtasks {
+       struct ckrm_core_class *core; // the core i am part of...
+       struct ckrm_core_class *parent; // parent of the core above.
+       struct ckrm_shares shares;
+       spinlock_t cnt_lock; // always grab parent's lock first and then child's
+       int cnt_guarantee; // num_tasks guarantee in local units
+       int cnt_unused; // has to borrow if more than this is needed
+       int cnt_limit; // no tasks over this limit.
+       atomic_t cnt_cur_alloc; // current alloc from self
+       atomic_t cnt_borrowed; // borrowed from the parent
+
+       int over_guarantee; //turn on/off when cur_alloc goes over/under guarantee
+
+       // internally maintained statictics to compare with max numbers
+       int limit_failures; // no. of failures 'cause the request was over the limit
+       int borrow_sucesses; // no. of successful borrows
+       int borrow_failures; // no. of borrow faileures
+
+       // Maximum the specific statictics has reached.
+       int max_limit_failures;
+       int max_borrow_sucesses;
+       int max_borrow_failures;
+
+       // Total number of specific statistics
+       int tot_limit_failures;
+       int tot_borrow_sucesses;
+       int tot_borrow_failures;
+} ckrm_numtasks_t;
+
+/* Controller descriptor; .resid is assigned at registration time. */
+struct ckrm_res_ctlr numtasks_rcbs;
+
+/* Initialize rescls values
+ * May be called on each rcfs unmount or as part of error recovery
+ * to make share values sane.
+ * Does not traverse hierarchy reinitializing children.
+ */
+/*
+ * Reset a numtasks resource object to its default share values and
+ * clear every counter and statistic.  Does not walk the hierarchy.
+ */
+static void
+numtasks_res_initcls_one(ckrm_numtasks_t *res)
+{
+       struct ckrm_shares *sh = &res->shares;
+
+       sh->my_guarantee     = CKRM_SHARE_DONTCARE;
+       sh->my_limit         = CKRM_SHARE_DONTCARE;
+       sh->total_guarantee  = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
+       sh->max_limit        = CKRM_SHARE_DFLT_MAX_LIMIT;
+       sh->unused_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
+       sh->cur_max_limit    = 0;
+
+       res->cnt_guarantee  = CKRM_SHARE_DONTCARE;
+       res->cnt_unused     = CKRM_SHARE_DONTCARE;
+       res->cnt_limit      = CKRM_SHARE_DONTCARE;
+       res->over_guarantee = 0;
+
+       /* current, high-water and cumulative statistics all start at 0 */
+       res->limit_failures  = res->max_limit_failures  = res->tot_limit_failures  = 0;
+       res->borrow_sucesses = res->max_borrow_sucesses = res->tot_borrow_sucesses = 0;
+       res->borrow_failures = res->max_borrow_failures = res->tot_borrow_failures = 0;
+
+       atomic_set(&res->cnt_cur_alloc, 0);
+       atomic_set(&res->cnt_borrowed, 0);
+}
+
+#if 0  
+/* Unused placeholder: a recursive initializer that would propagate
+ * default share values down the whole subtree. */
+static void
+numtasks_res_initcls(void *my_res)
+{
+       ckrm_numtasks_t *res = my_res;
+
+       /* Write a version which propagates values all the way down 
+          and replace rcbs callback with that version */
+       
+}
+#endif
+
+/*
+ * Charge one new task against class @arg (a core class).  When the
+ * local guarantee is exhausted the charge is borrowed recursively
+ * from the parent.  Returns 1 if the task fits (or on @force at the
+ * root), 0 otherwise; on 0 the local count is rolled back.
+ */
+int
+numtasks_get_ref(void *arg, int force)
+{
+       int rc, resid = numtasks_rcbs.resid;
+       ckrm_numtasks_t *res;
+       ckrm_core_class_t *core = arg;
+
+       /* controller not registered or no class: admit unconditionally */
+       if ((resid < 0) || (core == NULL))
+               return 1;
+
+       res = ckrm_get_res_class(core, resid, ckrm_numtasks_t);
+       if (res == NULL) 
+               return 1;
+
+       atomic_inc(&res->cnt_cur_alloc);
+
+       rc = 1;
+       /* over guarantee (or shares not configured): try limit/borrow */
+       if (((res->parent) && (res->cnt_unused == CKRM_SHARE_DONTCARE)) ||
+                       (atomic_read(&res->cnt_cur_alloc) > res->cnt_unused)) {
+
+               rc = 0;
+               if (!force && (res->cnt_limit != CKRM_SHARE_DONTCARE) && 
+                               (atomic_read(&res->cnt_cur_alloc) > res->cnt_limit)) {
+                       res->limit_failures++;
+                       res->tot_limit_failures++;
+               } else if (res->parent != NULL) {
+                       /* borrow the charge from the parent class */
+                       if ((rc = numtasks_get_ref(res->parent, force)) == 1) {
+                               atomic_inc(&res->cnt_borrowed);
+                               res->borrow_sucesses++;
+                               res->tot_borrow_sucesses++;
+                               res->over_guarantee = 1;
+                       } else {
+                               res->borrow_failures++;
+                               res->tot_borrow_failures++;
+                       }
+               } else {
+                       rc = force;
+               }
+       } else if (res->over_guarantee) {
+               /* dropped back under guarantee: fold the current burst
+                * statistics into the high-water marks */
+               res->over_guarantee = 0;
+
+               if (res->max_limit_failures < res->limit_failures) {
+                       res->max_limit_failures = res->limit_failures;
+               }
+               if (res->max_borrow_sucesses < res->borrow_sucesses) {
+                       res->max_borrow_sucesses = res->borrow_sucesses;
+               }
+               if (res->max_borrow_failures < res->borrow_failures) {
+                       res->max_borrow_failures = res->borrow_failures;
+               }
+               res->limit_failures = 0;
+               res->borrow_sucesses = 0;
+               res->borrow_failures = 0;
+       }
+
+       if (!rc) {
+               /* request denied: undo the optimistic increment */
+               atomic_dec(&res->cnt_cur_alloc);
+       }
+       return rc;
+}
+
+/*
+ * Release one task charge from class @arg; when the class had
+ * borrowed from its parent, return one borrowed unit up the chain.
+ */
+void
+numtasks_put_ref(void *arg)
+{
+       ckrm_core_class_t *core = arg;
+       ckrm_numtasks_t *res;
+       int resid = numtasks_rcbs.resid;
+
+       if (resid == -1 || core == NULL)
+               return;
+
+       res = ckrm_get_res_class(core, resid, ckrm_numtasks_t);
+       if (!res)
+               return;
+
+       atomic_dec(&res->cnt_cur_alloc);
+       if (atomic_read(&res->cnt_borrowed) > 0) {
+               atomic_dec(&res->cnt_borrowed);
+               numtasks_put_ref(res->parent);
+       }
+}
+
+/*
+ * Allocate per-class numtasks state.  The root class (no parent) gets
+ * the full default task budget; children keep DONTCARE values until
+ * shares are configured.  Returns NULL on allocation failure.
+ */
+static void *
+numtasks_res_alloc(struct ckrm_core_class *core, struct ckrm_core_class *parent)
+{
+       ckrm_numtasks_t *res = kmalloc(sizeof(ckrm_numtasks_t), GFP_ATOMIC);
+
+       if (!res) {
+               printk(KERN_ERR "numtasks_res_alloc: failed GFP_ATOMIC alloc\n");
+               return NULL;
+       }
+
+       res->core = core;
+       res->parent = parent;
+       numtasks_res_initcls_one(res);
+       res->cnt_lock = SPIN_LOCK_UNLOCKED;
+       if (parent == NULL) {
+               /* root class: hand it the entire default task budget */
+               res->cnt_guarantee = TOTAL_NUM_TASKS;
+               res->cnt_unused    = TOTAL_NUM_TASKS;
+               res->cnt_limit     = TOTAL_NUM_TASKS;
+       }
+       return res;
+}
+
+/*
+ * No locking of this resource class object necessary as we are not
+ * supposed to be assigned (or used) when/after this function is called.
+ */
+/*
+ * Free a class' numtasks state, returning its guarantee/limit to the
+ * parent.  No locking of @my_res itself: the object must no longer be
+ * in use when this is called, and no children may remain.
+ */
+static void
+numtasks_res_free(void *my_res)
+{
+       ckrm_numtasks_t *res = my_res, *parres, *childres;
+       ckrm_core_class_t *child = NULL;
+       int i, borrowed, maxlimit, resid = numtasks_rcbs.resid;
+
+       if (!res) 
+               return;
+
+       // Assuming there will be no children when this function is called
+       
+       /* fix: the root class has parent == NULL; the old code passed it
+        * into ckrm_get_res_class and then dereferenced a NULL parres */
+       parres = res->parent ?
+               ckrm_get_res_class(res->parent, resid, ckrm_numtasks_t) : NULL;
+
+       if (unlikely(atomic_read(&res->cnt_cur_alloc) != 0 ||
+                               atomic_read(&res->cnt_borrowed))) {
+               printk(KERN_ERR "numtasks_res_free: resource still alloc'd %p\n", res);
+               if (parres && (borrowed = atomic_read(&res->cnt_borrowed)) > 0) {
+                       for (i = 0; i < borrowed; i++) {
+                               numtasks_put_ref(parres->core);
+                       }
+               }
+       }
+
+       if (parres) {
+               // return child's limit/guarantee to parent node
+               spin_lock(&parres->cnt_lock);
+               child_guarantee_changed(&parres->shares, res->shares.my_guarantee, 0);
+
+               // run thru parent's children and get the new max_limit of the parent
+               ckrm_lock_hier(parres->core);
+               maxlimit = 0;
+               while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
+                       childres = ckrm_get_res_class(child, resid, ckrm_numtasks_t);
+                       if (maxlimit < childres->shares.my_limit) {
+                               maxlimit = childres->shares.my_limit;
+                       }
+               }
+               ckrm_unlock_hier(parres->core);
+               if (parres->shares.cur_max_limit < maxlimit) {
+                       parres->shares.cur_max_limit = maxlimit;
+               }
+
+               spin_unlock(&parres->cnt_lock);
+       }
+       kfree(res);
+       return;
+}
+/*
+ * Recalculate the guarantee and limit in real units... and propagate the
+ * same to children.
+ * Caller is responsible for protecting res and for the integrity of parres
+ */
+static void
+recalc_and_propagate(ckrm_numtasks_t *res, ckrm_numtasks_t *parres)
+{
+       ckrm_core_class_t *child = NULL;
+       ckrm_numtasks_t *childres;
+       int resid = numtasks_rcbs.resid;
+
+       if (parres) {
+               struct ckrm_shares *par = &parres->shares;
+               struct ckrm_shares *self = &res->shares;
+
+               // calculate cnt_guarantee and cnt_limit
+               //
+               /* NOTE(review): divisions below assume total_guarantee and
+                * max_limit are nonzero -- verify set_shares() guarantees it */
+               if (parres->cnt_guarantee == CKRM_SHARE_DONTCARE) {
+                       res->cnt_guarantee = CKRM_SHARE_DONTCARE;
+               } else {
+                       res->cnt_guarantee = (self->my_guarantee * parres->cnt_guarantee) 
+                                       / par->total_guarantee;
+               }
+               if (parres->cnt_limit == CKRM_SHARE_DONTCARE) {
+                       res->cnt_limit = CKRM_SHARE_DONTCARE;
+               } else {
+                       res->cnt_limit = (self->my_limit * parres->cnt_limit)
+                                       / par->max_limit;
+               }
+
+               // Calculate unused units
+               if (res->cnt_guarantee == CKRM_SHARE_DONTCARE) {
+                       res->cnt_unused = CKRM_SHARE_DONTCARE;
+               } else {
+                       res->cnt_unused = (self->unused_guarantee *
+                                       res->cnt_guarantee) / self->total_guarantee;
+               }
+       }
+
+       // propagate to children
+       ckrm_lock_hier(res->core);
+       while ((child = ckrm_get_next_child(res->core, child)) != NULL) {
+               childres = ckrm_get_res_class(child, resid, ckrm_numtasks_t);
+
+               spin_lock(&childres->cnt_lock);
+               recalc_and_propagate(childres, res);
+               spin_unlock(&childres->cnt_lock);
+       }
+       ckrm_unlock_hier(res->core);
+       return;
+}
+
+/*
+ * Apply new share values to a class.  Locks parent before child (see
+ * cnt_lock comment in ckrm_numtasks), applies set_shares(), and on
+ * success recomputes the parent's unused count and propagates real
+ * unit values down the subtree.  Returns 0 or a negative errno.
+ */
+static int
+numtasks_set_share_values(void *my_res, struct ckrm_shares *new)
+{
+       ckrm_numtasks_t *parres, *res = my_res;
+       struct ckrm_shares *cur = &res->shares, *par;
+       int rc = -EINVAL, resid = numtasks_rcbs.resid;
+
+       if (!res) 
+               return rc;
+
+       if (res->parent) {
+               parres = ckrm_get_res_class(res->parent, resid, ckrm_numtasks_t);
+               spin_lock(&parres->cnt_lock);
+               spin_lock(&res->cnt_lock);
+               par = &parres->shares;
+       } else {
+               spin_lock(&res->cnt_lock);
+               par = NULL;
+               parres = NULL;
+       }
+
+       rc = set_shares(new, cur, par);
+
+       if ((rc == 0) && parres) {
+               // Calculate parent's unused units
+               if (parres->cnt_guarantee == CKRM_SHARE_DONTCARE) {
+                       parres->cnt_unused = CKRM_SHARE_DONTCARE;
+               } else {
+                       parres->cnt_unused = (par->unused_guarantee *
+                                       parres->cnt_guarantee) / par->total_guarantee;
+               }
+
+               recalc_and_propagate(res, parres);
+       }
+       spin_unlock(&res->cnt_lock);
+       if (res->parent) {
+               spin_unlock(&parres->cnt_lock);
+       }
+       return rc;
+}
+
+
+/* Copy out the class' current share settings. */
+static int
+numtasks_get_share_values(void *my_res, struct ckrm_shares *shares)
+{
+       ckrm_numtasks_t *res = my_res;
+
+       if (res == NULL)
+               return -EINVAL;
+
+       *shares = res->shares;
+       return 0;
+}
+
+/*
+ * seq_file callback: dump cumulative and high-water statistics (plus
+ * raw counters when NUMTASKS_DEBUG is set) for a class.
+ */
+static int  
+numtasks_get_stats(void *my_res, struct seq_file *sfile)
+{
+       ckrm_numtasks_t *res = my_res;
+
+       if (!res) 
+               return -EINVAL;
+
+       seq_printf(sfile, "Number of tasks resource:\n");
+       seq_printf(sfile, "Total Over limit failures: %d\n",
+                       res->tot_limit_failures);
+       seq_printf(sfile, "Total Over guarantee sucesses: %d\n",
+                       res->tot_borrow_sucesses);
+       seq_printf(sfile, "Total Over guarantee failures: %d\n",
+                       res->tot_borrow_failures);
+
+       seq_printf(sfile, "Maximum Over limit failures: %d\n",
+                       res->max_limit_failures);
+       seq_printf(sfile, "Maximum Over guarantee sucesses: %d\n",
+                       res->max_borrow_sucesses);
+       seq_printf(sfile, "Maximum Over guarantee failures: %d\n",
+                       res->max_borrow_failures);
+#ifdef NUMTASKS_DEBUG
+       seq_printf(sfile, "cur_alloc %d; borrowed %d; cnt_guar %d; cnt_limit %d "
+                       "unused_guarantee %d, cur_max_limit %d\n",
+                       atomic_read(&res->cnt_cur_alloc),
+                       atomic_read(&res->cnt_borrowed),
+                       res->cnt_guarantee,
+                       res->cnt_limit,
+                       res->shares.unused_guarantee,
+                       res->shares.cur_max_limit);
+#endif
+
+       return 0;
+}
+
+/*
+ * seq_file callback for the config interface.  Currently emits only a
+ * placeholder line; there are no real tunables yet.
+ */
+static int  
+numtasks_show_config(void *my_res, struct seq_file *sfile)
+{
+       ckrm_numtasks_t *res = my_res;
+
+       if (!res) 
+               return -EINVAL;
+
+       seq_printf(sfile, "res=%s,parameter=somevalue\n",NUMTASKS_NAME);
+       return 0;
+}
+
+/* Accept (and, for now, merely log) a configuration string. */
+static int  
+numtasks_set_config(void *my_res, const char *cfgstr)
+{
+       ckrm_numtasks_t *res = my_res;
+
+       if (res == NULL)
+               return -EINVAL;
+
+       printk("numtasks config='%s'\n",cfgstr);
+       return 0;
+}
+
+/*
+ * Move a task's accounting from @old to @new.  An @old of (void *)-1
+ * means "do not uncharge"; a NULL @old falls back to the parent
+ * task's class resource.
+ * NOTE(review): tsk->parent->taskclass is dereferenced without a NULL
+ * check -- confirm callers guarantee a classified parent.
+ */
+static void
+numtasks_change_resclass(void *task, void *old, void *new)
+{
+       ckrm_numtasks_t *oldres = old;
+       ckrm_numtasks_t *newres = new;
+
+       if (oldres != (void *) -1) {
+               struct task_struct *tsk = task;
+               if (!oldres) {
+                       struct ckrm_core_class *old_core = &(tsk->parent->taskclass->core);
+                       oldres = ckrm_get_res_class(old_core, numtasks_rcbs.resid,
+                                       ckrm_numtasks_t);
+               }
+               numtasks_put_ref(oldres->core);
+       }
+       if (newres) {
+               /* force=1: the task already exists, it must be charged */
+               (void) numtasks_get_ref(newres->core, 1);
+       }
+}
+
+/*
+ * Controller callback table for "numtasks".  Registered with the
+ * "taskclass" classtype; .resid is filled in at registration time.
+ */
+struct ckrm_res_ctlr numtasks_rcbs = {
+       .res_name          = NUMTASKS_NAME,
+       .res_hdepth        = 1,
+       .resid             = -1,
+       .res_alloc         = numtasks_res_alloc,
+       .res_free          = numtasks_res_free,
+       .set_share_values  = numtasks_set_share_values,
+       .get_share_values  = numtasks_get_share_values,
+       .get_stats         = numtasks_get_stats,
+       .show_config       = numtasks_show_config,
+       .set_config        = numtasks_set_config,
+       .change_resclass   = numtasks_change_resclass,
+};
+
+/*
+ * Register the numtasks controller with the "taskclass" classtype.
+ * Returns 0 on success (or when already registered), -ENOENT when the
+ * taskclass classtype is not present.
+ */
+int __init
+init_ckrm_numtasks_res(void)
+{
+       struct ckrm_classtype *clstype;
+       int resid = numtasks_rcbs.resid;
+
+       clstype = ckrm_find_classtype_by_name("taskclass");
+       if (clstype == NULL) {
+               /* was missing the trailing newline */
+               printk(KERN_INFO " Unknown ckrm classtype<taskclass>\n");
+               return -ENOENT;
+       }
+
+       if (resid == -1) {
+               resid = ckrm_register_res_ctlr(clstype,&numtasks_rcbs);
+               if (resid < 0)
+                       /* previously a failed registration was silent */
+                       printk(KERN_WARNING "init_ckrm_numtasks_res: registration failed (%d)\n", resid);
+               printk(KERN_INFO "........init_ckrm_numtasks_res -> %d\n",resid);
+       }
+       return 0;
+}      
+
+/*
+ * Unregister the numtasks controller and reset its controller id so a
+ * later module load can register again.
+ */
+void __exit
+exit_ckrm_numtasks_res(void)
+{
+       ckrm_unregister_res_ctlr(&numtasks_rcbs);
+       numtasks_rcbs.resid = -1;
+}
+
+module_init(init_ckrm_numtasks_res)
+module_exit(exit_ckrm_numtasks_res)
+
+EXPORT_SYMBOL(numtasks_get_ref);
+EXPORT_SYMBOL(numtasks_put_ref);
+
+MODULE_LICENSE("GPL");
+
diff --git a/kernel/ckrm/ckrm_tc.c b/kernel/ckrm/ckrm_tc.c
new file mode 100644 (file)
index 0000000..cc03778
--- /dev/null
@@ -0,0 +1,785 @@
+/* ckrm_tc.c - Class-based Kernel Resource Management (CKRM)
+ *
+ * Copyright (C) Hubertus Franke, IBM Corp. 2003,2004
+ *           (C) Shailabh Nagar,  IBM Corp. 2003
+ *           (C) Chandra Seetharaman,  IBM Corp. 2003
+ *          (C) Vivek Kashyap, IBM Corp. 2004
+ * 
+ * 
+ * Provides kernel API of CKRM for in-kernel,per-resource controllers 
+ * (one each for cpu, memory, io, network) and callbacks for 
+ * classification modules.
+ *
+ * Latest version, more details at http://ckrm.sf.net
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+/* Changes
+ *
+ * 28 Aug 2003
+ *        Created.
+ * 06 Nov 2003
+ *        Made modifications to suit the new RBCE module.
+ * 10 Nov 2003
+ *        Fixed a bug in fork and exit callbacks. Added callbacks_active and
+ *        surrounding logic. Added task parameter for all CE callbacks.
+ * 23 Mar 2004
+ *        moved to referenced counted class objects and correct locking
+ * 12 Apr 2004
+ *        introduced and adapted to the emerging classtype interface
+ */
+
+#include <linux/config.h>
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <asm/uaccess.h>
+#include <linux/mm.h>
+#include <asm/errno.h>
+#include <linux/string.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/ckrm_rc.h>
+
+#include <linux/ckrm_tc.h>
+
+
+
+/* File-local debug printk; the body is commented out, so it compiles away. */
+#define TC_DEBUG(fmt, args...) do { /* printk("%s: " fmt, __FUNCTION__ , ## args); */ } while (0)
+
+
+/* Default task class: tasks that no classification engine claims end up
+ * here (see ckrm_set_taskclass()).  Initialized in ckrm_meta_init_taskclass(). */
+static struct ckrm_task_class  taskclass_dflt_class = {
+};
+
+const char *dflt_taskclass_name = TASK_CLASS_TYPE_NAME;
+
+static struct ckrm_core_class *ckrm_alloc_task_class(struct ckrm_core_class *parent, const char *name);
+static int ckrm_free_task_class(struct ckrm_core_class *core);
+
+static int  tc_forced_reclassify(ckrm_core_class_t *target, const char *resname);
+static int  tc_show_members(struct ckrm_core_class *core, struct seq_file *seq);
+static void tc_add_resctrl(struct ckrm_core_class *core, int resid);
+
+/* Classtype descriptor for task classes: static configuration plus the
+ * operations table consulted by the generic CKRM core and by RCFS. */
+struct ckrm_classtype CT_taskclass = {
+       .mfidx          = TC_MF_IDX,
+       .name           = TASK_CLASS_TYPE_NAME,
+       .typeID         = CKRM_CLASSTYPE_TASK_CLASS, 
+       .maxdepth       = 3,                           // Hubertus .. just to start 
+       .resid_reserved = 4,                           // Hubertus .. reservation
+       .max_res_ctlrs  = CKRM_MAX_RES_CTLRS,        
+       .max_resid      = 0,
+       .bit_res_ctlrs  = 0L,
+       .res_ctlrs_lock = SPIN_LOCK_UNLOCKED,
+       .classes        = LIST_HEAD_INIT(CT_taskclass.classes),
+
+       .default_class  = &taskclass_dflt_class.core,
+       
+       // private version of functions 
+       .alloc          = &ckrm_alloc_task_class,
+       .free           = &ckrm_free_task_class,
+       .show_members   = &tc_show_members,
+       .forced_reclassify = &tc_forced_reclassify,
+
+       // use of default functions 
+       .show_shares    = &ckrm_class_show_shares,
+       .show_stats     = &ckrm_class_show_stats,
+       .show_config    = &ckrm_class_show_config,
+       .set_config     = &ckrm_class_set_config,
+       .set_shares     = &ckrm_class_set_shares,
+       .reset_stats    = &ckrm_class_reset_stats,
+
+       // mandatory private version .. no dflt available
+       .add_resctrl    = &tc_add_resctrl,      
+};
+
+/**************************************************************************
+ *                   Helper Functions                                     *
+ **************************************************************************/
+
+/* Initialize the per-task CKRM spinlock (for freshly created tasks). */
+static inline void
+ckrm_init_task_lock(struct task_struct *tsk)
+{
+       tsk->ckrm_tsklock = SPIN_LOCK_UNLOCKED;
+}
+
+// Hubertus .. following functions should move to ckrm_rc.h
+
+/* Take the per-task CKRM lock; protects tsk->taskclass and its link. */
+static inline void
+ckrm_task_lock(struct task_struct *tsk)
+{
+       spin_lock(&tsk->ckrm_tsklock);
+}
+
+/* Release the per-task CKRM lock taken by ckrm_task_lock(). */
+static inline void
+ckrm_task_unlock(struct task_struct *tsk)
+{
+       spin_unlock(&tsk->ckrm_tsklock);
+}
+
+/*
+ * Change the task class of the given task.
+ *
+ * Change the task's task class  to "newcls" if the task's current 
+ * class (task->taskclass) is same as given "oldcls", if it is non-NULL.
+ *
+ * Caller is responsible to make sure the task structure stays put through
+ * this function.
+ *
+ * This function should be called with the following locks NOT held
+ *     - tsk->ckrm_task_lock
+ *     - core->ckrm_lock, if core is NULL then ckrm_dflt_class.ckrm_lock
+ *     - tsk->taskclass->ckrm_lock 
+ * 
+ * Function is also called with a ckrm_core_grab on the new core, hence
+ * it needs to be dropped if no assignment takes place.
+ */
+
+static void
+ckrm_set_taskclass(struct task_struct *tsk, ckrm_task_class_t *newcls, 
+                  ckrm_task_class_t *oldcls, enum ckrm_event event)
+{
+       int i;
+       ckrm_classtype_t  *clstype;
+       ckrm_res_ctlr_t   *rcbs;
+       ckrm_task_class_t *curcls;
+       void *old_res_class, *new_res_class;
+       int drop_old_cls;
+
+       ckrm_task_lock(tsk);
+       curcls = tsk->taskclass;
+
+       // compare-and-exchange semantics: if the task's class changed since
+       // the caller sampled oldcls, abandon the move (and drop the grab the
+       // caller took on newcls -- see function header comment above)
+       if (oldcls && (oldcls != curcls)) {
+               ckrm_task_unlock(tsk);
+               if (newcls) {
+                       /* compensate for previous grab */
+                       TC_DEBUG("(%s:%d): Race-condition caught <%s> %d\n",
+                                tsk->comm,tsk->pid,class_core(newcls)->name,event);
+                       ckrm_core_drop(class_core(newcls));
+               }
+               return;
+       }
+
+       // make sure we have a real destination core
+       if (!newcls) {
+               newcls = &taskclass_dflt_class;
+               ckrm_core_grab(class_core(newcls));
+       }
+
+       // take out of old class;
+       // remember that we need to drop the old core's reference on exit
+       if ((drop_old_cls = (curcls != NULL))) {
+               class_lock(class_core(curcls));
+               if (newcls == curcls) {
+                       // we are already in the destination class.
+                       // we still need to drop oldcore
+                       class_unlock(class_core(curcls));
+                       ckrm_task_unlock(tsk);
+                       goto out;
+               }
+               list_del(&tsk->taskclass_link);
+               INIT_LIST_HEAD(&tsk->taskclass_link);
+               tsk->taskclass = NULL;
+               class_unlock(class_core(curcls));
+       }       
+
+       // put into new class 
+       class_lock(class_core(newcls));
+       tsk->taskclass = newcls;
+       list_add(&tsk->taskclass_link, &class_core(newcls)->objlist);
+       class_unlock(class_core(newcls));
+
+       if (newcls == curcls) {
+               ckrm_task_unlock(tsk);
+               goto out;
+       }
+
+       // notify the classification engine of the move
+       CE_NOTIFY(&CT_taskclass,event,newcls,tsk);
+
+       ckrm_task_unlock(tsk);
+
+       // give every registered resource controller a chance to migrate the
+       // task's per-resource state from the old class to the new one
+       clstype = class_isa(newcls);                      // Hubertus .. can hardcode ckrm_CT_taskclass
+       if (clstype->bit_res_ctlrs) {   // avoid running through the entire list if non is registered
+               for (i = 0; i < clstype->max_resid; i++) {
+                       if (clstype->res_ctlrs[i] == NULL) 
+                               continue;
+                       atomic_inc(&clstype->nr_resusers[i]);
+                       old_res_class = curcls ? class_core(curcls)->res_class[i] : NULL;
+                       new_res_class = newcls ? class_core(newcls)->res_class[i] : NULL;
+                       rcbs = clstype->res_ctlrs[i];
+                       if (rcbs && rcbs->change_resclass && (old_res_class != new_res_class)) 
+                               (*rcbs->change_resclass)(tsk, old_res_class, new_res_class);
+                       atomic_dec(&clstype->nr_resusers[i]);
+               }
+       }
+
+ out:
+       if (drop_old_cls) 
+               ckrm_core_drop(class_core(curcls));
+       return;
+}
+
+// HF SUGGEST: we could macro-tize this for other types DEF_FUNC_ADD_RESCTRL(funcname,link)
+//          would DEF_FUNC_ADD_RESCTRL(tc_add_resctrl,taskclass_link)
+
+/*
+ * Late-registration hook: when a resource controller appears after
+ * tasks already belong to this class, hand every member task to the
+ * controller's change_resclass callback so it can attach state.
+ */
+static void
+tc_add_resctrl(struct ckrm_core_class *core, int resid)
+{
+       struct ckrm_res_ctlr *ctlr;
+       struct task_struct *member;
+
+       if (resid < 0 || resid >= CKRM_MAX_RES_CTLRS)
+               return;
+       ctlr = core->classtype->res_ctlrs[resid];
+       if (ctlr == NULL)
+               return;
+
+       class_lock(core);
+       list_for_each_entry(member, &core->objlist, taskclass_link) {
+               if (ctlr->change_resclass)
+                       (*ctlr->change_resclass)(member, (void *) -1,
+                                                core->res_class[resid]);
+       }
+       class_unlock(core);
+}
+
+
+/**************************************************************************
+ *                   Functions called from classification points          *
+ **************************************************************************/
+
+/* Debug printk for classification-point callbacks; compiled out by default. */
+#define ECB_PRINTK(fmt, args...) // do { if (CT_taskclass.ce_regd) printk("%s: " fmt, __FUNCTION__ , ## args); } while (0)
+
+/* Ask the classification engine for a class for <tsk>; if it returns one
+ * (with a grab on its core), move the task there.  ckrm_set_taskclass()
+ * consumes or drops that grab. */
+#define CE_CLASSIFY_TASK(event, tsk)                                           \
+do {                                                                           \
+       struct ckrm_task_class *newcls = NULL, *oldcls = tsk->taskclass;        \
+                                                                               \
+       CE_CLASSIFY_RET(newcls,&CT_taskclass,event,tsk);                        \
+       if (newcls) {                                                           \
+               /* called synchrously. no need to get task struct */            \
+               ckrm_set_taskclass(tsk, newcls, oldcls, event);                 \
+       }                                                                       \
+} while (0)
+
+/* As CE_CLASSIFY_TASK, but pins the classification engine module so it
+ * cannot unload while the callback runs. */
+#define CE_CLASSIFY_TASK_PROTECT(event, tsk)   \
+do {                                           \
+       ce_protect(&CT_taskclass);              \
+       CE_CLASSIFY_TASK(event,tsk);            \
+       ce_release(&CT_taskclass);              \
+} while (0)
+
+
+
+
+/* CKRM_EVENT_NEWTASK callback: start a freshly created task with no
+ * class and an empty (self-linked) class membership link. */
+static void
+cb_taskclass_newtask(struct task_struct *tsk)
+{
+       INIT_LIST_HEAD(&tsk->taskclass_link);
+       tsk->taskclass = NULL;
+}
+
+
+/*
+ * CKRM_EVENT_FORK callback: classify the child task.  If the
+ * classification engine offers no class, the child inherits its
+ * parent's class; in either case a reference on the chosen core is
+ * held and handed to ckrm_set_taskclass(), which consumes it.
+ */
+static void
+cb_taskclass_fork(struct task_struct *tsk)
+{
+       struct ckrm_task_class *cls = NULL;
+
+       ECB_PRINTK("%p:%d:%s\n",tsk,tsk->pid,tsk->comm);
+
+       ce_protect(&CT_taskclass);
+       CE_CLASSIFY_RET(cls,&CT_taskclass,CKRM_EVENT_FORK,tsk);      
+       if (cls == NULL) {
+               /* no CE decision: inherit the parent's class */
+               ckrm_task_lock(tsk->parent);
+               cls = tsk->parent->taskclass;
+               ckrm_core_grab(class_core(cls));
+               ckrm_task_unlock(tsk->parent);
+       }
+       if (!list_empty(&tsk->taskclass_link))
+               /* fix: message had mismatched brackets "(%s:%d>" */
+               printk(KERN_WARNING "BUG in cb_fork.. tsk <%s:%d> already linked\n",
+                       tsk->comm,tsk->pid);
+
+       ckrm_set_taskclass(tsk, cls, NULL, CKRM_EVENT_FORK);
+       ce_release(&CT_taskclass);
+}
+
+/*
+ * CKRM_EVENT_EXIT callback: detach the exiting task from its class,
+ * notify the classification engine, and drop the class reference the
+ * task was holding.
+ */
+static void
+cb_taskclass_exit(struct task_struct *tsk)
+{
+       ckrm_task_class_t *cls;
+
+       // Remove the task from the current core class
+       
+       ECB_PRINTK("%p:%d:%s\n",tsk,tsk->pid,tsk->comm);
+       ckrm_task_lock(tsk);
+
+       CE_CLASSIFY_NORET( &CT_taskclass, CKRM_EVENT_EXIT, tsk);
+
+       if ((cls = tsk->taskclass) != NULL) {
+               class_lock(class_core(cls));
+               tsk->taskclass = NULL;
+               list_del(&tsk->taskclass_link);
+               class_unlock(class_core(cls));
+               ckrm_core_drop(class_core(cls));
+       } else {
+               // never classified: just make sure the link is self-consistent
+               INIT_LIST_HEAD(&tsk->taskclass_link);
+       }
+       ckrm_task_unlock(tsk);
+}
+
+/* CKRM_EVENT_EXEC callback: reclassify the current task after exec(). */
+static void
+cb_taskclass_exec(const char *filename)
+{
+       ECB_PRINTK("%p:%d:%s <%s>\n",current,current->pid,current->comm,filename);
+       CE_CLASSIFY_TASK_PROTECT(CKRM_EVENT_EXEC, current);
+}
+
+/* CKRM_EVENT_UID callback: reclassify the current task after a uid change. */
+static void
+cb_taskclass_uid(void)
+{
+       ECB_PRINTK("%p:%d:%s\n",current,current->pid,current->comm);
+       CE_CLASSIFY_TASK_PROTECT(CKRM_EVENT_UID, current);
+}
+
+/* CKRM_EVENT_GID callback: reclassify the current task after a gid change. */
+static void
+cb_taskclass_gid(void)
+{
+       ECB_PRINTK("%p:%d:%s\n",current,current->pid,current->comm);
+       CE_CLASSIFY_TASK_PROTECT(CKRM_EVENT_GID, current);
+}
+
+/* Event-to-callback table registered via ckrm_register_event_set();
+ * terminated by the { -1 } sentinel entry. */
+static struct ckrm_event_spec taskclass_events_callbacks[] = {
+       CKRM_EVENT_SPEC( NEWTASK, cb_taskclass_newtask ),
+       CKRM_EVENT_SPEC( EXEC   , cb_taskclass_exec ),
+       CKRM_EVENT_SPEC( FORK   , cb_taskclass_fork ),
+       CKRM_EVENT_SPEC( EXIT   , cb_taskclass_exit ),
+       CKRM_EVENT_SPEC( UID    , cb_taskclass_uid  ),
+       CKRM_EVENT_SPEC( GID    , cb_taskclass_gid  ),
+       { -1 }
+};
+
+/***********************************************************************
+ *
+ * Asynchronous callback functions   (driven by RCFS)
+ * 
+ *    Async functions force a setting of the task structure
+ *    synchronous callbacks are protected against race conditions 
+ *    by using a cmpxchg on the core before setting it.
+ *    Async calls need to be serialized to ensure they can't 
+ *    race against each other 
+ *
+ ***********************************************************************/
+
+DECLARE_MUTEX(async_serializer);    // serialize all async functions
+
+
+/*
+ * Go through the task list and reclassify all tasks according to the current
+ * classification rules.
+ *
+ * We have the problem that we can not hold any lock (including the 
+ * tasklist_lock) while classifying. Two methods possible
+ *
+ * (a) go through entire pidrange (0..pidmax) and if a task exists at 
+ *     that pid then reclassify it
+ * (b) go several time through task list and build a bitmap for a particular 
+ *     subrange of pids, otherwise the memory requirements might be too much.
+ * 
+ * We use a hybrid by comparing ratio nr_threads/pidmax
+ */
+
+/*
+ * Reclassify every task in the system (see the strategy discussion in
+ * the comment above).  Two fixes over the original:
+ *   - the pid bitmap obtained from __get_free_pages() was never freed;
+ *   - the bit-scan loop never advanced "pos", so find_next_bit()
+ *     returned the same bit on every iteration.
+ */
+static void
+ckrm_reclassify_all_tasks(void)
+{
+       extern int pid_max;
+
+       struct task_struct *proc, *thread;
+       int i;
+       int curpidmax = pid_max;
+       int ratio;
+       int use_bitmap;
+
+       /* heuristic: walk the pid range directly when pids are dense,
+        * otherwise collect live pids in a bitmap first */
+       ratio = curpidmax / nr_threads;
+       if (curpidmax <= PID_MAX_DEFAULT) {
+            use_bitmap = 1;
+       } else {
+            use_bitmap = (ratio >= 2);
+       }
+
+       ce_protect(&CT_taskclass);
+
+ retry:                
+       if (use_bitmap == 0) {
+               // go through it in one walk
+               read_lock(&tasklist_lock);
+               for ( i=0 ; i<curpidmax ; i++ ) {
+                       if ((thread = find_task_by_pid(i)) == NULL) 
+                               continue;
+                       get_task_struct(thread);
+                       /* can't classify while holding tasklist_lock */
+                       read_unlock(&tasklist_lock);
+                       CE_CLASSIFY_TASK(CKRM_EVENT_RECLASSIFY, thread);
+                       put_task_struct(thread);
+                       read_lock(&tasklist_lock);
+               }
+               read_unlock(&tasklist_lock);
+       } else {
+               unsigned long *bitmap;
+               int bitmapsize;
+               int order = 0;
+               int num_loops;
+               int pid, do_next;
+
+               bitmap = (unsigned long*) __get_free_pages(GFP_KERNEL,order);
+               if (bitmap == NULL) {
+                       /* no memory for the bitmap: fall back to pid walk */
+                       use_bitmap = 0;
+                       goto retry;
+               }
+
+               bitmapsize = 8 * (1 << (order + PAGE_SHIFT));
+               num_loops  = (curpidmax + bitmapsize - 1) / bitmapsize;
+
+               do_next = 1;
+               for ( i=0 ; i < num_loops && do_next; i++) {
+                       int pid_start = i*bitmapsize; 
+                       int pid_end   = pid_start + bitmapsize;
+                       int num_found = 0;
+                       int pos;
+
+                       memset(bitmap, 0, bitmapsize/8); // start afresh
+                       do_next = 0;
+
+                       /* pass 1: record which pids in this window exist */
+                       read_lock(&tasklist_lock);
+                       do_each_thread(proc, thread) {
+                               pid = thread->pid;
+                               if ((pid < pid_start) || (pid >= pid_end)) {
+                                       if (pid >= pid_end) {
+                                               do_next = 1;
+                                       }
+                                       continue;
+                               }
+                               pid -= pid_start;
+                               set_bit(pid, bitmap);
+                               num_found++;
+                       } while_each_thread(proc, thread);
+                       read_unlock(&tasklist_lock);
+               
+                       if (num_found == 0) 
+                               continue;
+
+                       /* pass 2: reclassify each recorded pid */
+                       pos = 0;
+                       for ( ; num_found-- ; ) {
+                               pos = find_next_bit(bitmap, bitmapsize, pos);
+                               pid = pos + pid_start;
+                               pos++;  /* fix: advance past this bit, or the
+                                        * same bit is found every iteration */
+
+                               read_lock(&tasklist_lock);
+                               if ((thread = find_task_by_pid(pid)) != NULL) {
+                                       get_task_struct(thread);
+                                       read_unlock(&tasklist_lock);
+                                       CE_CLASSIFY_TASK(CKRM_EVENT_RECLASSIFY, thread);
+                                       put_task_struct(thread);
+                               } else {
+                                       read_unlock(&tasklist_lock);
+                               }
+                       }
+               }
+
+               free_pages((unsigned long) bitmap, order);  /* fix: was leaked */
+       }
+       ce_release(&CT_taskclass);
+}
+
+/*
+ * RCFS entry point: reclassify tasks.
+ *   pid > 0  -> reclassify that one task
+ *   pid == 0 -> reclassify every task in the system
+ *   pid < 0  -> not supported (yet); returns -EINVAL
+ * Returns 0 on success, -EINVAL on bad pid / unknown task.
+ */
+int
+ckrm_reclassify(int pid)
+{
+       struct task_struct *tsk;
+       int rc = 0;
+
+       down(&async_serializer);   // serialize against other async operations
+       if (pid < 0) {
+               // do we want to treat this as process group .. should YES ToDo
+                rc = -EINVAL;
+       } else if (pid == 0) {
+               // reclassify all tasks in the system
+               ckrm_reclassify_all_tasks();
+       } else {
+               // reclassify particular pid
+               read_lock(&tasklist_lock);
+               if ((tsk = find_task_by_pid(pid)) != NULL) {
+                       get_task_struct(tsk);
+                       read_unlock(&tasklist_lock);
+                       CE_CLASSIFY_TASK_PROTECT(CKRM_EVENT_RECLASSIFY, tsk);
+                       put_task_struct(tsk);
+               } else {
+                       read_unlock(&tasklist_lock);
+                       rc = -EINVAL;
+               }
+       }
+       up(&async_serializer);
+       return rc;
+}
+
+/*
+ * Reclassify all tasks in the given core class.
+ */
+
+/*
+ * Empty the given class by reclassifying each member task, used while
+ * tearing the class down.  The classification engine may choose a new
+ * class (but not this one); otherwise tasks move to the parent class.
+ */
+static void
+ckrm_reclassify_class_tasks(struct ckrm_task_class *cls)
+{
+       int ce_regd;
+       struct ckrm_hnode *cnode;
+       struct ckrm_task_class *parcls;
+       int num = 0;
+
+       if (!ckrm_validate_and_grab_core(&cls->core))
+               return;
+
+       down(&async_serializer);   // serialize against other async operations
+
+
+       TC_DEBUG("start %p:%s:%d:%d\n",cls,cls->core.name, 
+                atomic_read(&cls->core.refcnt),atomic_read(&cls->core.hnode.parent->refcnt));
+       // If no CE registered for this classtype, following will be needed repeatedly;
+       ce_regd =  class_core(cls)->classtype->ce_regd;
+       cnode = &(class_core(cls)->hnode);
+       parcls = class_type(ckrm_task_class_t, cnode->parent);
+
+next_task:
+       // take one member at a time; the class lock cannot be held across
+       // ckrm_set_taskclass(), hence the loop-by-goto structure
+       class_lock(class_core(cls));
+       if (!list_empty(&class_core(cls)->objlist)) {
+               struct ckrm_task_class *newcls = NULL;
+               struct task_struct *tsk = 
+                       list_entry(class_core(cls)->objlist.next,
+                                  struct task_struct, taskclass_link);
+               
+               get_task_struct(tsk);
+               class_unlock(class_core(cls));
+
+               if (ce_regd) {
+                       CE_CLASSIFY_RET(newcls,&CT_taskclass,CKRM_EVENT_RECLASSIFY,tsk); 
+                       if (cls == newcls) {
+                               // don't allow reclassifying to the same class
+                               // as we are in the process of cleaning up this class
+                               ckrm_core_drop(class_core(newcls)); // to compensate CE's grab
+                               newcls = NULL;
+                       }
+               }
+               if (newcls == NULL) {
+                       newcls = parcls;
+                       ckrm_core_grab(class_core(newcls));
+               }
+               ckrm_set_taskclass(tsk, newcls, cls, CKRM_EVENT_RECLASSIFY);
+               put_task_struct(tsk);
+               num++;
+               goto next_task;
+       }
+       TC_DEBUG("stop  %p:%s:%d:%d   %d\n",cls,cls->core.name,
+                atomic_read(&cls->core.refcnt),atomic_read(&cls->core.hnode.parent->refcnt),num);
+       class_unlock(class_core(cls));
+       ckrm_core_drop(class_core(cls));
+
+       up(&async_serializer);
+
+       return ;
+}
+
+/*
+ * Change the core class of the given task.
+ */
+
+/*
+ * Force the task identified by "pid" into the given task class,
+ * bypassing the classification engine's decision (CKRM_EVENT_MANUAL).
+ * Returns 0 on success, -EINVAL if the class or pid is invalid.
+ */
+int 
+ckrm_forced_reclassify_pid(pid_t pid, struct ckrm_task_class *cls)
+{
+       struct task_struct *tsk = NULL;
+
+       if (!ckrm_validate_and_grab_core(class_core(cls)))
+               return -EINVAL;
+
+       /* pin the task while still holding tasklist_lock */
+       read_lock(&tasklist_lock);
+       tsk = find_task_by_pid(pid);
+       if (tsk != NULL)
+               get_task_struct(tsk);
+       read_unlock(&tasklist_lock);
+
+       if (tsk == NULL) {
+               ckrm_core_drop(class_core(cls));
+               return -EINVAL;
+       }
+
+       down(&async_serializer);   // serialize against other async operations
+       ce_protect(&CT_taskclass);
+       ckrm_set_taskclass(tsk, cls, NULL, CKRM_EVENT_MANUAL);
+       ce_release(&CT_taskclass);
+       put_task_struct(tsk);
+       up(&async_serializer);
+
+       return 0;
+}
+
+/*
+ * Allocate and initialize a new task class beneath "parent".
+ * Returns the embedded core class, or NULL if allocation fails.
+ */
+static struct ckrm_core_class *
+ckrm_alloc_task_class(struct ckrm_core_class *parent, const char *name)
+{
+       struct ckrm_task_class *cls;
+
+       cls = kmalloc(sizeof(*cls), GFP_KERNEL);
+       if (!cls)
+               return NULL;
+
+       ckrm_init_core_class(&CT_taskclass, class_core(cls), parent, name);
+
+       /* inform a registered classification engine of the new class */
+       ce_protect(&CT_taskclass);
+       if (CT_taskclass.ce_cb_active && CT_taskclass.ce_callbacks.class_add)
+               (*CT_taskclass.ce_callbacks.class_add)(name, cls);
+       ce_release(&CT_taskclass);
+
+       return class_core(cls);
+}
+
+/*
+ * Destroy a task class: notify the classification engine, migrate all
+ * member tasks out (to parent or CE-chosen class), then release the
+ * core.  The default class is never freed -- only its name is reset.
+ * Returns 0 on success, -EINVAL for an invalid core.
+ */
+static int
+ckrm_free_task_class(struct ckrm_core_class *core)
+{
+       struct ckrm_task_class *taskcls;
+
+       if (!ckrm_is_core_valid(core)) {
+               // Invalid core
+               return (-EINVAL);
+       }
+       if (core == core->classtype->default_class) {
+               // reset the name tag
+               core->name = dflt_taskclass_name;
+               return 0;
+       }
+
+       TC_DEBUG("%p:%s:%d\n",core,core->name,atomic_read(&core->refcnt));
+
+       taskcls = class_type(struct ckrm_task_class, core);
+
+       ce_protect(&CT_taskclass);
+
+       if (CT_taskclass.ce_cb_active && CT_taskclass.ce_callbacks.class_delete)
+               (*CT_taskclass.ce_callbacks.class_delete)(core->name,taskcls);
+       ckrm_reclassify_class_tasks( taskcls );
+
+       ce_release(&CT_taskclass);
+
+       ckrm_release_core_class(core);  // Hubertus .... could just drop the class .. error message
+       return 0;
+}
+
+
+/* Boot-time initialization of the taskclass classtype: set up the
+ * default class, register the classtype, and hook the event callbacks. */
+void __init
+ckrm_meta_init_taskclass(void)
+{
+       printk("...... Initializing ClassType<%s> ........\n",CT_taskclass.name);
+       // initialize the default class
+       ckrm_init_core_class(&CT_taskclass, class_core(&taskclass_dflt_class),
+                            NULL,dflt_taskclass_name);
+
+       // register classtype and initialize default task class
+       ckrm_register_classtype(&CT_taskclass);
+       ckrm_register_event_set(taskclass_events_callbacks);
+
+       // note: registration of all resource controllers will be done later
+       // dynamically, as these are specified as modules
+}
+
+
+
+/*
+ * Classtype show_members op: write one pid per line for every task
+ * currently attached to this class.  Always returns 0.
+ */
+static int                      
+tc_show_members(struct ckrm_core_class *core, struct seq_file *seq) 
+{
+       struct task_struct *member;
+
+       class_lock(core);
+       list_for_each_entry(member, &core->objlist, taskclass_link)
+               seq_printf(seq, "%ld\n", (long) member->pid);
+       class_unlock(core);
+
+       return 0;
+}
+
+/*
+ * Classtype forced_reclassify op: parse a decimal pid from "obj" and
+ * force that task into the target class.  Returns 0 on success,
+ * -EINVAL for a non-positive pid or an unknown task.
+ */
+static int
+tc_forced_reclassify(struct ckrm_core_class *target,const char *obj)
+{      
+       pid_t pid = (pid_t) simple_strtoul(obj, NULL, 10);
+
+       if (pid <= 0)
+               return -EINVAL;
+       return ckrm_forced_reclassify_pid(pid,
+                       class_type(ckrm_task_class_t, target));
+} 
+       
+#if 1
+
+/***************************************************************************************
+ * Debugging Task Classes:  Utility functions
+ **************************************************************************************/
+
+/*
+ * Debug helper: verify that every task on the class's member list
+ * actually points back at that class, and that the list terminates.
+ */
+void
+check_tasklist_sanity(struct ckrm_task_class *cls)
+{
+       struct ckrm_core_class *core = class_core(cls);
+       struct list_head *lh1, *lh2;
+       int count = 0;
+
+       if (core) {
+               class_lock(core);
+               if (list_empty(&core->objlist)) {
+                       /* fix: this path used to call class_lock() a second
+                        * time and return with the lock held (self-deadlock
+                        * on the spinlock) */
+                       class_unlock(core);
+                       printk("check_tasklist_sanity: class %s empty list\n",
+                                       core->name);
+                       return;
+               }
+               list_for_each_safe(lh1, lh2, &core->objlist) {
+                       struct task_struct *tsk = container_of(lh1, struct task_struct, taskclass_link);
+                       if (count++ > 20000) {
+                               /* arbitrary bound: a list this long is assumed corrupted */
+                               printk("list is CORRUPTED\n");
+                               break;
+                       }
+                       if (tsk->taskclass != cls) {
+                               const char *tclsname;
+                               tclsname = (tsk->taskclass) ? class_core(tsk->taskclass)->name 
+                                                           : "NULL";
+                               printk("sanity: task %s:%d has ckrm_core |%s| but in list |%s|\n",
+                                      tsk->comm,tsk->pid,tclsname,core->name);
+                       }
+               }
+               class_unlock(core);
+       }
+}
+
+/*
+ * Debug helper: dump every thread still pointing at "tskcls" (or every
+ * thread, when tskcls is NULL) together with its PF_EXITING state, to
+ * analyze why a class still holds references at teardown time.
+ */
+void 
+ckrm_debug_free_task_class(struct ckrm_task_class *tskcls)
+{
+       struct task_struct *proc, *thread;
+       int count = 0;
+
+       printk("Analyze Error <%s> %d\n",
+              class_core(tskcls)->name,atomic_read(&(class_core(tskcls)->refcnt)));
+
+       read_lock(&tasklist_lock);
+       class_lock(class_core(tskcls));
+       do_each_thread(proc, thread) {
+               count += (tskcls == thread->taskclass);
+               if ((thread->taskclass == tskcls) || (tskcls == NULL)) {
+                       const char *tclsname;
+                       tclsname = (thread->taskclass) ? class_core(thread->taskclass)->name : "NULL";
+                       printk("%d thread=<%s:%d>  -> <%s> <%lx>\n",
+                              count,thread->comm,thread->pid,tclsname, thread->flags & PF_EXITING);
+               }
+       } while_each_thread(proc, thread);
+       class_unlock(class_core(tskcls));
+       read_unlock(&tasklist_lock);
+
+       printk("End Analyze Error <%s> %d\n",
+              class_core(tskcls)->name,atomic_read(&(class_core(tskcls)->refcnt)));
+} 
+
+#endif
diff --git a/kernel/ckrm/ckrmutils.c b/kernel/ckrm/ckrmutils.c
new file mode 100644 (file)
index 0000000..c0d873c
--- /dev/null
@@ -0,0 +1,207 @@
+/* ckrmutils.c - Utility functions for CKRM
+ *
+ * Copyright (C) Chandra Seetharaman,  IBM Corp. 2003
+ *           (C) Hubertus Franke    ,  IBM Corp. 2004
+ * 
+ * Provides simple utility functions for the core module, CE and resource
+ * controllers.
+ *
+ * Latest version, more details at http://ckrm.sf.net
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+/* Changes
+ * 
+ * 13 Nov 2003
+ *        Created
+ */
+
+#include <linux/mm.h>
+#include <linux/err.h>
+#include <linux/mount.h>
+#include <linux/module.h>
+#include <linux/ckrm_rc.h>
+
+/*
+ * get_exe_path_name - copy the pathname of @tsk's executable into @buf.
+ * @tsk:    task whose executable path is wanted
+ * @buf:    destination buffer; left as "" when no executable VMA is found
+ * @buflen: size of @buf in bytes
+ *
+ * Scans the task's VMAs for the one flagged VM_EXECUTABLE and resolves
+ * its backing file through d_path().  Returns 0 on success (including
+ * the no-executable-mapping case, which leaves @buf empty), -EINVAL for
+ * a missing mm or unusable buffer, or the error code from d_path().
+ */
+int
+get_exe_path_name(struct task_struct *tsk, char *buf, int buflen)
+{
+       struct vm_area_struct *vma;
+       struct vfsmount *mnt;
+       struct mm_struct *mm;
+       struct dentry *dentry;
+       char *lname;
+       int rc = 0;
+
+       /* validate the buffer before writing the terminator into it */
+       if (!buf || buflen < 1) {
+               return -EINVAL;
+       }
+       *buf = '\0';
+
+       mm = get_task_mm(tsk);
+       if (!mm) {
+               return -EINVAL;
+       }
+
+       down_read(&mm->mmap_sem);
+       vma = mm->mmap;
+       while (vma) {
+               if ((vma->vm_flags & VM_EXECUTABLE) &&
+                               vma->vm_file) {
+                       /* pin the dentry and mount across d_path() */
+                       dentry = dget(vma->vm_file->f_dentry);
+                       mnt = mntget(vma->vm_file->f_vfsmnt);
+                       lname = d_path(dentry, mnt, buf, buflen);
+                       if (! IS_ERR(lname)) {
+                               /* d_path() composes the name at the tail of
+                                * buf; copy it (with its NUL) to the front. */
+                               strncpy(buf, lname, strlen(lname) + 1);
+                       } else {
+                               rc = (int) PTR_ERR(lname);
+                       }
+                       mntput(mnt);
+                       dput(dentry);
+                       break;
+               }
+               vma = vma->vm_next;
+       }
+       up_read(&mm->mmap_sem);
+       mmput(mm);
+       return rc;
+}
+
+
+/*
+ * child_guarantee_changed - account a child's guarantee change in @parent.
+ * @parent: shares of the parent class (may be NULL)
+ * @cur:    the child's current guarantee
+ * @new:    the child's new guarantee
+ *
+ * Must be called with the cnt_lock of parres held.  The caller is
+ * responsible for making sure that the new guarantee doesn't overflow
+ * the parent's total guarantee.
+ */
+void
+child_guarantee_changed(struct ckrm_shares *parent, int cur, int new)
+{
+       if (!parent || cur == new)
+               return;
+
+       /* charge the new value against the parent, then refund the old */
+       if (new != CKRM_SHARE_DONTCARE)
+               parent->unused_guarantee -= new;
+       if (cur != CKRM_SHARE_DONTCARE)
+               parent->unused_guarantee += cur;
+}
+
+/*
+ * child_maxlimit_changed - track the largest limit among the children.
+ * @parent:    shares of the parent class (may be NULL)
+ * @new_limit: the child's new max limit
+ *
+ * Must be called with the cnt_lock of parres held.  The caller is
+ * responsible for making sure that the new limit is not more than the
+ * parent's max_limit.
+ */
+void
+child_maxlimit_changed(struct ckrm_shares *parent, int new_limit)
+{
+       if (!parent)
+               return;
+       if (new_limit > parent->cur_max_limit)
+               parent->cur_max_limit = new_limit;
+}
+
+/*
+ * set_shares - validate @new against @cur/@par and commit it into @cur.
+ * @new: requested share values; fields may be CKRM_SHARE_UNCHANGED (keep
+ *       the current value) or CKRM_SHARE_DONTCARE (no constraint)
+ * @cur: the class's current shares, updated in place on success
+ * @par: the parent's shares, or NULL for a root class
+ *
+ * All constraints are checked before anything is modified, so a failed
+ * call leaves @cur and @par untouched.  Returns 0 on success, -EINVAL
+ * on any constraint violation.  Caller is responsible for holding any
+ * lock to protect the data structures passed to this function.
+ */
+int
+set_shares(struct ckrm_shares *new, struct ckrm_shares *cur,
+               struct ckrm_shares *par)
+{
+       int rc = -EINVAL;
+       /* guarantee already handed out to children; computed before any
+        * update so the commit phase below can reuse it */
+       int cur_usage_guar = cur->total_guarantee - cur->unused_guarantee;
+       int increase_by = new->my_guarantee - cur->my_guarantee;
+
+       // Check total_guarantee for correctness
+       if (new->total_guarantee <= CKRM_SHARE_DONTCARE) {
+               goto set_share_err;
+       } else if (new->total_guarantee == CKRM_SHARE_UNCHANGED) {
+               ;// do nothing
+       } else if (cur_usage_guar > new->total_guarantee) {
+               // cannot shrink below what children already consume
+               goto set_share_err;
+       }
+
+       // Check max_limit for correctness
+       if (new->max_limit <= CKRM_SHARE_DONTCARE) {
+               goto set_share_err;
+       } else if (new->max_limit == CKRM_SHARE_UNCHANGED) {
+               ; // do nothing
+       } else if (cur->cur_max_limit > new->max_limit) {
+               // cannot shrink below the largest child limit
+               goto set_share_err;
+       }
+
+       // Check my_guarantee for correctness
+       if (new->my_guarantee == CKRM_SHARE_UNCHANGED) {
+               ; // do nothing
+       } else if (new->my_guarantee == CKRM_SHARE_DONTCARE) {
+               ; // do nothing
+       } else if (par && increase_by > par->unused_guarantee) {
+               // parent has no spare guarantee to cover the increase
+               goto set_share_err;
+       }
+
+       // Check my_limit for correctness
+       if (new->my_limit == CKRM_SHARE_UNCHANGED) {
+               ; // do nothing
+       } else if (new->my_limit == CKRM_SHARE_DONTCARE) {
+               ; // do nothing
+       } else if (par && new->my_limit > par->max_limit) {
+               // I can't get more limit than my parent's limit
+               goto set_share_err;
+               
+       }
+
+       // make sure guarantee is lesser than limit
+       if (new->my_limit == CKRM_SHARE_DONTCARE) {
+               ; // do nothing
+       } else if (new->my_limit == CKRM_SHARE_UNCHANGED) {
+               // limit stays cur->my_limit; compare new guarantee to it
+               if (new->my_guarantee == CKRM_SHARE_DONTCARE) {
+                       ; // do nothing
+               } else if (new->my_guarantee == CKRM_SHARE_UNCHANGED) {
+                       ; // do nothing earlier setting would 've taken care of it
+               } else if (new->my_guarantee > cur->my_limit) {
+                       goto set_share_err;
+               }
+       } else { // new->my_limit has a valid value
+               if (new->my_guarantee == CKRM_SHARE_DONTCARE) {
+                       ; // do nothing
+               } else if (new->my_guarantee == CKRM_SHARE_UNCHANGED) {
+                       if (cur->my_guarantee > new->my_limit) {
+                               goto set_share_err;
+                       }
+               } else if (new->my_guarantee > new->my_limit) {
+                       goto set_share_err;
+               }
+       }
+
+       // All checks passed: commit phase -- update parent accounting
+       // first, then our own fields.
+       if (new->my_guarantee != CKRM_SHARE_UNCHANGED) {
+               child_guarantee_changed(par, cur->my_guarantee,
+                               new->my_guarantee);
+               cur->my_guarantee = new->my_guarantee;
+       }
+
+       if (new->my_limit != CKRM_SHARE_UNCHANGED) {
+               child_maxlimit_changed(par, new->my_limit);
+               cur->my_limit = new->my_limit;
+       }
+
+       if (new->total_guarantee != CKRM_SHARE_UNCHANGED) {
+               // keep children's usage constant; recompute the slack
+               cur->unused_guarantee = new->total_guarantee - cur_usage_guar;
+               cur->total_guarantee = new->total_guarantee;
+       }
+
+       if (new->max_limit != CKRM_SHARE_UNCHANGED) {
+               cur->max_limit = new->max_limit;
+       }
+
+       rc = 0;
+set_share_err:
+       return rc;
+}
+
+/* Exported for use by resource controllers and the classification engine */
+EXPORT_SYMBOL(get_exe_path_name);
+EXPORT_SYMBOL(child_guarantee_changed);
+EXPORT_SYMBOL(child_maxlimit_changed);
+EXPORT_SYMBOL(set_shares);
+
+