To compile this as a module, choose M here: the module will be called
ramfs.
+config RELAYFS_FS
+ tristate "Relayfs file system support"
+ ---help---
+ Relayfs is a high-speed data relay filesystem designed to provide
+ an efficient mechanism for tools and facilities to relay large
+ amounts of data from kernel space to user space. It's not useful
+ on its own, and should only be enabled if other facilities that
+	  need it are enabled, such as klog or the Linux Trace Toolkit.
+
+ See <file:Documentation/filesystems/relayfs.txt> for further
+ information.
+
+	  To compile this file system support as a module, choose M here:
+	  the module will be called relayfs.
+
+ If unsure, say N.
+
+config KLOG_CHANNEL
+ bool "Enable klog debugging support"
+ depends on RELAYFS_FS
+ help
+ If you say Y to this, a relayfs channel named klog will be created
+ in the root of the relayfs file system. You can write to the klog
+ channel using klog() or klog_raw() from within the kernel or
+ kernel modules, and read from the klog channel by mounting relayfs
+ and using read(2) to read from it (or using cat). If you're not
+ sure, say N.
+
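For illustration, consuming the channel needs nothing beyond mount(8) and
read(2). A minimal user-space sketch, assuming relayfs is mounted at
/mnt/relay (a hypothetical mount point) and the channel file is named klog
in the relayfs root, as the help text above describes:

	/* Sketch: drain the klog relayfs channel from user space. */
	#include <stdio.h>
	#include <unistd.h>
	#include <fcntl.h>

	int main(void)
	{
		char buf[4096];
		ssize_t n;
		int fd = open("/mnt/relay/klog", O_RDONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		/* Print whatever the kernel has relayed so far. */
		while ((n = read(fd, buf, sizeof(buf))) > 0)
			fwrite(buf, 1, (size_t)n, stdout);
		close(fd);
		return 0;
	}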
+config KLOG_CHANNEL_AUTOENABLE
+ bool "Enable klog logging on startup"
+ depends on KLOG_CHANNEL
+ default y
+ help
+ If you say Y to this, the klog channel will be automatically enabled
+	  on startup. Otherwise, to turn klog logging on, you need to use
+ sysctl (fs.relayfs.klog_enabled). This option is used in cases where
+ you don't actually want the channel to be written to until it's
+ enabled. If you're not sure, say Y.
+
+config KLOG_CHANNEL_SHIFT
+ depends on KLOG_CHANNEL
+ int "klog debugging channel size (14 => 16KB, 22 => 4MB)"
+ range 14 22
+ default 21
+ help
+	  Select the klog debugging channel size as a power of 2. For
+	  example, the default of 21 gives a 2MB (2^21 byte) channel.
+
endmenu
menu "Miscellaneous filesystems"
To compile this file system support as a module, choose M here: the
module will be called hpfs. If unsure, say N.
-
-
config QNX4FS_FS
tristate "QNX4 file system support (read only)"
help
It's currently broken, so for now:
answer N.
-
-
config SYSV_FS
tristate "System V/Xenix/V7/Coherent file system support"
help
If you haven't heard about all of this before, it's safe to say N.
-
-
config UFS_FS
tristate "UFS file system support (read only)"
help
obj-$(CONFIG_CRAMFS) += cramfs/
obj-$(CONFIG_RAMFS) += ramfs/
obj-$(CONFIG_HUGETLBFS) += hugetlbfs/
+obj-$(CONFIG_RELAYFS_FS) += relayfs/
obj-$(CONFIG_CODA_FS) += coda/
obj-$(CONFIG_INTERMEZZO_FS) += intermezzo/
obj-$(CONFIG_MINIX_FS) += minix/
obj-$(CONFIG_XFS_FS) += xfs/
obj-$(CONFIG_AFS_FS) += afs/
obj-$(CONFIG_BEFS_FS) += befs/
+obj-$(CONFIG_RCFS_FS) += rcfs/
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/rmap.h>
+#include <linux/ckrm.h>
#include <asm/uaccess.h>
#include <asm/pgalloc.h>
if (retval >= 0) {
free_arg_pages(&bprm);
+ ckrm_cb_exec(filename);
+
/* execve success */
security_bprm_free(&bprm);
return retval;
return sprintf(buffer,"%d %d %d %d %d %d %d\n",
size, resident, shared, text, lib, data, 0);
}
+
+
+int proc_pid_delay(struct task_struct *task, char *buffer)
+{
+	int res;
+
+	res = sprintf(buffer, "%lu %lu %lu %lu %lu %lu %lu\n",
+		      get_delay(task, runs),
+		      get_delay(task, runcpu_total),
+		      get_delay(task, waitcpu_total),
+		      get_delay(task, iowait_total),
+		      get_delay(task, num_iowaits),
+		      get_delay(task, mem_iowait_total),
+		      get_delay(task, num_memwaits));
+	return res;
+}
+
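The seven counters written above can be consumed directly from user space.
A minimal sketch, parsing /proc/<pid>/delay in the field order of the
sprintf() above:

	/* Sketch: parse /proc/<pid>/delay, field order per proc_pid_delay(). */
	#include <stdio.h>

	int main(int argc, char **argv)
	{
		char path[64];
		unsigned long runs, runcpu, waitcpu, iowait, nio, memwait, nmem;
		FILE *f;

		snprintf(path, sizeof(path), "/proc/%s/delay",
			 argc > 1 ? argv[1] : "self");
		f = fopen(path, "r");
		if (!f || fscanf(f, "%lu %lu %lu %lu %lu %lu %lu",
				 &runs, &runcpu, &waitcpu, &iowait,
				 &nio, &memwait, &nmem) != 7) {
			perror(path);
			return 1;
		}
		printf("runs=%lu cpu-run=%lu cpu-wait=%lu io-wait=%lu/%lu "
		       "mem-io-wait=%lu/%lu\n",
		       runs, runcpu, waitcpu, iowait, nio, memwait, nmem);
		fclose(f);
		return 0;
	}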
PROC_TID_ATTR_PREV,
PROC_TID_ATTR_EXEC,
PROC_TID_ATTR_FSCREATE,
+#endif
+#ifdef CONFIG_DELAY_ACCT
+ PROC_TID_DELAY_ACCT,
+ PROC_TGID_DELAY_ACCT,
#endif
PROC_TID_FD_DIR = 0x8000, /* 0x8000-0xffff */
};
#ifdef CONFIG_SECURITY
E(PROC_TGID_ATTR, "attr", S_IFDIR|S_IRUGO|S_IXUGO),
#endif
+#ifdef CONFIG_DELAY_ACCT
+ E(PROC_TGID_DELAY_ACCT,"delay", S_IFREG|S_IRUGO),
+#endif
#ifdef CONFIG_KALLSYMS
E(PROC_TGID_WCHAN, "wchan", S_IFREG|S_IRUGO),
#endif
#ifdef CONFIG_SECURITY
E(PROC_TID_ATTR, "attr", S_IFDIR|S_IRUGO|S_IXUGO),
#endif
+#ifdef CONFIG_DELAY_ACCT
+	E(PROC_TID_DELAY_ACCT, "delay",  S_IFREG|S_IRUGO),
+#endif
#ifdef CONFIG_KALLSYMS
E(PROC_TID_WCHAN, "wchan", S_IFREG|S_IRUGO),
#endif
int proc_pid_status(struct task_struct*,char*);
int proc_pid_statm(struct task_struct*,char*);
int proc_pid_cpu(struct task_struct*,char*);
+int proc_pid_delay(struct task_struct*,char*);
static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
{
inode->i_fop = &proc_info_file_operations;
ei->op.proc_read = proc_pid_wchan;
break;
+#endif
+#ifdef CONFIG_DELAY_ACCT
+ case PROC_TID_DELAY_ACCT:
+ case PROC_TGID_DELAY_ACCT:
+ inode->i_fop = &proc_info_file_operations;
+ ei->op.proc_read = proc_pid_delay;
+ break;
#endif
default:
printk("procfs: impossible type (%d)",p->type);
#ifdef __KERNEL__
+#include <linux/taskdelays.h>
#include <linux/spinlock.h>
/*
unsigned long ptrace_message;
siginfo_t *last_siginfo; /* For ptrace use. */
+
+#ifdef CONFIG_CKRM
+ spinlock_t ckrm_tsklock;
+ void *ce_data;
+#ifdef CONFIG_CKRM_TYPE_TASKCLASS
+ struct ckrm_task_class *taskclass;
+ struct list_head taskclass_link;
+#endif /* CONFIG_CKRM_TYPE_TASKCLASS */
+#endif /* CONFIG_CKRM */
+
+ struct task_delay_info delays;
};
static inline pid_t process_group(struct task_struct *tsk)
#define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */
#define PF_SYNCWRITE 0x00200000 /* I am doing a sync write */
+#define PF_MEMIO 0x00400000 /* I am potentially doing I/O for mem */
+#define PF_IOWAIT 0x00800000 /* I am waiting on disk I/O */
+
#ifdef CONFIG_SMP
extern int set_cpus_allowed(task_t *p, cpumask_t new_mask);
#else
#endif /* CONFIG_SMP */
+
+/* API for registering delay info */
+#ifdef CONFIG_DELAY_ACCT
+
+#define test_delay_flag(tsk,flg) ((tsk)->flags & (flg))
+#define set_delay_flag(tsk,flg) ((tsk)->flags |= (flg))
+#define clear_delay_flag(tsk,flg) ((tsk)->flags &= ~(flg))
+
+#define def_delay_var(var) unsigned long long var
+#define get_delay(tsk,field) ((tsk)->delays.field)
+#define delay_value(x) (((unsigned long)(x))/1000)
+
+#define start_delay(var) ((var) = sched_clock())
+#define start_delay_set(var,flg) (set_delay_flag(current,flg),(var) = sched_clock())
+
+#define inc_delay(tsk,field) (((tsk)->delays.field)++)
+#define add_delay_ts(tsk,field,start_ts,end_ts) ((tsk)->delays.field += delay_value((end_ts)-(start_ts)))
+#define add_delay_clear(tsk,field,start_ts,flg) (add_delay_ts(tsk,field,start_ts,sched_clock()),clear_delay_flag(tsk,flg))
+
+static inline void add_io_delay(unsigned long dstart)
+{
+	struct task_struct *tsk = current;
+	unsigned long val = delay_value(sched_clock() - dstart);
+ if (test_delay_flag(tsk,PF_MEMIO)) {
+ tsk->delays.mem_iowait_total += val;
+ tsk->delays.num_memwaits++;
+ } else {
+ tsk->delays.iowait_total += val;
+ tsk->delays.num_iowaits++;
+ }
+ clear_delay_flag(tsk,PF_IOWAIT);
+}
+
+
+#else
+
+#define test_delay_flag(tsk,flg) (0)
+#define set_delay_flag(tsk,flg) do { } while (0)
+#define clear_delay_flag(tsk,flg) do { } while (0)
+
+#define def_delay_var(var)
+#define get_delay(tsk,field) (0)
+
+#define start_delay(var) do { } while (0)
+#define start_delay_set(var,flg) do { } while (0)
+
+#define inc_delay(tsk,field) do { } while (0)
+#define add_delay_ts(tsk,field,start_ts,now) do { } while (0)
+#define add_delay_clear(tsk,field,start_ts,flg) do { } while (0)
+#define add_io_delay(dstart) do { } while (0)
+#endif
+
+
+
#endif /* __KERNEL__ */
#endif
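The accounting macros above are meant to bracket a wait, which is exactly
what io_schedule() does later in this patch. A condensed kernel-side sketch
of the pattern (illustrative only; all names are the macros and flags
defined above):

	/*
	 * Sketch: bracket a wait with delay accounting, the same
	 * pattern io_schedule() uses below.
	 */
	static void wait_with_delay_accounting(void)
	{
		def_delay_var(dstart);		/* unsigned long long, or empty stub */

		start_delay_set(dstart, PF_IOWAIT);	/* stamp sched_clock(), mark task */
		schedule();				/* ... the actual wait ... */
		add_io_delay(dstart);			/* credit iowait or mem-iowait,
							   clear PF_IOWAIT */
	}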
#define TCP_INFO 11 /* Information about this connection. */
#define TCP_QUICKACK 12 /* Block/reenable quick acks */
+#ifdef CONFIG_ACCEPT_QUEUES
+#define TCP_ACCEPTQ_SHARE 13 /* Set accept queue share */
+#endif
+
#define TCPI_OPT_TIMESTAMPS 1
#define TCPI_OPT_SACK 2
#define TCPI_OPT_WSCALE 4
__u32 tcpi_reordering;
};
+#ifdef CONFIG_ACCEPT_QUEUES
+
+#define NUM_ACCEPT_QUEUES 8 /* Must be power of 2 */
+
+struct tcp_acceptq_info {
+ unsigned char acceptq_shares;
+ unsigned long acceptq_wait_time;
+ unsigned int acceptq_qcount;
+ unsigned int acceptq_count;
+};
+#endif
+
#ifdef __KERNEL__
#include <linux/config.h>
/* FIFO of established children */
struct open_request *accept_queue;
- struct open_request *accept_queue_tail;
-
+#ifndef CONFIG_ACCEPT_QUEUES
+ struct open_request *accept_queue_tail;
+#endif
int write_pending; /* A write to socket waits to start. */
unsigned int keepalive_time; /* time before keep alive takes place */
__u32 last_max_cwnd; /* last maximium snd_cwnd */
__u32 last_cwnd; /* the last snd_cwnd */
} bictcp;
+
+#ifdef CONFIG_ACCEPT_QUEUES
+ /* move to listen opt... */
+ char class_index;
+ struct {
+ struct open_request *aq_head;
+ struct open_request *aq_tail;
+ unsigned int aq_cnt;
+ unsigned int aq_ratio;
+ unsigned int aq_count;
+ unsigned int aq_qcount;
+ unsigned int aq_backlog;
+ unsigned int aq_wait_time;
+ int aq_valid;
+ } acceptq[NUM_ACCEPT_QUEUES];
+#endif
};
/* WARNING: don't change the layout of the members in tcp_sock! */
struct timeval sk_stamp;
struct socket *sk_socket;
void *sk_user_data;
+	void			*sk_ns;		/* for use by CKRM */
struct module *sk_owner;
void *sk_security;
void (*sk_state_change)(struct sock *sk);
struct tcp_v6_open_req v6_req;
#endif
} af;
+#ifdef CONFIG_ACCEPT_QUEUES
+ unsigned long acceptq_time_stamp;
+ int acceptq_class;
+#endif
};
/* SLAB cache for open requests. */
return tcp_win_from_space(sk->sk_rcvbuf);
}
+#ifdef CONFIG_ACCEPT_QUEUES
+static inline void tcp_acceptq_removed(struct sock *sk, int class)
+{
+ tcp_sk(sk)->acceptq[class].aq_backlog--;
+}
+
+static inline void tcp_acceptq_added(struct sock *sk, int class)
+{
+ tcp_sk(sk)->acceptq[class].aq_backlog++;
+}
+
+static inline int tcp_acceptq_is_full(struct sock *sk, int class)
+{
+ return tcp_sk(sk)->acceptq[class].aq_backlog >
+ sk->sk_max_ack_backlog;
+}
+
+static inline void tcp_set_acceptq(struct tcp_opt *tp, struct open_request *req)
+{
+ int class = req->acceptq_class;
+ int prev_class;
+
+ if (!tp->acceptq[class].aq_ratio) {
+ req->acceptq_class = 0;
+ class = 0;
+ }
+
+ tp->acceptq[class].aq_qcount++;
+ req->acceptq_time_stamp = jiffies;
+
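+	/*
+	 * tp->accept_queue remains one dl_next-threaded list spanning all
+	 * classes: each class tracks aq_head/aq_tail into its own segment,
+	 * and an empty class's first request is spliced in after the tail
+	 * of the nearest lower non-empty class (or at the list head if
+	 * there is none).
+	 */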
+ if (tp->acceptq[class].aq_tail) {
+ req->dl_next = tp->acceptq[class].aq_tail->dl_next;
+ tp->acceptq[class].aq_tail->dl_next = req;
+ tp->acceptq[class].aq_tail = req;
+ } else { /* if first request in the class */
+ tp->acceptq[class].aq_head = req;
+ tp->acceptq[class].aq_tail = req;
+
+ prev_class = class - 1;
+ while (prev_class >= 0) {
+ if (tp->acceptq[prev_class].aq_tail)
+ break;
+ prev_class--;
+ }
+ if (prev_class < 0) {
+ req->dl_next = tp->accept_queue;
+ tp->accept_queue = req;
+ }
+ else {
+ req->dl_next = tp->acceptq[prev_class].aq_tail->dl_next;
+ tp->acceptq[prev_class].aq_tail->dl_next = req;
+ }
+ }
+}
+static inline void tcp_acceptq_queue(struct sock *sk, struct open_request *req,
+ struct sock *child)
+{
+ tcp_set_acceptq(tcp_sk(sk),req);
+ req->sk = child;
+ tcp_acceptq_added(sk,req->acceptq_class);
+}
+
+#else
static inline void tcp_acceptq_removed(struct sock *sk)
{
sk->sk_ack_backlog--;
req->dl_next = NULL;
}
+#endif
+
struct tcp_listen_opt
{
u8 max_qlen_log; /* log_2 of maximal queued SYNs */
int qlen;
+#ifdef CONFIG_ACCEPT_QUEUES
+ int qlen_young[NUM_ACCEPT_QUEUES];
+#else
int qlen_young;
+#endif
int clock_hand;
u32 hash_rnd;
struct open_request *syn_table[TCP_SYNQ_HSIZE];
};
+#ifdef CONFIG_ACCEPT_QUEUES
+static inline void
+tcp_synq_removed(struct sock *sk, struct open_request *req)
+{
+ struct tcp_listen_opt *lopt = tcp_sk(sk)->listen_opt;
+
+ if (--lopt->qlen == 0)
+ tcp_delete_keepalive_timer(sk);
+ if (req->retrans == 0)
+ lopt->qlen_young[req->acceptq_class]--;
+}
+
+static inline void tcp_synq_added(struct sock *sk, struct open_request *req)
+{
+ struct tcp_listen_opt *lopt = tcp_sk(sk)->listen_opt;
+
+ if (lopt->qlen++ == 0)
+ tcp_reset_keepalive_timer(sk, TCP_TIMEOUT_INIT);
+ lopt->qlen_young[req->acceptq_class]++;
+}
+
+static inline int tcp_synq_len(struct sock *sk)
+{
+ return tcp_sk(sk)->listen_opt->qlen;
+}
+
+static inline int tcp_synq_young(struct sock *sk, int class)
+{
+ return tcp_sk(sk)->listen_opt->qlen_young[class];
+}
+
+#else
+
static inline void
tcp_synq_removed(struct sock *sk, struct open_request *req)
{
{
return tcp_sk(sk)->listen_opt->qlen_young;
}
+#endif
static inline int tcp_synq_is_full(struct sock *sk)
{
up to the user level program to do useful things with this
information. This is generally a good idea, so say Y.
+menu "Class Based Kernel Resource Management"
+
+config CKRM
+ bool "Class Based Kernel Resource Management Core"
+ depends on EXPERIMENTAL
+ help
+ Class-based Kernel Resource Management is a framework for controlling
+ and monitoring resource allocation of user-defined groups of tasks or
+ incoming socket connections. For more information, please visit
+ http://ckrm.sf.net.
+
+	  If you say Y here, also enable the Resource Class File System and
+	  at least one of the resource controllers below. Say N if you are
+	  unsure.
+
+config RCFS_FS
+ tristate "Resource Class File System (User API)"
+ depends on CKRM
+ help
+ RCFS is the filesystem API for CKRM. This separate configuration
+ option is provided only for debugging and will eventually disappear
+ since rcfs will be automounted whenever CKRM is configured.
+
+ Say N if unsure, Y if you've enabled CKRM, M to debug rcfs
+ initialization.
+
+config CKRM_TYPE_TASKCLASS
+ bool "Class Manager for Task Groups"
+ depends on CKRM
+ help
+	  TASKCLASS provides the extensions for CKRM to track task classes.
+	  This is the base for enabling task class based resource control
+	  for cpu, memory and disk I/O.
+
+	  Say N if unsure.
+
+config CKRM_RES_NUMTASKS
+ tristate "Number of Tasks Resource Manager"
+ depends on CKRM_TYPE_TASKCLASS
+ default m
+ help
+	  Provides a resource controller for CKRM that allows limiting the
+	  number of tasks a task class can have.
+
+ Say N if unsure, Y to use the feature.
+
+config CKRM_TYPE_SOCKETCLASS
+ bool "Class Manager for socket groups"
+ depends on CKRM
+ help
+	  SOCKETCLASS provides the extensions for CKRM to track per-socket
+ classes. This is the base to enable socket based resource
+ control for inbound connection control, bandwidth control etc.
+
+ Say N if unsure.
+
+config CKRM_RES_LISTENAQ
+ tristate "Multiple Accept Queues Resource Manager"
+ depends on CKRM_TYPE_SOCKETCLASS && ACCEPT_QUEUES
+ default m
+ help
+	  Provides a resource controller for CKRM to prioritize inbound
+	  connection requests. See the inbound control description for
+	  "IP: TCP Multiple accept queues support". If you choose that
+	  option, choose this one as well to control the queue weights.
+
+ If unsure, say N.
+
+endmenu
+
config SYSCTL
bool "Sysctl support"
---help---
environments which can tolerate a "non-standard" kernel.
Only use this if you really know what you are doing.
+config DELAY_ACCT
+ bool "Enable delay accounting (EXPERIMENTAL)"
+ help
+	  In addition to counting how often a task was delayed, the total
+	  delay in nanoseconds is recorded. CPU delays are reported as
+	  cpu-wait and cpu-run, and memory delay is recorded for minor and
+	  major faults. The information is accessible through
+	  /proc/<pid>/delay.
+
+	  If unsure, say N.
+
config KALLSYMS
bool "Load all symbols for debugging/kksymoops" if EMBEDDED
default y
#include <asm/io.h>
#include <asm/bugs.h>
+#include <linux/ckrm.h>
+
/*
* This is one of the first .c files built. Error out early
* if we have compiler trouble..
rcu_init();
init_IRQ();
pidhash_init();
+ ckrm_init();
sched_init();
softirq_init();
time_init();
#ifdef CONFIG_PROC_FS
proc_root_init();
#endif
+
check_bugs();
printk("POSIX conformance testing by UNIFIX\n");
sysctl.o capability.o ptrace.o timer.o user.o \
signal.o sys.o kmod.o workqueue.o pid.o \
rcupdate.o intermodule.o extable.o params.o posix-timers.o \
- kthread.o
+ kthread.o ckrm/
obj-$(CONFIG_FUTEX) += futex.o
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
#include <linux/profile.h>
#include <linux/mount.h>
#include <linux/proc_fs.h>
+#include <linux/ckrm.h>
+#include <linux/ckrm_tsk.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
int state;
struct task_struct *t;
+ ckrm_cb_exit(tsk);
+
if (signal_pending(tsk) && !tsk->signal->group_exit
&& !thread_group_empty(tsk)) {
/*
module_put(tsk->binfmt->module);
tsk->exit_code = code;
+ numtasks_put_ref(tsk->taskclass);
exit_notify(tsk);
schedule();
BUG();
#include <linux/ptrace.h>
#include <linux/mount.h>
#include <linux/audit.h>
+#include <linux/ckrm.h>
+#include <linux/ckrm_tsk.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
tsk->thread_info = ti;
ti->task = tsk;
+ ckrm_cb_newtask(tsk);
/* One for us, one for whoever does the "release_task()" (usually parent) */
atomic_set(&tsk->usage,2);
return tsk;
clone_flags |= CLONE_PTRACE;
}
+	if (numtasks_get_ref(current->taskclass, 0) == 0)
+		return -ENOMEM;
+
p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr);
/*
* Do this prior waking up the new thread - the thread pointer
if (!IS_ERR(p)) {
struct completion vfork;
+ ckrm_cb_fork(p);
+
if (clone_flags & CLONE_VFORK) {
p->vfork_done = &vfork;
init_completion(&vfork);
* COW overhead when the child exec()s afterwards.
*/
set_need_resched();
+ } else {
+ numtasks_put_ref(current->taskclass);
}
return pid;
}
if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev)))
prev->interactive_credit--;
}
+ add_delay_ts(prev,runcpu_total,prev->timestamp,now);
prev->timestamp = now;
if (likely(prev != next)) {
+ add_delay_ts(next,waitcpu_total,next->timestamp,now);
+ inc_delay(next,runs);
next->timestamp = now;
rq->nr_switches++;
rq->curr = next;
void __sched io_schedule(void)
{
struct runqueue *rq = this_rq();
+ def_delay_var(dstart);
+ start_delay_set(dstart,PF_IOWAIT);
atomic_inc(&rq->nr_iowait);
schedule();
atomic_dec(&rq->nr_iowait);
+ add_io_delay(dstart);
}
EXPORT_SYMBOL(io_schedule);
{
struct runqueue *rq = this_rq();
long ret;
+ def_delay_var(dstart);
+ start_delay_set(dstart,PF_IOWAIT);
atomic_inc(&rq->nr_iowait);
ret = schedule_timeout(timeout);
atomic_dec(&rq->nr_iowait);
+ add_io_delay(dstart);
return ret;
}
EXPORT_SYMBOL(__preempt_write_lock);
#endif /* defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) */
+
+#ifdef CONFIG_DELAY_ACCT
+int task_running_sys(struct task_struct *p)
+{
+ return task_running(task_rq(p),p);
+}
+EXPORT_SYMBOL(task_running_sys);
+#endif
+
#include <linux/security.h>
#include <linux/dcookies.h>
#include <linux/suspend.h>
+#include <linux/ckrm.h>
#include <asm/uaccess.h>
#include <asm/io.h>
current->fsgid = new_egid;
current->egid = new_egid;
current->gid = new_rgid;
+
+ ckrm_cb_gid();
+
return 0;
}
}
else
return -EPERM;
+
+ ckrm_cb_gid();
+
return 0;
}
current->suid = current->euid;
current->fsuid = current->euid;
+ ckrm_cb_uid();
+
return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RE);
}
current->fsuid = current->euid = uid;
current->suid = new_suid;
+ ckrm_cb_uid();
+
return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_ID);
}
if (suid != (uid_t) -1)
current->suid = suid;
+ ckrm_cb_uid();
+
return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RES);
}
current->gid = rgid;
if (sgid != (gid_t) -1)
current->sgid = sgid;
+
+ ckrm_cb_gid();
+
return 0;
}
* We need the page table lock to synchronize with kswapd
* and the SMP-safe atomic PTE updates.
*/
+ set_delay_flag(current,PF_MEMIO);
spin_lock(&mm->page_table_lock);
pmd = pmd_alloc(mm, pgd, address);
if (pmd) {
pte_t * pte = pte_alloc_map(mm, pmd, address);
- if (pte)
- return handle_pte_fault(mm, vma, address, write_access, pte, pmd);
+ if (pte) {
+ int rc = handle_pte_fault(mm, vma, address, write_access, pte, pmd);
+ clear_delay_flag(current,PF_MEMIO);
+ return rc;
+ }
}
spin_unlock(&mm->page_table_lock);
+ clear_delay_flag(current,PF_MEMIO);
return VM_FAULT_OOM;
}
If unsure, say Y.
+config ACCEPT_QUEUES
+ bool "IP: TCP Multiple accept queues support"
+ depends on INET && NETFILTER
+ ---help---
+ Support multiple accept queues per listening socket. If you say Y
+ here, multiple accept queues will be configured per listening
+ socket.
+
+ Each queue is mapped to a priority class. Incoming connection
+ requests can be classified (see iptables(8), MARK target), depending
+ on the packet's src/dest address or other parameters, into one of
+ the priority classes. The requests are then queued to the relevant
+ accept queue.
+
+	  Each of the queues can be assigned a weight. Pending connection
+	  requests are then accepted in accordance with the weight assigned
+	  to their priority class.
+
+ Be sure to enable "Network packet filtering" if you wish
+ to use this feature.
+
+ If unsure, say N.
+
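As a hypothetical example, HTTP connections from 10.0.0.0/8 could be steered
into accept queue 2 by marking them in the mangle table (provided class 2
has been given a share via TCP_ACCEPTQ_SHARE):

	iptables -t mangle -A PREROUTING -p tcp --dport 80 -s 10.0.0.0/8 -j MARK --set-mark 2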
source "net/ipv4/ipvs/Kconfig"
#include <linux/smp_lock.h>
#include <linux/fs.h>
#include <linux/random.h>
+#include <linux/ckrm.h>
#include <net/icmp.h>
#include <net/tcp.h>
int tcp_listen_start(struct sock *sk)
{
+#ifdef CONFIG_ACCEPT_QUEUES
+ int i = 0;
+#endif
struct inet_opt *inet = inet_sk(sk);
struct tcp_opt *tp = tcp_sk(sk);
struct tcp_listen_opt *lopt;
sk->sk_max_ack_backlog = 0;
sk->sk_ack_backlog = 0;
- tp->accept_queue = tp->accept_queue_tail = NULL;
+ tp->accept_queue = NULL;
+#ifdef CONFIG_ACCEPT_QUEUES
+ tp->class_index = 0;
+ for (i=0; i < NUM_ACCEPT_QUEUES; i++) {
+ tp->acceptq[i].aq_tail = NULL;
+ tp->acceptq[i].aq_head = NULL;
+ tp->acceptq[i].aq_wait_time = 0;
+ tp->acceptq[i].aq_qcount = 0;
+ tp->acceptq[i].aq_count = 0;
+ if (i == 0) {
+ tp->acceptq[i].aq_valid = 1;
+ tp->acceptq[i].aq_ratio = 1;
+ }
+ else {
+ tp->acceptq[i].aq_valid = 0;
+ tp->acceptq[i].aq_ratio = 0;
+ }
+ }
+#endif
tp->syn_wait_lock = RW_LOCK_UNLOCKED;
tcp_delack_init(tp);
sk_dst_reset(sk);
sk->sk_prot->hash(sk);
+#ifdef CONFIG_CKRM
+ ckrm_cb_listen_start(sk);
+#endif
+
return 0;
}
write_lock_bh(&tp->syn_wait_lock);
tp->listen_opt = NULL;
write_unlock_bh(&tp->syn_wait_lock);
- tp->accept_queue = tp->accept_queue_tail = NULL;
+
+#ifdef CONFIG_CKRM
+ ckrm_cb_listen_stop(sk);
+#endif
+
+#ifdef CONFIG_ACCEPT_QUEUES
+ for (i = 0; i < NUM_ACCEPT_QUEUES; i++)
+ tp->acceptq[i].aq_head = tp->acceptq[i].aq_tail = NULL;
+#else
+ tp->accept_queue_tail = NULL;
+#endif
+ tp->accept_queue = NULL;
if (lopt->qlen) {
for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
local_bh_enable();
sock_put(child);
+#ifdef CONFIG_ACCEPT_QUEUES
+ tcp_acceptq_removed(sk, req->acceptq_class);
+#else
tcp_acceptq_removed(sk);
+#endif
tcp_openreq_fastfree(req);
}
BUG_TRAP(!sk->sk_ack_backlog);
struct open_request *req;
struct sock *newsk;
int error;
+#ifdef CONFIG_ACCEPT_QUEUES
+ int prev_class = 0;
+ int first;
+#endif
lock_sock(sk);
/* Find already established connection */
if (!tp->accept_queue) {
long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
-
/* If this is a non blocking socket don't sleep */
error = -EAGAIN;
if (!timeo)
goto out;
}
+#ifndef CONFIG_ACCEPT_QUEUES
req = tp->accept_queue;
if ((tp->accept_queue = req->dl_next) == NULL)
tp->accept_queue_tail = NULL;
- newsk = req->sk;
tcp_acceptq_removed(sk);
+#else
+ first = tp->class_index;
+ /* We should always have request queued here. The accept_queue
+ * is already checked for NULL above.
+ */
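+	/*
+	 * Weighted round robin: keep serving class_index until its aq_cnt
+	 * reaches aq_ratio, then advance (mod NUM_ACCEPT_QUEUES) to the
+	 * next class with queued requests.
+	 */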
+	while (!tp->acceptq[first].aq_head) {
+		tp->acceptq[first].aq_cnt = 0;
+		first = (first + 1) & (NUM_ACCEPT_QUEUES - 1);
+	}
+	req = tp->acceptq[first].aq_head;
+	tp->acceptq[first].aq_qcount--;
+	tp->acceptq[first].aq_count++;
+	tp->acceptq[first].aq_wait_time += jiffies - req->acceptq_time_stamp;
+
+	for (prev_class = first - 1; prev_class >= 0; prev_class--)
+		if (tp->acceptq[prev_class].aq_tail)
+			break;
+	if (prev_class >= 0)
+		tp->acceptq[prev_class].aq_tail->dl_next = req->dl_next;
+	else
+		tp->accept_queue = req->dl_next;
+
+	if (req == tp->acceptq[first].aq_tail)
+		tp->acceptq[first].aq_head = tp->acceptq[first].aq_tail = NULL;
+	else
+		tp->acceptq[first].aq_head = req->dl_next;
+
+	if (++(tp->acceptq[first].aq_cnt) >= tp->acceptq[first].aq_ratio) {
+		tp->acceptq[first].aq_cnt = 0;
+		tp->class_index = (first + 1) & (NUM_ACCEPT_QUEUES - 1);
+	}
+ tcp_acceptq_removed(sk, req->acceptq_class);
+#endif
+ newsk = req->sk;
tcp_openreq_fastfree(req);
BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
release_sock(sk);
}
}
break;
+
+#ifdef CONFIG_ACCEPT_QUEUES
+ case TCP_ACCEPTQ_SHARE:
+ {
+ char share_wt[NUM_ACCEPT_QUEUES];
+ int i,j;
+
+ if (sk->sk_state != TCP_LISTEN)
+ return -EOPNOTSUPP;
+
+			if (optlen > sizeof(share_wt))
+				return -EINVAL;
+			memset(share_wt, 0, sizeof(share_wt));
+			if (copy_from_user(share_wt, optval, optlen)) {
+				err = -EFAULT;
+				break;
+			}
+			j = 0;
+			for (i = 0; i < NUM_ACCEPT_QUEUES; i++) {
+				if (share_wt[i]) {
+					if (!j || share_wt[i] < j)
+						j = share_wt[i];
+					tp->acceptq[i].aq_valid = 1;
+				} else
+					tp->acceptq[i].aq_valid = 0;
+			}
+ if (j == 0) {
+ /* Class 0 is always valid. If nothing is
+ * specified set class 0 as 1.
+ */
+ share_wt[0] = 1;
+ tp->acceptq[0].aq_valid = 1;
+ j = 1;
+ }
+			for (i = 0; i < NUM_ACCEPT_QUEUES; i++) {
+				tp->acceptq[i].aq_ratio = share_wt[i] / j;
+				tp->acceptq[i].aq_cnt = 0;
+			}
+ }
+ break;
+#endif
default:
err = -ENOPROTOOPT;
case TCP_QUICKACK:
val = !tp->ack.pingpong;
break;
+
+#ifdef CONFIG_ACCEPT_QUEUES
+ case TCP_ACCEPTQ_SHARE: {
+ struct tcp_acceptq_info tinfo[NUM_ACCEPT_QUEUES];
+ int i;
+
+ if (sk->sk_state != TCP_LISTEN)
+ return -EOPNOTSUPP;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ memset(tinfo, 0, sizeof(tinfo));
+
+		for (i = 0; i < NUM_ACCEPT_QUEUES; i++) {
+			tinfo[i].acceptq_wait_time =
+				tp->acceptq[i].aq_wait_time / (HZ / USER_HZ);
+			tinfo[i].acceptq_qcount = tp->acceptq[i].aq_qcount;
+			tinfo[i].acceptq_count = tp->acceptq[i].aq_count;
+			if (tp->acceptq[i].aq_valid)
+				tinfo[i].acceptq_shares = tp->acceptq[i].aq_ratio;
+			else
+				tinfo[i].acceptq_shares = 0;
+		}
+
+ len = min_t(unsigned int, len, sizeof(tinfo));
+ if (put_user(len, optlen))
+ return -EFAULT;
+
+ if (copy_to_user(optval, (char *)tinfo, len))
+ return -EFAULT;
+
+ return 0;
+ }
+#endif
default:
return -ENOPROTOOPT;
};
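For reference, the user-space side of these two handlers might look as
follows. TCP_ACCEPTQ_SHARE, NUM_ACCEPT_QUEUES and struct tcp_acceptq_info
are mirrored from the patched <linux/tcp.h> above, since stock headers do
not carry them; the socket must be in TCP_LISTEN state for either call to
succeed:

	#include <stdio.h>
	#include <netinet/in.h>
	#include <sys/socket.h>

	/* Mirrored from the patched <linux/tcp.h>; not in stock headers. */
	#define TCP_ACCEPTQ_SHARE 13
	#define NUM_ACCEPT_QUEUES 8

	struct tcp_acceptq_info {
		unsigned char acceptq_shares;
		unsigned long acceptq_wait_time;
		unsigned int acceptq_qcount;
		unsigned int acceptq_count;
	};

	/* Give classes 0..2 weights 3:2:1 on a listening socket. */
	static int set_accept_shares(int lfd)
	{
		char wt[NUM_ACCEPT_QUEUES] = { 3, 2, 1 };

		return setsockopt(lfd, IPPROTO_TCP, TCP_ACCEPTQ_SHARE,
				  wt, sizeof(wt));
	}

	/* Print per-class share/backlog/accept counters. */
	static void dump_accept_shares(int lfd)
	{
		struct tcp_acceptq_info info[NUM_ACCEPT_QUEUES];
		socklen_t len = sizeof(info);
		int i;

		if (getsockopt(lfd, IPPROTO_TCP, TCP_ACCEPTQ_SHARE,
			       info, &len))
			return;
		for (i = 0; i < NUM_ACCEPT_QUEUES; i++)
			printf("class %d: shares=%u queued=%u accepted=%u\n", i,
			       info[i].acceptq_shares, info[i].acceptq_qcount,
			       info[i].acceptq_count);
	}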
head = &tcp_listening_hash[tcp_lhashfn(hnum)];
if (!hlist_empty(head)) {
struct inet_opt *inet = inet_sk((sk = __sk_head(head)));
-
if (inet->num == hnum && !sk->sk_node.next &&
(!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
(sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
lopt->syn_table[h] = req;
write_unlock(&tp->syn_wait_lock);
+#ifdef CONFIG_ACCEPT_QUEUES
+ tcp_synq_added(sk, req);
+#else
tcp_synq_added(sk);
+#endif
}
__u32 daddr = skb->nh.iph->daddr;
__u32 isn = TCP_SKB_CB(skb)->when;
struct dst_entry *dst = NULL;
+#ifdef CONFIG_ACCEPT_QUEUES
+ int class = 0;
+#endif
#ifdef CONFIG_SYN_COOKIES
int want_cookie = 0;
#else
goto drop;
}
+#ifdef CONFIG_ACCEPT_QUEUES
+	class = (skb->nfmark <= 0) ? 0 :
+		((skb->nfmark >= NUM_ACCEPT_QUEUES) ? 0 : skb->nfmark);
+ /*
+ * Accept only if the class has shares set or if the default class
+ * i.e. class 0 has shares
+ */
+ if (!(tcp_sk(sk)->acceptq[class].aq_valid)) {
+ if (tcp_sk(sk)->acceptq[0].aq_valid)
+ class = 0;
+ else
+ goto drop;
+ }
+#endif
+
/* Accept backlog is full. If we have already queued enough
* of warm entries in syn queue, drop request. It is better than
* clogging syn queue with openreqs with exponentially increasing
* timeout.
*/
+#ifdef CONFIG_ACCEPT_QUEUES
+ if (tcp_acceptq_is_full(sk, class) && tcp_synq_young(sk, class) > 1)
+#else
if (tcp_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
+#endif
goto drop;
req = tcp_openreq_alloc();
tp.tstamp_ok = tp.saw_tstamp;
tcp_openreq_init(req, &tp, skb);
-
+#ifdef CONFIG_ACCEPT_QUEUES
+ req->acceptq_class = class;
+ req->acceptq_time_stamp = jiffies;
+#endif
req->af.v4_req.loc_addr = daddr;
req->af.v4_req.rmt_addr = saddr;
req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
struct tcp_opt *newtp;
struct sock *newsk;
+#ifdef CONFIG_ACCEPT_QUEUES
+ if (tcp_acceptq_is_full(sk, req->acceptq_class))
+#else
if (tcp_acceptq_is_full(sk))
+#endif
goto exit_overflow;
if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
newtp->num_sacks = 0;
newtp->urg_data = 0;
newtp->listen_opt = NULL;
+#ifdef CONFIG_ACCEPT_QUEUES
+ newtp->accept_queue = NULL;
+ memset(newtp->acceptq, 0,sizeof(newtp->acceptq));
+ newtp->class_index = 0;
+
+#else
newtp->accept_queue = newtp->accept_queue_tail = NULL;
+#endif
/* Deinitialize syn_wait_lock to trap illegal accesses. */
memset(&newtp->syn_wait_lock, 0, sizeof(newtp->syn_wait_lock));
* ones are about to clog our table.
*/
if (lopt->qlen>>(lopt->max_qlen_log-1)) {
+#ifdef CONFIG_ACCEPT_QUEUES
+ int young = 0;
+
+ for(i=0; i < NUM_ACCEPT_QUEUES; i++)
+ young += lopt->qlen_young[i];
+
+ young <<= 1;
+#else
int young = (lopt->qlen_young<<1);
+#endif
while (thresh > 2) {
if (lopt->qlen < young)
unsigned long timeo;
if (req->retrans++ == 0)
- lopt->qlen_young--;
- timeo = min((TCP_TIMEOUT_INIT << req->retrans),
- TCP_RTO_MAX);
+#ifdef CONFIG_ACCEPT_QUEUES
+ lopt->qlen_young[req->acceptq_class]--;
+#else
+ lopt->qlen_young--;
+#endif
+ timeo = min((TCP_TIMEOUT_INIT << req->retrans), TCP_RTO_MAX);
req->expires = now + timeo;
reqp = &req->dl_next;
continue;
write_unlock(&tp->syn_wait_lock);
lopt->qlen--;
if (req->retrans == 0)
- lopt->qlen_young--;
+#ifdef CONFIG_ACCEPT_QUEUES
+ lopt->qlen_young[req->acceptq_class]--;
+#else
+ lopt->qlen_young--;
+#endif
tcp_openreq_free(req);
continue;
}
lopt->syn_table[h] = req;
write_unlock(&tp->syn_wait_lock);
+#ifdef CONFIG_ACCEPT_QUEUES
+ tcp_synq_added(sk, req);
+#else
tcp_synq_added(sk);
+#endif
}
struct tcp_opt tmptp, *tp = tcp_sk(sk);
struct open_request *req = NULL;
__u32 isn = TCP_SKB_CB(skb)->when;
+#ifdef CONFIG_ACCEPT_QUEUES
+ int class = 0;
+#endif
if (skb->protocol == htons(ETH_P_IP))
return tcp_v4_conn_request(sk, skb);
if (!ipv6_unicast_destination(skb))
goto drop;
+
/*
* There are no SYN attacks on IPv6, yet...
*/
goto drop;
}
+#ifdef CONFIG_ACCEPT_QUEUES
+	class = (skb->nfmark <= 0) ? 0 :
+		((skb->nfmark >= NUM_ACCEPT_QUEUES) ? 0 : skb->nfmark);
+ /*
+ * Accept only if the class has shares set or if the default class
+ * i.e. class 0 has shares
+ */
+ if (!(tcp_sk(sk)->acceptq[class].aq_valid)) {
+ if (tcp_sk(sk)->acceptq[0].aq_valid)
+ class = 0;
+ else
+ goto drop;
+ }
+#endif
+
+ /* Accept backlog is full. If we have already queued enough
+ * of warm entries in syn queue, drop request. It is better than
+ * clogging syn queue with openreqs with exponentially increasing
+ * timeout.
+ */
+#ifdef CONFIG_ACCEPT_QUEUES
+ if (tcp_acceptq_is_full(sk, class) && tcp_synq_young(sk, class) > 1)
+#else
if (tcp_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
+#endif
goto drop;
+
req = tcp_openreq_alloc();
if (req == NULL)
goto drop;
tmptp.tstamp_ok = tmptp.saw_tstamp;
tcp_openreq_init(req, &tmptp, skb);
-
+#ifdef CONFIG_ACCEPT_QUEUES
+ req->acceptq_class = class;
+ req->acceptq_time_stamp = jiffies;
+#endif
req->class = &or_ipv6;
ipv6_addr_copy(&req->af.v6_req.rmt_addr, &skb->nh.ipv6h->saddr);
ipv6_addr_copy(&req->af.v6_req.loc_addr, &skb->nh.ipv6h->daddr);
opt = np->opt;
+#ifdef CONFIG_ACCEPT_QUEUES
+ if (tcp_acceptq_is_full(sk, req->acceptq_class))
+#else
if (tcp_acceptq_is_full(sk))
+#endif
goto out_overflow;
if (np->rxopt.bits.srcrt == 2 &&