fedora core 6 1.2949 + vserver 2.2.0
[linux-2.6.git] / drivers / cpufreq / cpufreq_conservative.c
index ac38766..eef0270 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/types.h>
 #include <linux/fs.h>
 #include <linux/sysfs.h>
+#include <linux/cpu.h>
 #include <linux/sched.h>
 #include <linux/kmod.h>
 #include <linux/workqueue.h>
  */
 
 #define DEF_FREQUENCY_UP_THRESHOLD             (80)
-#define MIN_FREQUENCY_UP_THRESHOLD             (0)
-#define MAX_FREQUENCY_UP_THRESHOLD             (100)
-
 #define DEF_FREQUENCY_DOWN_THRESHOLD           (20)
-#define MIN_FREQUENCY_DOWN_THRESHOLD           (0)
-#define MAX_FREQUENCY_DOWN_THRESHOLD           (100)
 
 /* 
  * The polling frequency of this governor depends on the capability of 
  * latency of the processor. The governor will work on any processor with 
  * transition latency <= 10mS, using appropriate sampling 
  * rate.
- * For CPUs with transition latency > 10mS (mostly drivers with CPUFREQ_ETERNAL)
- * this governor will not work.
+ * For CPUs with transition latency > 10mS (mostly drivers
+ * with CPUFREQ_ETERNAL), this governor will not work.
  * All times here are in uS.
  */
 static unsigned int                            def_sampling_rate;
-#define MIN_SAMPLING_RATE                      (def_sampling_rate / 2)
+#define MIN_SAMPLING_RATE_RATIO                        (2)
+/* for correct statistics, we need at least 10 ticks between each measure */
+#define MIN_STAT_SAMPLING_RATE                 \
+                       (MIN_SAMPLING_RATE_RATIO * jiffies_to_usecs(10))
+#define MIN_SAMPLING_RATE                      \
+                       (def_sampling_rate / MIN_SAMPLING_RATE_RATIO)
 #define MAX_SAMPLING_RATE                      (500 * def_sampling_rate)
-#define DEF_SAMPLING_RATE_LATENCY_MULTIPLIER   (100000)
-#define DEF_SAMPLING_DOWN_FACTOR               (5)
+#define DEF_SAMPLING_RATE_LATENCY_MULTIPLIER   (1000)
+#define DEF_SAMPLING_DOWN_FACTOR               (1)
+#define MAX_SAMPLING_DOWN_FACTOR               (10)
 #define TRANSITION_LATENCY_LIMIT               (10 * 1000)
 
-static void do_dbs_timer(void *data);
+static void do_dbs_timer(struct work_struct *work);
 
 struct cpu_dbs_info_s {
        struct cpufreq_policy   *cur_policy;
        unsigned int            prev_cpu_idle_up;
        unsigned int            prev_cpu_idle_down;
        unsigned int            enable;
+       unsigned int            down_skip;      /* checks counted toward sampling_down_factor; reset to 0 on a raise or a decrease evaluation */
+       unsigned int            requested_freq; /* governor's target frequency; stepped by freq_step and passed to __cpufreq_driver_target() */
 };
 static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info);
 
 static unsigned int dbs_enable;        /* number of CPUs using this policy */
 
+/*
+ * DEADLOCK ALERT! There is an ordering requirement between cpu_hotplug
+ * lock and dbs_mutex. cpu_hotplug lock should always be held before
+ * dbs_mutex. If any function that can potentially take cpu_hotplug lock
+ * (like __cpufreq_driver_target()) is being called with dbs_mutex taken, then
+ * cpu_hotplug lock should be taken before that. Note that cpu_hotplug lock
+ * is recursive for the same process. -Venki
+ */
 static DEFINE_MUTEX    (dbs_mutex);
-static DECLARE_WORK    (dbs_work, do_dbs_timer, NULL);
+static DECLARE_DELAYED_WORK(dbs_work, do_dbs_timer);
 
 struct dbs_tuners {
        unsigned int            sampling_rate;
@@ -87,15 +99,22 @@ static struct dbs_tuners dbs_tuners_ins = {
        .up_threshold           = DEF_FREQUENCY_UP_THRESHOLD,
        .down_threshold         = DEF_FREQUENCY_DOWN_THRESHOLD,
        .sampling_down_factor   = DEF_SAMPLING_DOWN_FACTOR,
+       .ignore_nice            = 0,
+       .freq_step              = 5,
 };
 
 static inline unsigned int get_cpu_idle_time(unsigned int cpu)
 {
-       return  kstat_cpu(cpu).cpustat.idle +
+       unsigned int add_nice = 0, ret; /* add_nice stays 0 unless ignore_nice is set */
+
+       if (dbs_tuners_ins.ignore_nice)
+               add_nice = kstat_cpu(cpu).cpustat.nice; /* count nice time as idle time */
+
+       ret =   kstat_cpu(cpu).cpustat.idle +
                kstat_cpu(cpu).cpustat.iowait +
-               ( dbs_tuners_ins.ignore_nice ?
-                 kstat_cpu(cpu).cpustat.nice :
-                 0);
+               add_nice;
+
+       return ret; /* idle + iowait (+ nice when ignore_nice is set) */
 }
 
 /************************** sysfs interface ************************/
@@ -136,7 +155,7 @@ static ssize_t store_sampling_down_factor(struct cpufreq_policy *unused,
        unsigned int input;
        int ret;
        ret = sscanf (buf, "%u", &input);
-       if (ret != 1 )
+       if (ret != 1 || input > MAX_SAMPLING_DOWN_FACTOR || input < 1)
                return -EINVAL;
 
        mutex_lock(&dbs_mutex);
@@ -173,9 +192,7 @@ static ssize_t store_up_threshold(struct cpufreq_policy *unused,
        ret = sscanf (buf, "%u", &input);
 
        mutex_lock(&dbs_mutex);
-       if (ret != 1 || input > MAX_FREQUENCY_UP_THRESHOLD || 
-                       input < MIN_FREQUENCY_UP_THRESHOLD ||
-                       input <= dbs_tuners_ins.down_threshold) {
+       if (ret != 1 || input > 100 || input <= dbs_tuners_ins.down_threshold) {
                mutex_unlock(&dbs_mutex);
                return -EINVAL;
        }
@@ -194,9 +211,7 @@ static ssize_t store_down_threshold(struct cpufreq_policy *unused,
        ret = sscanf (buf, "%u", &input);
 
        mutex_lock(&dbs_mutex);
-       if (ret != 1 || input > MAX_FREQUENCY_DOWN_THRESHOLD || 
-                       input < MIN_FREQUENCY_DOWN_THRESHOLD ||
-                       input >= dbs_tuners_ins.up_threshold) {
+       if (ret != 1 || input > 100 || input >= dbs_tuners_ins.up_threshold) {
                mutex_unlock(&dbs_mutex);
                return -EINVAL;
        }
@@ -297,31 +312,17 @@ static struct attribute_group dbs_attr_group = {
 static void dbs_check_cpu(int cpu)
 {
        unsigned int idle_ticks, up_idle_ticks, down_idle_ticks;
+       unsigned int tmp_idle_ticks, total_idle_ticks;
        unsigned int freq_step;
        unsigned int freq_down_sampling_rate;
-       static int down_skip[NR_CPUS];
-       static int requested_freq[NR_CPUS];
-       static unsigned short init_flag = 0;
-       struct cpu_dbs_info_s *this_dbs_info;
-       struct cpu_dbs_info_s *dbs_info;
-
+       struct cpu_dbs_info_s *this_dbs_info = &per_cpu(cpu_dbs_info, cpu);
        struct cpufreq_policy *policy;
-       unsigned int j;
 
-       this_dbs_info = &per_cpu(cpu_dbs_info, cpu);
        if (!this_dbs_info->enable)
                return;
 
        policy = this_dbs_info->cur_policy;
 
-       if ( init_flag == 0 ) {
-               for_each_online_cpu(j) {
-                       dbs_info = &per_cpu(cpu_dbs_info, j);
-                       requested_freq[j] = dbs_info->cur_policy->cur;
-               }
-               init_flag = 1;
-       }
-       
        /* 
         * The default safe range is 20% to 80% 
         * Every sampling_rate, we check
@@ -337,39 +338,29 @@ static void dbs_check_cpu(int cpu)
         */
 
        /* Check for frequency increase */
-
        idle_ticks = UINT_MAX;
-       for_each_cpu_mask(j, policy->cpus) {
-               unsigned int tmp_idle_ticks, total_idle_ticks;
-               struct cpu_dbs_info_s *j_dbs_info;
 
-               j_dbs_info = &per_cpu(cpu_dbs_info, j);
-               /* Check for frequency increase */
-               total_idle_ticks = get_cpu_idle_time(j);
-               tmp_idle_ticks = total_idle_ticks -
-                       j_dbs_info->prev_cpu_idle_up;
-               j_dbs_info->prev_cpu_idle_up = total_idle_ticks;
-
-               if (tmp_idle_ticks < idle_ticks)
-                       idle_ticks = tmp_idle_ticks;
-       }
+       /* Check for frequency increase */
+       total_idle_ticks = get_cpu_idle_time(cpu);
+       tmp_idle_ticks = total_idle_ticks -
+               this_dbs_info->prev_cpu_idle_up;
+       this_dbs_info->prev_cpu_idle_up = total_idle_ticks;
+
+       if (tmp_idle_ticks < idle_ticks)
+               idle_ticks = tmp_idle_ticks;
 
        /* Scale idle ticks by 100 and compare with up and down ticks */
        idle_ticks *= 100;
        up_idle_ticks = (100 - dbs_tuners_ins.up_threshold) *
-               usecs_to_jiffies(dbs_tuners_ins.sampling_rate);
+                       usecs_to_jiffies(dbs_tuners_ins.sampling_rate);
 
        if (idle_ticks < up_idle_ticks) {
-               down_skip[cpu] = 0;
-               for_each_cpu_mask(j, policy->cpus) {
-                       struct cpu_dbs_info_s *j_dbs_info;
+               this_dbs_info->down_skip = 0;
+               this_dbs_info->prev_cpu_idle_down =
+                       this_dbs_info->prev_cpu_idle_up;
 
-                       j_dbs_info = &per_cpu(cpu_dbs_info, j);
-                       j_dbs_info->prev_cpu_idle_down = 
-                                       j_dbs_info->prev_cpu_idle_up;
-               }
                /* if we are already at full speed then break out early */
-               if (requested_freq[cpu] == policy->max)
+               if (this_dbs_info->requested_freq == policy->max)
                        return;
                
                freq_step = (dbs_tuners_ins.freq_step * policy->max) / 100;
@@ -378,49 +369,45 @@ static void dbs_check_cpu(int cpu)
                if (unlikely(freq_step == 0))
                        freq_step = 5;
                
-               requested_freq[cpu] += freq_step;
-               if (requested_freq[cpu] > policy->max)
-                       requested_freq[cpu] = policy->max;
+               this_dbs_info->requested_freq += freq_step;
+               if (this_dbs_info->requested_freq > policy->max)
+                       this_dbs_info->requested_freq = policy->max;
 
-               __cpufreq_driver_target(policy, requested_freq[cpu], 
+               __cpufreq_driver_target(policy, this_dbs_info->requested_freq,
                        CPUFREQ_RELATION_H);
                return;
        }
 
        /* Check for frequency decrease */
-       down_skip[cpu]++;
-       if (down_skip[cpu] < dbs_tuners_ins.sampling_down_factor)
+       this_dbs_info->down_skip++;
+       if (this_dbs_info->down_skip < dbs_tuners_ins.sampling_down_factor)
                return;
 
-       idle_ticks = UINT_MAX;
-       for_each_cpu_mask(j, policy->cpus) {
-               unsigned int tmp_idle_ticks, total_idle_ticks;
-               struct cpu_dbs_info_s *j_dbs_info;
-
-               j_dbs_info = &per_cpu(cpu_dbs_info, j);
-               total_idle_ticks = j_dbs_info->prev_cpu_idle_up;
-               tmp_idle_ticks = total_idle_ticks -
-                       j_dbs_info->prev_cpu_idle_down;
-               j_dbs_info->prev_cpu_idle_down = total_idle_ticks;
+       /* Check for frequency decrease */
+       total_idle_ticks = this_dbs_info->prev_cpu_idle_up;
+       tmp_idle_ticks = total_idle_ticks -
+               this_dbs_info->prev_cpu_idle_down;
+       this_dbs_info->prev_cpu_idle_down = total_idle_ticks;
 
-               if (tmp_idle_ticks < idle_ticks)
-                       idle_ticks = tmp_idle_ticks;
-       }
+       if (tmp_idle_ticks < idle_ticks)
+               idle_ticks = tmp_idle_ticks;
 
        /* Scale idle ticks by 100 and compare with up and down ticks */
        idle_ticks *= 100;
-       down_skip[cpu] = 0;
+       this_dbs_info->down_skip = 0;
 
        freq_down_sampling_rate = dbs_tuners_ins.sampling_rate *
                dbs_tuners_ins.sampling_down_factor;
        down_idle_ticks = (100 - dbs_tuners_ins.down_threshold) *
-                       usecs_to_jiffies(freq_down_sampling_rate);
+               usecs_to_jiffies(freq_down_sampling_rate);
 
        if (idle_ticks > down_idle_ticks) {
-               /* if we are already at the lowest speed then break out early
+               /*
+                * if we are already at the lowest speed then break out early
                 * or if we 'cannot' reduce the speed as the user might want
-                * freq_step to be zero */
-               if (requested_freq[cpu] == policy->min
+                * freq_step to be zero
+                */
+               if (this_dbs_info->requested_freq == policy->min
                                || dbs_tuners_ins.freq_step == 0)
                        return;
 
@@ -430,31 +417,31 @@ static void dbs_check_cpu(int cpu)
                if (unlikely(freq_step == 0))
                        freq_step = 5;
 
-               requested_freq[cpu] -= freq_step;
-               if (requested_freq[cpu] < policy->min)
-                       requested_freq[cpu] = policy->min;
+               this_dbs_info->requested_freq -= freq_step;
+               if (this_dbs_info->requested_freq < policy->min)
+                       this_dbs_info->requested_freq = policy->min;
 
-               __cpufreq_driver_target(policy,
-                       requested_freq[cpu],
-                       CPUFREQ_RELATION_H);
+               __cpufreq_driver_target(policy, this_dbs_info->requested_freq,
+                               CPUFREQ_RELATION_H);
                return;
        }
 }
 
-static void do_dbs_timer(void *data)
+static void do_dbs_timer(struct work_struct *work)
 { 
        int i;
+       lock_cpu_hotplug();
        mutex_lock(&dbs_mutex);
        for_each_online_cpu(i)
                dbs_check_cpu(i);
        schedule_delayed_work(&dbs_work, 
                        usecs_to_jiffies(dbs_tuners_ins.sampling_rate));
        mutex_unlock(&dbs_mutex);
+       unlock_cpu_hotplug();
 } 
 
 static inline void dbs_timer_init(void)
 {
-       INIT_WORK(&dbs_work, do_dbs_timer, NULL);
        schedule_delayed_work(&dbs_work,
                        usecs_to_jiffies(dbs_tuners_ins.sampling_rate));
        return;
@@ -472,6 +459,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
        unsigned int cpu = policy->cpu;
        struct cpu_dbs_info_s *this_dbs_info;
        unsigned int j;
+       int rc;
 
        this_dbs_info = &per_cpu(cpu_dbs_info, cpu);
 
@@ -488,17 +476,26 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
                        break;
                 
                mutex_lock(&dbs_mutex);
+
+               rc = sysfs_create_group(&policy->kobj, &dbs_attr_group);
+               if (rc) {
+                       mutex_unlock(&dbs_mutex);
+                       return rc;
+               }
+
                for_each_cpu_mask(j, policy->cpus) {
                        struct cpu_dbs_info_s *j_dbs_info;
                        j_dbs_info = &per_cpu(cpu_dbs_info, j);
                        j_dbs_info->cur_policy = policy;
                
-                       j_dbs_info->prev_cpu_idle_up = get_cpu_idle_time(j);
+                       j_dbs_info->prev_cpu_idle_up = get_cpu_idle_time(cpu);
                        j_dbs_info->prev_cpu_idle_down
                                = j_dbs_info->prev_cpu_idle_up;
                }
                this_dbs_info->enable = 1;
-               sysfs_create_group(&policy->kobj, &dbs_attr_group);
+               this_dbs_info->down_skip = 0;
+               this_dbs_info->requested_freq = policy->cur;
+
                dbs_enable++;
                /*
                 * Start the timerschedule work, when this governor
@@ -507,16 +504,17 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
                if (dbs_enable == 1) {
                        unsigned int latency;
                        /* policy latency is in nS. Convert it to uS first */
+                       latency = policy->cpuinfo.transition_latency / 1000;
+                       if (latency == 0)
+                               latency = 1;
 
-                       latency = policy->cpuinfo.transition_latency;
-                       if (latency < 1000)
-                               latency = 1000;
-
-                       def_sampling_rate = (latency / 1000) *
+                       def_sampling_rate = 10 * latency *
                                        DEF_SAMPLING_RATE_LATENCY_MULTIPLIER;
+
+                       if (def_sampling_rate < MIN_STAT_SAMPLING_RATE)
+                               def_sampling_rate = MIN_STAT_SAMPLING_RATE;
+
                        dbs_tuners_ins.sampling_rate = def_sampling_rate;
-                       dbs_tuners_ins.ignore_nice = 0;
-                       dbs_tuners_ins.freq_step = 5;
 
                        dbs_timer_init();
                }